first commit

This commit is contained in:
sam-paech
2025-01-31 18:03:33 +11:00
commit 7b9ed0746f
60 changed files with 111225 additions and 0 deletions
+8
View File
@@ -0,0 +1,8 @@
.env
.vscode
backup
judgemark_v2_runs.json
judgemark_v2.1_runs.json
ministral-repetition-results.json
*.pyc
dev
+55
View File
@@ -0,0 +1,55 @@
"""Global constants and reference scores for the Judgemark-v2 benchmark."""
# Stability test configuration
# Each entry is a tuple of three strings. The first element is a model name;
# presumably the other two are an iteration key and an item id selecting one
# sample to re-judge repeatedly -- TODO confirm against run_stability_test.
STABILITY_ITEMS = [
("gemma-2b-it", "2", "28"),
("Llama-3-70b-chat-hf", "1", "42"),
("DeepSeek-R1", "1", "2"),
]
# Repetition count for the stability test (presumably per STABILITY_ITEMS entry).
STABILITY_REPS = 100
# Reference scores for correlation
# Maps a model name to a reference score. The scoring code correlates each
# judge's per-iteration model means against these values (Pearson / Kendall);
# models that are not listed here are left out of that correlation.
# NOTE(review): values look like arena-style ratings -- confirm the source.
REFERENCE_MODEL_SCORES = {
"DeepSeek-R1": 1430, # TODO: placeholder value, needs to be updated
"gpt-4o-2024-11-20": 1402,
"gemini-1.5-pro-002": 1333,
"gemini-1.5-pro-001": 1276,
"claude-3-5-sonnet-20240620": 1243,
"Mistral-Large-Instruct-2411": 1246,
"claude-3-opus-20240229": 1240,
#"c4ai-command-r-plus-08-2024": 1236,
"Llama-3-70b-chat-hf": 1214,
"claude-3-haiku-20240307": 1163,
"c4ai-command-r-08-2024": 1159,
"Mixtral-8x22B-Instruct-v0.1": 1147,
"Mixtral-8x7B-Instruct-v0.1": 1114,
"databricks/dbrx-instruct": 1102,
#"openchat-3.5-1210": 1127,
"gpt-3.5-turbo-0125": 1099,
"Llama-2-13b-chat-hf": 1050,
"gemma-7b-it": 1029,
"gemma-2b-it": 989,
}
# Negative criteria markers for score computation
# Lower-case substrings identifying criteria where a high judge score is bad.
# NOTE(review): compute_raw_score carries its own inline copy of this list and
# inverts matching criteria as (10 - value); keep the two lists in sync.
NEGATIVE_MARKERS = [
"melodramatic", "shallow resolution", "unearned resolution",
"simplistic moralizing", "shallow optimism", "forced optimism",
"trite", "overwrought", "amateurish", "contrived", "uninspiring",
"characters are too good", "incongruent ending positivity",
"unearned transformations", "profundity over-reach",
"amateurish descriptives", "clunky asides", "stilted dialogue",
"tit-for-tat dialogue", "purple prose", "uncreative", "tell-don't-show",
"weak dialogue", "meandering"
]
# Model-name rewrite table: provider-prefixed, lower-case ids on the left,
# replacement names on the right.
# NOTE(review): not referenced in the code visible here; presumably applied
# when displaying or reporting judge model names -- confirm against callers.
MODEL_NAME_REPLACEMENTS = {
"mistralai/ministral-3b": "ministral/Ministral-3b-instruct",
"openai/gpt-4o-mini": "gpt-4o-mini",
"deepseek/deepseek-r1": "deepseek-ai/deepseek-r1",
"anthropic/claude-3.5-haiku-20241022": "claude-3.5-haiku-20241022",
"anthropic/claude-3.5-sonnet-20240620": "claude-3.5-sonnet-20240620",
"openai/gpt-4o-2024-11-20": "gpt-4o-2024-11-20",
"deepseek/deepseek-r1-distill-llama-70b": "deepseek-ai/deepseek-r1-distill-llama-70b",
"mistralai/mistral-large-2411": "mistralai/mistral-large-instruct-2411",
}
+514
View File
@@ -0,0 +1,514 @@
import os
import re
import uuid
import time
import signal
import logging
import threading
import concurrent.futures
from datetime import datetime
from typing import Dict, List
from tqdm import tqdm
from collections import defaultdict
from utils.file_io import load_json_file, save_json_file
from utils.api import send_to_judge_model
from utils.visualization import create_side_by_side_score_charts
import statistics
from core.scoring import (
parse_scores, compute_raw_score, compute_detailed_distribution,
compute_model_level_stats, compute_cross_model_stats,
build_landmark_calibration_config, apply_landmark_calibration,
log_score_summary, confidence_interval_95
)
from core.scoring import compute_detailed_distribution, compute_detailed_distribution # etc
from core.separability import compute_separability_metrics
from core.stability import run_stability_test, compute_iteration_stability, compute_randomized_iteration_rank_stability_by_item
from utils.stats import clamp
from utils.state import should_exit, executor
from utils.stats import normalize
def process_sample(model_name: str, iteration_key: str, item_id: str, item_text: str,
                   prompt_template: str, run_key: str, runs: Dict, runs_file: str,
                   lock: threading.Lock, judge_model: str, save_raw_judge_output: bool):
    """Judge a single sample and persist the outcome, skipping items already scored.

    Args:
        model_name: Name of the model that produced the sample.
        iteration_key: Iteration the sample belongs to.
        item_id: Identifier of the sample within the iteration.
        item_text: The text to be judged.
        prompt_template: Judge prompt containing the ``[TEST MODEL RESPONSE]``
            placeholder that is replaced with ``item_text``.
        run_key: Key of the current run inside ``runs``.
        runs: Shared, mutable dict of all runs; updated in place.
        runs_file: Path handed to ``save_json_file`` after every update.
        lock: Lock guarding every access to ``runs`` from worker threads.
        judge_model: Identifier passed to ``send_to_judge_model``.
        save_raw_judge_output: When true, the raw judge response is stored too.

    Returns:
        None. Results (or an error record) are written into ``runs`` and saved.
    """
    global should_exit
    if should_exit:
        return

    text_len = len(item_text)

    # Touch the shared structure only while holding the lock: other workers hand
    # `runs` to save_json_file under the same lock, and inserting keys into a dict
    # that is being walked at that moment can raise
    # "RuntimeError: dictionary changed size during iteration".
    with lock:
        run_data = runs.get(run_key, {})
        results = run_data.get("results", {})
        model_dict = results.setdefault(model_name, {})
        iteration_dict = model_dict.setdefault(iteration_key, {})
        existing_item = iteration_dict.get(item_id, {})
        already_scored = bool(
            existing_item
            and "aggregated_score_raw" in existing_item
            and existing_item.get("parsed_scores")
            and len(existing_item["parsed_scores"]) >= 10
            and existing_item["aggregated_score_raw"] > 0.0
        )
    if already_scored:
        return

    try:
        final_prompt = prompt_template.replace("[TEST MODEL RESPONSE]", item_text)
        final_prompt = final_prompt.replace("[TEST MODEL RESPONSE END]", "")
        messages = [{"role": "user", "content": final_prompt}]

        # The judge call is deliberately made outside the lock so that workers
        # can wait on it concurrently.
        judge_response = send_to_judge_model(messages, judge_model=judge_model)
        extracted_scores = parse_scores(judge_response)
        raw_score = compute_raw_score(extracted_scores)

        with lock:
            storage_dict = {
                "parsed_scores": extracted_scores,
                "timestamp": datetime.now().isoformat(),
                "text_length": text_len
            }
            # compute_raw_score returns None when too few scores were parsed; in
            # that case the key is omitted so the item is retried on the next run.
            if raw_score is not None:
                storage_dict["aggregated_score_raw"] = raw_score
            if save_raw_judge_output:
                storage_dict["judge_response"] = judge_response
            iteration_dict[item_id] = storage_dict
            runs[run_key]["results"][model_name][iteration_key] = iteration_dict
            save_json_file(runs, runs_file)

        if raw_score is not None:
            logging.debug(f"Processed {model_name}/{iteration_key}/{item_id}, raw score: {raw_score:.2f}")
        else:
            logging.warning(f"Failed to parse enough scores for {model_name}/{iteration_key}/{item_id}")
    except Exception as e:
        # Top-level worker boundary: record the failure against the item and in
        # the run-level error list instead of letting the thread die silently.
        logging.error(f"Error processing item {model_name}/{iteration_key}/{item_id}: {str(e)}")
        with lock:
            iteration_dict[item_id] = {
                "error": str(e),
                "timestamp": datetime.now().isoformat()
            }
            if "errors" not in runs[run_key]:
                runs[run_key]["errors"] = []
            runs[run_key]["errors"].append({
                "model": model_name,
                "iteration": iteration_key,
                "item_id": item_id,
                "error": str(e)
            })
            save_json_file(runs, runs_file)
def _iter_scored_items(iteration_data: dict):
    """Yield (iteration_key, item_id, item_info) for every item holding a raw score."""
    for it_key, it_val in iteration_data.items():
        if it_key == "__model_stats__" or not isinstance(it_val, dict):
            continue
        for item_id, item_info in it_val.items():
            if isinstance(item_info, dict) and "aggregated_score_raw" in item_info:
                yield it_key, item_id, item_info


def _compute_final_judgemark(run_data: dict, label: str, random_tau: float) -> float:
    """Combine the normalized components for ``label`` into one Judgemark score.

    ``label`` is ``"raw"`` or ``"calibrated"``. Writes ``{label}_score_range`` and
    ``final_judgemark_score_elements_{label}`` into ``run_data``, adds the derived
    normalized values to that label's ``normalized_components`` dict, and returns
    the final score.
    """
    norm = run_data[f"{label}_cross_model_stats"]["normalized_components"]
    separability_stats = run_data["separability_metrics"][label]

    emd_norm = normalize(separability_stats["emd"]["average"], 0, 4)
    norm["ci99_overlap_magnitude_sum_norm"] = normalize(
        separability_stats["ci99_overlap_magnitude_sum"], 0, 26, False
    )

    # Range of per-model mean scores
    model_means = [stats["mean"] for stats in run_data[f"{label}_model_stats"].values()]
    score_range = max(model_means) - min(model_means)
    run_data[f"{label}_score_range"] = score_range
    range_key = f"{label}_score_range_norm"
    norm[range_key] = normalize(score_range, 0, 8)

    # Kendall's tau from the randomization-based stability measure
    norm["kendall_tau_bootstrapped"] = normalize(random_tau, 0.4, 1.0)

    # Aggregated separability metric (equal weights)
    separability = (
        norm["std_dev"]                              # std deviation *between* models
        + norm["kw_stat"]                            # kruskal-wallis
        + norm["ci99_overlap_magnitude_sum_norm"]    # CI overlap between adjacently ranked models
        + norm[range_key]                            # range of assigned scores
        + separability_stats["modulated_ci95"]       # average ci95 per model scored
        + emd_norm                                   # earth-movers distance
    ) / 6.0

    final_score = (
        norm["kendall_tau_bootstrapped"]   # correlation between iterations (ranking stability)
        + norm["kendall_tau"]              # correlation with the reference model scores
        + 4 * separability                 # aggregate of separability metrics
    ) / 6.0

    run_data[f"final_judgemark_score_elements_{label}"] = {
        "norm_stability_between_iterations": norm["kendall_tau_bootstrapped"],
        "norm_correlation_with_lmsys_arena": norm["kendall_tau"],
        "norm_std_dev_between_models": norm["std_dev"],
        "norm_kruskall_wallis": norm["kw_stat"],
        "norm_ci99_adjacent_overlap": norm["ci99_overlap_magnitude_sum_norm"],
        "norm_score_range": norm[range_key],
        "norm_intra_model_ci95": separability_stats["modulated_ci95"],
        # The scalar EMD component; storing the whole `norm` dict here was a bug.
        "norm_earth_movers_distance": emd_norm
    }
    return final_score


def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data: dict):
    """
    Compute metrics for both raw and calibrated scores, including normalized
    components and detailed distributions, and store them on the run.

    Args:
        runs: Dict of all runs; ``runs[run_key]`` is updated in place.
        run_key: Key of the run to finalize.
        samples_data: Sample texts keyed model -> "samples" -> iteration -> item,
            used for text lengths and passed on to the chart helper.

    Side effects:
        Adds distributions, calibration config, per-model and cross-model stats,
        separability/stability metrics, ``final_judgemark_score_raw`` and
        ``final_judgemark_score`` (calibrated) to the run, writes a calibrated
        score onto every scored item, creates charts and logs a summary.
        If the run holds no scored items, an error is logged and nothing is computed.
    """
    run_data = runs[run_key]
    results = run_data.get("results", {})

    # 1. Collect raw scores
    raw_scores_by_model_all = defaultdict(list)
    raw_scores_by_model_by_iter = defaultdict(lambda: defaultdict(list))
    calibrated_scores_by_model_all = defaultdict(list)
    calibrated_scores_by_model_by_iter = defaultdict(lambda: defaultdict(list))
    lengths_by_model = {}

    for model_name, iteration_data in results.items():
        if not isinstance(iteration_data, dict):
            continue
        lengths = []
        for it_key, item_id, item_info in _iter_scored_items(iteration_data):
            raw_score = item_info["aggregated_score_raw"]
            raw_scores_by_model_all[model_name].append(raw_score)
            raw_scores_by_model_by_iter[model_name][it_key].append(raw_score)
            # Track text length for the length/score correlation
            text = (samples_data.get(model_name, {})
                    .get("samples", {})
                    .get(it_key, {})
                    .get(item_id, ""))
            lengths.append(len(text))
        # Test `lengths` rather than indexing the defaultdict: indexing would
        # insert an empty score list for models without any scored item, which
        # later breaks the per-model statistics.
        if lengths:
            lengths_by_model[model_name] = lengths

    all_raw_scores = [s for scores in raw_scores_by_model_all.values() for s in scores]
    if not all_raw_scores:
        logging.error(f"No scored items found for run {run_key}; skipping final statistics.")
        return

    # 2. Distribution + calibration
    run_data["raw_score_distribution"] = compute_detailed_distribution(all_raw_scores)
    calibration_config = build_landmark_calibration_config(all_raw_scores, [0, 3, 5, 7, 10])
    run_data["calibration_config"] = calibration_config

    # Calibration is a pure per-score transform, so each item is calibrated where
    # it is found; no positional bookkeeping between separate walks is needed.
    for model_name, iteration_data in results.items():
        if not isinstance(iteration_data, dict):
            continue
        for it_key, _item_id, item_info in _iter_scored_items(iteration_data):
            calibrated = apply_landmark_calibration(
                item_info["aggregated_score_raw"], calibration_config
            )
            item_info["aggregated_score_calibrated"] = calibrated
            calibrated_scores_by_model_by_iter[model_name][it_key].append(calibrated)
            calibrated_scores_by_model_all[model_name].append(calibrated)

    # 3. Calibrated distributions
    all_calibrated_scores = [
        s for scores in calibrated_scores_by_model_all.values() for s in scores
    ]
    run_data["calibrated_score_distribution"] = compute_detailed_distribution(all_calibrated_scores)

    # 4. Model-level stats
    run_data["raw_model_stats"] = compute_model_level_stats(raw_scores_by_model_all, lengths_by_model)
    run_data["calibrated_model_stats"] = compute_model_level_stats(calibrated_scores_by_model_all, lengths_by_model)

    # 5. Cross-model stats
    run_data["raw_cross_model_stats"] = compute_cross_model_stats(
        scores_by_model_all=raw_scores_by_model_all,
        scores_by_model_by_iter=raw_scores_by_model_by_iter
    )
    run_data["calibrated_cross_model_stats"] = compute_cross_model_stats(
        scores_by_model_all=calibrated_scores_by_model_all,
        scores_by_model_by_iter=calibrated_scores_by_model_by_iter
    )

    # 6. Separability metrics
    compute_separability_metrics(run_data, raw_scores_by_model_all, label="raw")
    compute_separability_metrics(run_data, calibrated_scores_by_model_all, label="calibrated")

    # 7. Iteration stability for raw & calibrated
    compute_iteration_stability(run_data, label="raw")
    compute_iteration_stability(run_data, label="calibrated")
    random_tau_raw = compute_randomized_iteration_rank_stability_by_item(run_data, label="raw", n_shuffles=1000)
    random_tau_cal = compute_randomized_iteration_rank_stability_by_item(run_data, label="calibrated", n_shuffles=1000)
    logging.info("Score stability (RAW)")
    logging.info(f"Randomized average Kendall's tau (raw): {random_tau_raw:.3f}")
    logging.info("Score stability (CALIBRATED)")
    logging.info(f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} "
                 f"({run_data['calibrated_cross_model_stats']['kendall_tau']})")

    # 8. Final Judgemark scores (one from raw stats, one from calibrated stats)
    final_score_raw = _compute_final_judgemark(run_data, "raw", random_tau_raw)
    run_data["final_judgemark_score_raw"] = final_score_raw
    final_score_calibrated = _compute_final_judgemark(run_data, "calibrated", random_tau_cal)
    run_data["final_judgemark_score"] = final_score_calibrated

    # 9. Visualizations + logs
    create_side_by_side_score_charts(run_data, run_data["judge_model"], samples_data)
    log_score_summary(
        "RAW SCORES",
        run_data["raw_cross_model_stats"],
        run_data["raw_model_stats"]
    )
    log_score_summary(
        "CALIBRATED SCORES",
        run_data["calibrated_cross_model_stats"],
        run_data["calibrated_model_stats"]
    )
    logging.info(f"Final Judgemark (raw) = {final_score_raw:.3f}")
    logging.info(f"Final Judgemark (cal) = {final_score_calibrated:.3f}")
def sanitize_model_name(name: str) -> str:
    """Return ``name`` with each run of characters outside [a-zA-Z0-9_-] collapsed to one underscore."""
    disallowed_run = re.compile(r'[^a-zA-Z0-9_-]+')
    return disallowed_run.sub('_', name)
def run_judgemark_v2(
    judge_model: str,
    samples_file: str,
    prompts_file: str,
    runs_file: str,
    num_threads: int,
    run_id: str = None,
    save_raw_judge_output: bool = False
) -> str:
    """Run (or resume) a Judgemark-v2 evaluation for one judge model.

    Args:
        judge_model: Identifier of the judge model to query.
        samples_file: JSON file with the samples to judge
            (model -> "samples" -> iteration -> item id -> text).
        prompts_file: JSON file mapping item ids to judge prompt templates.
        runs_file: JSON file holding all runs; read at start, rewritten as results arrive.
        num_threads: Number of worker threads used for judging.
        run_id: Optional run identifier; a random UUID is used when omitted.
            Re-using an id resumes that run and only re-judges missing/failed items.
        save_raw_judge_output: Store the judge's raw response with every item.

    Returns:
        The run key, ``"<run id>__<sanitized judge model name>"``.
    """
    global executor, should_exit
    logging.info(f"Starting Judgemark-v2 using judge model: {judge_model}")
    runs = load_json_file(runs_file)

    # Form the run key using run_id + "__" + sanitized judge model
    sanitized_jm = sanitize_model_name(judge_model)
    base_id = run_id if run_id else str(uuid.uuid4())
    run_key = f"{base_id}__{sanitized_jm}"

    # Load data files
    samples_data = load_json_file(samples_file)
    judge_prompts = load_json_file(prompts_file)

    # Initialize or get existing run data
    if run_key not in runs:
        runs[run_key] = {
            "judge_model": judge_model,
            "start_time": datetime.now().isoformat(),
            "status": "running",
            "samples_file": samples_file,
            "prompts_file": prompts_file,
            "results": {}
        }
        save_json_file(runs, runs_file)
    run_data = runs[run_key]

    # A run loaded from disk may lack "results"; create it so that workers can
    # store into it. With an empty "results" every item needs processing, so a
    # single scan covers both fresh and resumed runs.
    results = run_data.setdefault("results", {})

    items_to_process = []
    for model_name, model_info in samples_data.items():
        samples_dict = model_info.get("samples", {})
        for iteration_key, iteration_items in samples_dict.items():
            for item_id, item_text in iteration_items.items():
                existing_result = (results.get(model_name, {})
                                   .get(iteration_key, {})
                                   .get(item_id, {}))
                needs_retry = (
                    not existing_result or
                    not existing_result.get("parsed_scores") or
                    len(existing_result.get("parsed_scores", {})) < 10 or
                    existing_result.get("aggregated_score_raw", 0.0) == 0.0 or
                    "error" in existing_result
                )
                if needs_retry:
                    items_to_process.append({
                        "model_name": model_name,
                        "iteration_key": iteration_key,
                        "item_id": item_id,
                        "item_text": item_text,
                        "prompt_template": judge_prompts.get(item_id, "")
                    })
    if items_to_process:
        logging.info(f"Found {len(items_to_process)} items to process in existing run {run_key}")
    else:
        logging.info(f"No items to process in existing run {run_key}")

    # Lock shared by all workers for access to `runs`
    lock = threading.Lock()

    # The stability test is currently switched off.
    stability_test_enabled = False

    all_futures = []
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as exec_:
            executor = exec_
            for item in items_to_process:
                if should_exit:
                    break
                all_futures.append(
                    executor.submit(
                        process_sample,
                        item["model_name"],
                        item["iteration_key"],
                        item["item_id"],
                        item["item_text"],
                        item["prompt_template"],
                        run_key,
                        runs,
                        runs_file,
                        lock,
                        judge_model,
                        save_raw_judge_output
                    )
                )
            # Display progress bar for tasks
            for f in tqdm(concurrent.futures.as_completed(all_futures),
                          total=len(all_futures), desc="Judging", leave=True):
                if should_exit:
                    break
                try:
                    f.result()
                except Exception as exc:
                    logging.error(f"Exception in worker thread: {exc}")
    except KeyboardInterrupt:
        logging.warning("KeyboardInterrupt caught in main thread.")
        should_exit = True
        time.sleep(0.1)
    finally:
        # Mark run as interrupted or completed
        status = "interrupted" if should_exit else "completed"
        runs[run_key]["status"] = status
        runs[run_key]["end_time"] = datetime.now().isoformat()
        if not should_exit:
            if stability_test_enabled:
                run_stability_test(
                    run_data, judge_model,
                    judge_prompts, samples_data,
                    runs, runs_file,
                    lock, num_threads
                )
            # Compute final stats
            finalize_scores_and_compute_judgemark(runs, run_key, samples_data)
        # Save final
        save_json_file(runs, runs_file)
        if executor:
            logging.info("Shutting down executor")
            executor.shutdown(wait=False)
            executor = None
    logging.info(f"Judgemark-v2 run {run_key} ended with status: {status}")
    return run_key
+275
View File
@@ -0,0 +1,275 @@
import math
import statistics
import numpy as np
import scipy.stats
import logging
from typing import Dict, List
import re
from config.constants import REFERENCE_MODEL_SCORES
from utils.stats import normalize
def parse_scores(judge_model_response: str) -> Dict[str,float]:
    """
    Pull every "<metric name>: <number>" pair out of the judge's reply.

    The metric name is whatever precedes the colon on the same line (surrounding
    whitespace removed); an optional literal "Score " may sit between the colon
    and the number. Numbers may be negative and may carry a decimal part.
    When a metric name occurs more than once, the last value wins.

    Examples of matching text:
        "Realism Score: 7.5"
        "Melodramatic: 2"
    """
    score_pattern = re.compile(r'(.*?):\s*(?:Score\s+)?(-?\d+(?:\.\d+)?)')
    return {
        found.group(1).strip(): float(found.group(2))
        for found in score_pattern.finditer(judge_model_response)
    }
def compute_raw_score(scores: Dict[str,float]) -> float:
    """
    Collapse a {criterion: score} mapping into one averaged score.

    Only scores within [0, 10] are used. Criteria whose lower-cased name contains
    one of the negative markers contribute (10 - score); all others contribute
    the score itself. The result is the mean, rounded to two decimals.
    Returns None when fewer than 10 in-range scores are available.
    """
    in_range = [(name, value) for name, value in scores.items() if 0 <= value <= 10]
    if len(in_range) < 10:
        return None

    markers = (
        "melodramatic", "shallow resolution", "unearned resolution",
        "simplistic moralizing", "shallow optimism", "forced optimism",
        "trite", "overwrought", "amateurish", "contrived", "uninspiring",
        "characters are too good", "incongruent ending positivity",
        "unearned transformations", "profundity over-reach",
        "amateurish descriptives", "clunky asides", "stilted dialogue",
        "tit-for-tat dialogue", "purple prose", "uncreative", "tell-don't-show",
        "weak dialogue", "meandering",
    )

    total = 0.0
    for name, value in in_range:
        lowered = name.lower().strip()
        is_negative = any(marker in lowered for marker in markers)
        total += (10 - value) if is_negative else value
    return round(total / len(in_range), 2)
def confidence_interval_95(data: List[float]) -> float:
    """
    Return the half-width of the 95% confidence interval for the mean of ``data``.

    Uses the normal approximation 1.96 * (sample stdev / sqrt(n)), which is
    reasonable for n > 30 or so. Returns 0.0 when fewer than two values are given.
    """
    n = len(data)
    if n < 2:
        return 0.0
    # n >= 2 here, so the sample standard deviation is always defined.
    return 1.96 * (statistics.stdev(data) / math.sqrt(n))
def compute_detailed_distribution(scores):
    """Summarise ``scores`` (count, min/max, mean, median, stdev, p10/p25/p75/p90).

    All values except the count are rounded to three decimals. An empty input
    yields an empty dict; a single score yields a stdev of 0.0.
    """
    if not scores:
        return {}

    def percentile(q):
        return round(float(np.percentile(scores, q)), 3)

    spread = statistics.stdev(scores) if len(scores) > 1 else 0.0
    summary = {
        "count": len(scores),
        "min": round(min(scores), 3),
        "max": round(max(scores), 3),
        "mean": round(statistics.mean(scores), 3),
        "median": round(statistics.median(scores), 3),
        "stdev": round(spread, 3),
    }
    for q in (10, 25, 75, 90):
        summary[f"p{q}"] = percentile(q)
    return summary
def compute_model_level_stats(scores_by_model, lengths_by_model):
    """Compute per-model summary statistics.

    Args:
        scores_by_model: Mapping of model name -> list of scores.
        lengths_by_model: Mapping of model name -> list of text lengths, aligned
            with the scores. Models may be missing from this mapping.

    Returns:
        Mapping of model name -> dict with count, mean, median, stdev, ci95, min
        and max. ``length_correlation`` (Pearson r between text length and score)
        is added when the lengths align with the scores and there are at least
        two points. Models without any score are omitted.
    """
    model_stats = {}
    for model_name, scores in scores_by_model.items():
        if not scores:
            # mean/min/max are undefined for an empty list; skip instead of crashing.
            continue
        lengths = lengths_by_model.get(model_name, [])
        stats = {
            "count": len(scores),
            "mean": statistics.mean(scores),
            "median": statistics.median(scores),
            "stdev": statistics.stdev(scores) if len(scores) > 1 else 0.0,
            "ci95": confidence_interval_95(scores),
            "min": min(scores),
            "max": max(scores)
        }
        # Length correlation: pearsonr needs at least two aligned observations.
        if len(lengths) == len(scores) and len(scores) >= 2:
            corr, _ = scipy.stats.pearsonr(lengths, scores)
            stats["length_correlation"] = corr
        model_stats[model_name] = stats
    return model_stats
def compute_cross_model_stats(scores_by_model_all, scores_by_model_by_iter):
    """
    Compute statistics across models.

    ANOVA, Kruskal-Wallis and the standard deviation of the model means use every
    score of every model. Pearson and Kendall correlations against
    REFERENCE_MODEL_SCORES are computed once per iteration (on the per-model mean
    of that iteration) and then averaged; an iteration with fewer than two
    reference models contributes 0.0 to both averages.
    """
    per_model_scores = list(scores_by_model_all.values())
    f_stat, f_p = scipy.stats.f_oneway(*per_model_scores)
    kw_stat, kw_p = scipy.stats.kruskal(*per_model_scores)

    # Spread of the model means over the entire distribution
    std_across_models = statistics.pstdev(
        [statistics.mean(model_scores) for model_scores in per_model_scores]
    )

    # Every iteration key seen for any model
    all_iterations = set()
    for per_iter in scores_by_model_by_iter.values():
        all_iterations.update(per_iter.keys())

    pearson_per_iter = []
    kendall_per_iter = []
    for iteration in all_iterations:
        observed = []
        reference = []
        for model, per_iter in scores_by_model_by_iter.items():
            iter_scores = per_iter.get(iteration)
            if not iter_scores:
                continue
            iter_mean = statistics.mean(iter_scores)
            if model in REFERENCE_MODEL_SCORES:
                observed.append(iter_mean)
                reference.append(REFERENCE_MODEL_SCORES[model])

        if len(observed) < 2:
            # Not enough reference models for a correlation in this iteration
            pearson_per_iter.append(0.0)
            kendall_per_iter.append(0.0)
            continue
        p_r, _ = scipy.stats.pearsonr(observed, reference)
        k_tau, _ = scipy.stats.kendalltau(observed, reference)
        pearson_per_iter.append(p_r)
        kendall_per_iter.append(k_tau)

    # Final correlation = average of the iteration-level correlations
    pearson_r = statistics.mean(pearson_per_iter) if pearson_per_iter else 0.0
    kendall_tau = statistics.mean(kendall_per_iter) if kendall_per_iter else 0.0

    normalized = {
        "pearson_r": normalize(pearson_r, 0.7, 1.0),
        "kendall_tau": normalize(kendall_tau, 0.1, 1.0),
        "anova_f": normalize(f_stat, 0.0, 350.0),
        "kw_stat": normalize(kw_stat, 0.0, 1500.0),
        "std_dev": normalize(std_across_models, 0.0, 2.2)
    }
    return {
        "anova_f": f_stat,
        "anova_p": f_p,
        "kw_stat": kw_stat,
        "kw_p": kw_p,
        "std_dev_across_models": std_across_models,
        "pearson_r": pearson_r,
        "kendall_tau": kendall_tau,
        "normalized_components": normalized
    }
def build_landmark_calibration_config(scores, desired_points=None):
    """
    Build the configuration for a piecewise-linear calibration.

    The five landmarks of the raw distribution, [min, Q1, median, Q3, max], are
    paired with ``desired_points`` (default [0, 3, 5, 7, 10]). With fewer than
    two scores there is no meaningful distribution, and both landmark lists are
    returned empty.
    """
    config = {"method": "piecewise_landmark"}

    if not scores or len(scores) < 2:
        # Degenerate case: nothing to calibrate against
        config["in_landmarks"] = []
        config["out_landmarks"] = []
        return config

    targets = [0, 3, 5, 7, 10] if desired_points is None else desired_points
    lower_quartile = float(np.percentile(scores, 25))
    middle = float(statistics.median(scores))
    upper_quartile = float(np.percentile(scores, 75))

    config["in_landmarks"] = [min(scores), lower_quartile, middle, upper_quartile, max(scores)]
    config["out_landmarks"] = targets
    return config
def apply_landmark_calibration(x, config):
    """
    Map ``x`` through the piecewise-linear transform described by ``config``.

    ``config["in_landmarks"]`` holds [min, Q1, median, Q3, max] of the raw scores
    and ``config["out_landmarks"]`` the five target values. Values below the
    minimum or above the maximum are extrapolated along the first or last
    segment. If either list does not have exactly five entries, ``x`` is
    returned unchanged. A segment of (near) zero width maps to its lower target.
    """
    in_marks = config.get("in_landmarks", [])
    out_marks = config.get("out_landmarks", [])
    if len(in_marks) != 5 or len(out_marks) != 5:
        # Invalid or degenerate config: leave the score untouched
        return x

    # Pick the segment: the first of Q1 / median / Q3 that x does not exceed;
    # anything above Q3 falls into the last segment.
    segment = 3
    for upper_index in (1, 2, 3):
        if x <= in_marks[upper_index]:
            segment = upper_index - 1
            break

    src_lo, src_hi = in_marks[segment], in_marks[segment + 1]
    dst_lo, dst_hi = out_marks[segment], out_marks[segment + 1]
    if abs(src_hi - src_lo) < 1e-12:
        return dst_lo
    fraction = (x - src_lo) / (src_hi - src_lo)
    return dst_lo + fraction * (dst_hi - dst_lo)
def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict):
    """Write the cross-model statistics and a per-model ranking (best mean first) to the log."""
    header_lines = (
        f"\n------- {score_type} Summary -------",
        f"ANOVA F-value: {cross_stats['anova_f']:.4f}, p={cross_stats['anova_p']:.4f}",
        f"Kruskal-Wallis: {cross_stats['kw_stat']:.4f}, p={cross_stats['kw_p']:.4f}",
        f"Pearson r={cross_stats['pearson_r']:.4f}",
        f"Kendall τ={cross_stats['kendall_tau']:.4f}",
        f"Std.Dev across models: {cross_stats['std_dev_across_models']:.4f}",
        "\nModel Scores:",
    )
    for text in header_lines:
        logging.info(text)

    ranked = sorted(model_stats.items(), key=lambda entry: entry[1]["mean"], reverse=True)
    for name, per_model in ranked:
        logging.info(f"{name:.<40} {per_model['mean']:.3f} ±{per_model['ci95']:.3f}")

    logging.info("------------------------------------")
+233
View File
@@ -0,0 +1,233 @@
import logging
import math
import statistics
import numpy as np
import scipy.stats
from typing import Dict, List
from utils.stats import normalize
try:
from scipy.stats import wasserstein_distance
HAS_WASSERSTEIN = True
except ImportError:
HAS_WASSERSTEIN = False
def cohen_d(scores1: List[float], scores2: List[float]) -> float:
    """
    Compute Cohen's d for two sets of scores.

    d = (mean2 - mean1) / pooled_stdev, where the pooled variance is
    ((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2) and s^2 is the *sample*
    variance of each group. Returns 0.0 when either group has fewer than two
    scores or when the pooled variance is (numerically) zero.
    """
    if len(scores1) < 2 or len(scores2) < 2:
        return 0.0
    mean1, mean2 = statistics.mean(scores1), statistics.mean(scores2)
    # The (n - 1) weights below belong with sample variances; combining them with
    # population variances would understate the pooled spread and inflate d.
    var1, var2 = statistics.variance(scores1), statistics.variance(scores2)
    n1, n2 = len(scores1), len(scores2)
    pooled_var = ((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2)
    if pooled_var <= 1e-12:
        return 0.0
    return (mean2 - mean1) / math.sqrt(pooled_var)
def ci_interval(scores: List[float], ci_level=0.99) -> tuple[float, float]:
    """
    Return the (low, high) confidence interval of the mean at ``ci_level``.

    The interval is mean ± z * (sample stdev / sqrt(n)) with z taken from the
    normal distribution. A single score yields (score, score); an empty list
    yields (0.0, 0.0).
    """
    n = len(scores)
    if n < 2:
        # Trivial or empty input: a zero-width interval
        center = statistics.mean(scores) if n == 1 else 0.0
        return (center, center)

    center = statistics.mean(scores)
    spread = statistics.stdev(scores)
    z = scipy.stats.norm.ppf(0.5 + ci_level/2.0)  # ~2.575 for a 99% CI
    margin = z * (spread / math.sqrt(n))
    return (center - margin, center + margin)
def ci_intervals_overlap(ci1: tuple[float, float], ci2: tuple[float, float]) -> bool:
    """
    Tell whether two (low, high) intervals share at least one point.

    Intervals that merely touch at an endpoint count as overlapping.
    """
    if ci1[1] < ci2[0]:
        # ci1 ends before ci2 starts
        return False
    if ci2[1] < ci1[0]:
        # ci2 ends before ci1 starts
        return False
    return True
def compute_distributions_distance(scores_by_model: Dict[str, List[float]]):
    """
    Compute the earth mover's (Wasserstein) distance for every pair of models.

    Returns a dict with "pairs" (distance per "<modelA>__<modelB>" key) and
    "average" (mean over the pairs that could be computed). A pair is recorded
    as -1.0, and left out of the average, when the distance function is
    unavailable or either model has no scores.
    """
    names = list(scores_by_model.keys())
    pair_distances = {}
    total = 0.0
    valid_pairs = 0

    for position, first in enumerate(names):
        for second in names[position + 1:]:
            if HAS_WASSERSTEIN and scores_by_model[first] and scores_by_model[second]:
                dist = wasserstein_distance(scores_by_model[first], scores_by_model[second])
            else:
                dist = -1.0
            pair_distances[f"{first}__{second}"] = dist
            if dist >= 0.0:
                total += dist
                valid_pairs += 1

    average = (total / valid_pairs) if valid_pairs else 0.0
    return {
        "average": average,
        "pairs": pair_distances
    }
def compute_average_ci95(model_scores: Dict[str, List[float]]) -> float:
    """
    Compute the average 95% CI half-width across models.

    Each model contributes 1.96 * (sample stdev / sqrt(n)); a model with fewer
    than two scores contributes 0.0. Returns 0.0 for an empty mapping.
    """
    if not model_scores:
        return 0.0
    z95 = 1.96
    half_widths = [
        z95 * (statistics.stdev(scores) / math.sqrt(len(scores))) if len(scores) >= 2 else 0.0
        for scores in model_scores.values()
    ]
    # model_scores is non-empty here, so half_widths has at least one entry.
    return statistics.mean(half_widths)
def compute_separability_metrics(
    run_data: dict,
    scores_by_model: Dict[str, List[float]],
    label: str = "raw",
    scale_factor: float = 1.5,
) -> None:
    """
    Compute "separability" metrics for a set of per-model score lists and
    store them in ``run_data["separability_metrics"][label]``.

    Metrics recorded (models are ranked by mean score, highest first, and
    "adjacent" means neighbouring models in that ranking):
      - whether the 99% CIs of adjacent models overlap, and the fraction of
        adjacent pairs that do
      - the length of the overlap between adjacent 99% CIs after widening
        each interval by ``scale_factor``, plus the sum of those lengths
      - the average absolute Cohen's d over adjacent pairs
      - the pairwise distribution distances from compute_distributions_distance
      - the average CI95 half-width and a "modulated" variant (normalized
        CI95 multiplied by normalized Cohen's d)

    Args:
        run_data: Dictionary that receives the results. Any previous entry
            for ``label`` is replaced.
        scores_by_model: Mapping of model name -> list of scores. Models with
            no scores are treated as mean 0.0 with a (0.0, 0.0) interval.
        label: Key under which this batch of metrics is stored.
        scale_factor: Multiplier applied to each interval's half-width before
            the overlap length is measured (1.5 widens intervals by 50%).
    """

    def widen(interval: tuple[float, float], factor: float) -> tuple[float, float]:
        """Stretch an interval about its midpoint by ``factor``."""
        low, high = interval
        mid = (low + high) / 2.0
        half_width = (high - low) / 2.0
        new_half = factor * half_width
        return (mid - new_half, mid + new_half)

    run_data.setdefault("separability_metrics", {})
    run_data["separability_metrics"][label] = {}

    # --- 1) Per-model mean and 99% confidence interval ---------------------
    means = {}
    intervals = {}
    for name, values in scores_by_model.items():
        if not values:
            means[name] = 0.0
            intervals[name] = (0.0, 0.0)
            continue
        means[name] = statistics.mean(values)
        intervals[name] = ci_interval(values, ci_level=0.99)

    # Highest mean first; neighbours in this ranking form the adjacent pairs.
    ranked = sorted(means, key=means.get, reverse=True)
    neighbours = list(zip(ranked, ranked[1:]))

    # --- 2) Do adjacent (unscaled) 99% CIs overlap? -------------------------
    adjacent_overlap = {}
    overlap_count = 0
    for upper, lower in neighbours:
        overlaps = ci_intervals_overlap(intervals[upper], intervals[lower])
        adjacent_overlap[f"{upper}__{lower}"] = overlaps
        if overlaps:
            overlap_count += 1
    adj_frac_overlap = overlap_count / len(neighbours) if neighbours else 0.0

    # --- 3) Length of the overlap between widened adjacent CIs --------------
    adjacent_overlap_magnitude = {}
    sum_overlap_magnitude = 0.0
    for upper, lower in neighbours:
        low_a, high_a = widen(intervals[upper], scale_factor)
        low_b, high_b = widen(intervals[lower], scale_factor)
        shared_length = max(0.0, min(high_a, high_b) - max(low_a, low_b))
        adjacent_overlap_magnitude[f"{upper}__{lower}"] = shared_length
        sum_overlap_magnitude += shared_length

    # --- 4) Average |Cohen's d| over adjacent pairs -------------------------
    d_vals = [
        abs(cohen_d(scores_by_model[upper], scores_by_model[lower]))
        for upper, lower in neighbours
    ]
    avg_cohens_d = sum(d_vals) / len(d_vals) if d_vals else 0.0

    # --- 5) Distribution distance across all pairs --------------------------
    emd_data = compute_distributions_distance(scores_by_model)

    # --- 6) CI95 half-width, modulated by effect size -----------------------
    avg_ci95 = compute_average_ci95(scores_by_model)
    norm_ci95 = normalize(avg_ci95, 0.15, 0.45, False)
    norm_cohens_d = normalize(avg_cohens_d, 0, 0.4)
    modulated_ci95 = norm_ci95 * norm_cohens_d

    # --- Store results -------------------------------------------------------
    metrics_label = run_data["separability_metrics"][label]
    metrics_label.update({
        "ci99_overlap_adjacent": adjacent_overlap,
        "adjacent_overlap_fraction": adj_frac_overlap,
        "ci99_overlap_magnitude_adjacent": adjacent_overlap_magnitude,
        "ci99_overlap_magnitude_sum": sum_overlap_magnitude,
        "ci99_overlap_scale_factor": scale_factor,
        "average_cohens_d_adjacent": avg_cohens_d,
        "emd": emd_data,
        "average_ci95": avg_ci95,
        "modulated_ci95": modulated_ci95,
    })

    # --- Log a summary -------------------------------------------------------
    logging.info(f"\n--- {label.upper()} SEPARABILITY METRICS ---")
    logging.info(f"Adjacent 99% CI Overlap fraction: {adj_frac_overlap:.3f}")
    logging.info(f"Sum of adjacent 99% CI Overlap magnitude (scale={scale_factor}): "
                 f"{sum_overlap_magnitude:.3f}")
    logging.info(f"Avg. |Cohen's d| for adjacent pairs: {avg_cohens_d:.3f}")
    logging.info(f"Average EMD across all pairs: {emd_data['average']:.3f}")
    logging.info(f"Avg. CI95 half-width: {avg_ci95:.3f} (modulated: {modulated_ci95:.3f})")
+448
View File
@@ -0,0 +1,448 @@
import logging
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from typing import Dict
from utils.api import send_to_judge_model
from utils.file_io import save_json_file
from core.scoring import parse_scores, compute_raw_score
from config.constants import STABILITY_ITEMS, STABILITY_REPS
from utils.state import should_exit, executor
from collections import defaultdict
import statistics
import math
import scipy.stats
import random
import statistics
def extract_model_item_scores(run_data: dict, label: str = "raw"):
    """
    Gather per-item scores from ``run_data["results"]``.

    The result is shaped as
        model_item_scores[model][item_id] = {iteration_key: score}
    using ``aggregated_score_raw`` when ``label == "raw"`` and
    ``aggregated_score_calibrated`` for any other label. Entries that are not
    dicts, and scores that are not int/float, are skipped.

    Returns:
        (model_item_scores, sorted list of every iteration key whose value is
        a dict, even if none of its items yielded a usable score).
    """
    score_field = "aggregated_score_raw" if label == "raw" else "aggregated_score_calibrated"
    collected = defaultdict(lambda: defaultdict(dict))
    seen_iterations = set()

    for model_name, per_iteration in run_data.get("results", {}).items():
        if not isinstance(per_iteration, dict):
            continue
        for it_key, per_item in per_iteration.items():
            if not isinstance(per_item, dict):
                continue
            seen_iterations.add(it_key)
            for item_id, item_info in per_item.items():
                if not isinstance(item_info, dict):
                    continue
                value = item_info.get(score_field)
                # Only touch the nested defaultdicts for usable scores so no
                # empty entries are created as a side effect.
                if isinstance(value, (int, float)):
                    collected[model_name][item_id][it_key] = value

    return collected, sorted(seen_iterations)
def shuffle_iteration_assignments_by_item(
    model_item_scores,
    iteration_keys
):
    """
    Randomly redistribute each item's scores across the iteration keys.

    ``model_item_scores[model][item_id]`` is a dict of {iteration_key: score}.
    For every (model, item) an independent random permutation decides which
    original iteration's score lands in which iteration slot, so each slot
    still receives exactly one score per item.

    Items with fewer scores than there are iteration keys are skipped.

    Returns:
        {iteration_key: {model: [assigned scores...]}}; callers are expected
        to reduce the lists (e.g. to a mean) afterwards.
    """
    reassigned = defaultdict(lambda: defaultdict(list))
    slot_count = len(iteration_keys)

    for model_name, per_item in model_item_scores.items():
        for item_id, scores_by_iteration in per_item.items():
            if len(scores_by_iteration) < slot_count:
                # Incomplete item: cannot fill every iteration slot.
                continue

            # Scores laid out in the order of iteration_keys.
            ordered_scores = [scores_by_iteration[key] for key in iteration_keys]

            # source_order[k] is the original slot whose score goes to slot k.
            source_order = random.sample(range(slot_count), k=slot_count)
            for slot_key, source_idx in zip(iteration_keys, source_order):
                reassigned[slot_key][model_name].append(ordered_scores[source_idx])

    return reassigned
def compute_average_iteration_averages(iteration_averages):
    """
    Reduce {iteration_key: {model: [scores...]}} to
    {iteration_key: {model: mean_score}}.

    A model whose score list is empty gets 0.0.
    """
    return {
        it_key: {
            model: (statistics.mean(values) if values else 0.0)
            for model, values in per_model.items()
        }
        for it_key, per_model in iteration_averages.items()
    }
def compute_kendall_tau_for_iterations(iteration_averages):
    """
    Average Kendall's tau between the model rankings of every iteration pair.

    Args:
        iteration_averages: {iteration_key: {model_name: avg_score}}.

    Returns:
        Mean tau over all iteration pairs that share at least two models and
        yield a non-NaN tau; 0.0 when no pair qualifies.
    """
    iter_keys = sorted(iteration_averages.keys())

    # For each iteration, map model -> rank position (0 = highest score).
    # A dict lookup replaces repeated list.index() scans per compared pair.
    positions = {}
    for it_key in iter_keys:
        ranked = sorted(
            iteration_averages[it_key].items(),
            key=lambda pair: pair[1],
            reverse=True,
        )
        positions[it_key] = {model: rank for rank, (model, _) in enumerate(ranked)}

    pairwise_taus = []
    for idx, first_key in enumerate(iter_keys):
        for second_key in iter_keys[idx + 1:]:
            pos_first = positions[first_key]
            pos_second = positions[second_key]
            shared_models = pos_first.keys() & pos_second.keys()
            if len(shared_models) < 2:
                # A rank correlation needs at least two common models.
                continue
            x = [pos_first[m] for m in shared_models]
            y = [pos_second[m] for m in shared_models]
            tau, _ = scipy.stats.kendalltau(x, y)
            if not math.isnan(tau):
                pairwise_taus.append(tau)

    return statistics.mean(pairwise_taus) if pairwise_taus else 0.0
def compute_randomized_iteration_rank_stability_by_item(
    run_data: dict,
    label: str = "raw",
    n_shuffles: int = 1000
) -> float:
    """
    Estimate ranking stability under random re-assignment of item scores.

    In each of ``n_shuffles`` trials, every item's scores are randomly
    permuted across the iteration keys (each iteration still gets one score
    per item), the per-iteration model means are recomputed, and the average
    Kendall's tau between iteration rankings is taken. The final value is the
    mean of those per-trial taus.

    The result is stored in
    run_data["iteration_stability"][label]["randomized_average_kendall_tau_by_item"]
    and also returned. With fewer than two iteration keys the function
    returns 0.0 without storing anything.
    """
    model_item_scores, iteration_keys = extract_model_item_scores(run_data, label=label)
    if len(iteration_keys) < 2:
        # Nothing to compare across iterations.
        return 0.0

    def run_trial():
        """Shuffle once, average per model, and score the rank agreement."""
        shuffled = shuffle_iteration_assignments_by_item(model_item_scores, iteration_keys)
        averaged = compute_average_iteration_averages(shuffled)
        return compute_kendall_tau_for_iterations(averaged)

    tau_values = [run_trial() for _ in range(n_shuffles)]
    randomized_avg_tau = statistics.mean(tau_values) if tau_values else 0.0

    # Create the nested containers on demand, keeping any existing entries.
    label_store = run_data.setdefault("iteration_stability", {}).setdefault(label, {})
    label_store["randomized_average_kendall_tau_by_item"] = randomized_avg_tau
    return randomized_avg_tau
def compute_iteration_stability(run_data: dict, label="raw"):
    """
    Measure how stable scores and rankings are across iterations.

    Steps:
      1) Average each model's item scores within every iteration.
      2) Scoring stability: per model, the mean and population stdev of its
         iteration averages.
      3) Ranking stability: Kendall's tau between the model rankings of every
         pair of iterations, plus the average of the non-NaN taus.

    Results are written to ``run_data["iteration_stability"][label]``; any
    existing entry for ``label`` is replaced.

    Args:
        run_data: Run dictionary; scores are read from ``run_data["results"]``
            as model -> iteration -> item -> info dict.
        label: "raw" reads ``aggregated_score_raw``; any other value reads
            ``aggregated_score_calibrated``.
    """
    if "iteration_stability" not in run_data:
        run_data["iteration_stability"] = {}
    run_data["iteration_stability"][label] = {}
    stability = run_data["iteration_stability"][label]

    results = run_data.get("results", {})
    score_field = "aggregated_score_raw" if label == "raw" else "aggregated_score_calibrated"

    # 1) iteration_averages[iter_key][model] = mean item score
    iteration_averages = defaultdict(dict)
    for model_name, model_data in results.items():
        if not isinstance(model_data, dict):
            continue
        for iter_key, item_dict in model_data.items():
            if not isinstance(item_dict, dict):
                continue
            scores = []
            for item_info in item_dict.values():
                if not isinstance(item_info, dict):
                    continue
                val = item_info.get(score_field)
                if isinstance(val, (int, float)):
                    scores.append(val)
            if scores:
                iteration_averages[iter_key][model_name] = statistics.mean(scores)

    # 2) Scoring stability: spread of each model's iteration averages.
    all_iter_keys = sorted(iteration_averages.keys())
    model_stability = {}
    for model_name in results.keys():
        # A model may be missing from some iterations.
        vals = [
            iteration_averages[it_key][model_name]
            for it_key in all_iter_keys
            if model_name in iteration_averages[it_key]
        ]
        if len(vals) > 1:
            mean_score = statistics.mean(vals)
            spread = statistics.pstdev(vals)
        else:
            mean_score = vals[0] if vals else 0.0
            spread = 0.0
        model_stability[model_name] = {
            "mean_iter_score": mean_score,
            "iteration_count": len(vals),
            "stdev_across_iters": spread
        }
    stability["scoring_stability"] = model_stability

    # 3) Ranking stability. For each iteration, map model -> rank position
    #    (0 = highest average) so pair comparisons use O(1) lookups.
    positions = {}
    for iter_key in all_iter_keys:
        ranked = sorted(
            iteration_averages[iter_key].items(),
            key=lambda pair: pair[1],
            reverse=True,
        )
        positions[iter_key] = {model: rank for rank, (model, _) in enumerate(ranked)}

    pairwise_rank_corr = {}
    for i, ik in enumerate(all_iter_keys):
        for jk in all_iter_keys[i + 1:]:
            pos_i = positions[ik]
            pos_j = positions[jk]
            # Only models present in both iterations can be compared.
            common_models = pos_i.keys() & pos_j.keys()
            x = [pos_i[m] for m in common_models]
            y = [pos_j[m] for m in common_models]
            if len(x) > 1:
                tau, pval = scipy.stats.kendalltau(x, y)
            else:
                # Too few common models for a correlation.
                tau, pval = (0.0, 1.0)
            pairwise_rank_corr[f"{ik}__vs__{jk}"] = {
                "common_model_count": len(common_models),
                "kendall_tau": tau,
                "p_value": pval
            }

    stability["ranking_stability"] = {
        "pairwise_correlation": pairwise_rank_corr
    }

    # Summary value: mean tau over iteration pairs, ignoring NaN results.
    if pairwise_rank_corr:
        all_taus = [
            entry["kendall_tau"]
            for entry in pairwise_rank_corr.values()
            if not math.isnan(entry["kendall_tau"])
        ]
        stability["ranking_stability"]["average_kendall_tau"] = (
            statistics.mean(all_taus) if all_taus else 0.0
        )
def process_stability_test_item(model_name, iteration_key, item_id, item_text, prompt_template, judge_model) -> float:
    """
    Send one stability-test item to the judge and return its aggregated score.

    The item text is substituted into the prompt template, the judge's reply
    is parsed with ``parse_scores`` and reduced with ``compute_raw_score``.

    Returns:
        The score when it is a number greater than 0.0; otherwise 0.0. Any
        exception is logged and also reported as 0.0, as is an early exit
        when ``should_exit`` is set.
    """
    global should_exit
    if should_exit:
        return 0.0

    try:
        # Insert the sample, then drop the end-of-response marker.
        final_prompt = (
            prompt_template
            .replace("[TEST MODEL RESPONSE]", item_text)
            .replace("[TEST MODEL RESPONSE END]", "")
        )
        judge_response = send_to_judge_model(
            [{"role": "user", "content": final_prompt}],
            judge_model=judge_model,
        )
        item_score = compute_raw_score(parse_scores(judge_response))

        # Callers treat 0.0 as "failed", so never hand back None or a
        # non-positive value.
        if isinstance(item_score, (int, float)) and item_score > 0.0:
            return item_score
        return 0.0
    except Exception as e:
        logging.error(f"Error in stability test item {model_name}/{iteration_key}/{item_id}: {str(e)}")
        return 0.0
def run_stability_test(run_data, judge_model, judge_prompts, samples_data, runs, runs_file, lock, num_threads):
"""
Run stability test, retrying any missing entries to reach STABILITY_REPS per item.

For every (model, iteration, item_id) in STABILITY_ITEMS the stored results
under run_data["stability_test_results"]["<model>-<iteration>-<item_id>"]
are filtered down to numeric scores > 0.0, and one judge request is queued
for each repetition still missing. Requests run on a thread pool; each valid
score is appended to the stored list and the runs file is saved.

Args:
run_data: Dict for the current run; results are kept under "stability_test_results".
judge_model: Judge identifier forwarded to process_stability_test_item.
judge_prompts: Mapping of item_id -> prompt template ("" if missing).
samples_data: Nested mapping model -> "samples" -> iteration -> item_id -> text ("" if missing).
runs: Object written to runs_file via save_json_file (presumably contains run_data; verify against caller).
runs_file: Path passed to save_json_file.
lock: Context manager held while run_data is updated.
num_threads: max_workers for the thread pool.
"""
logging.info("Running stability test for selected items...")
if "stability_test_results" not in run_data:
run_data["stability_test_results"] = {}
# One queued dict per judge request that still has to be made.
items_to_process = []
for (model, iteration, item_id) in STABILITY_ITEMS:
key_name = f"{model}-{iteration}-{item_id}"
existing_results = run_data["stability_test_results"].get(key_name, [])
# Filter out failed results (0.0 scores, None values) from existing
valid_results = [score for score in existing_results
if isinstance(score, (int, float)) and score > 0.0]
# How many repetitions are still missing for this item.
needed_count = STABILITY_REPS - len(valid_results)
if needed_count > 0:
# Missing sample text or prompt template falls back to "" rather than raising.
item_text = samples_data.get(model, {}).get("samples", {}).get(iteration, {}).get(item_id, "")
prompt_template = judge_prompts.get(item_id, "")
for _ in range(needed_count):
items_to_process.append({
"model": model,
"iteration": iteration,
"item_id": item_id,
"item_text": item_text,
"prompt_template": prompt_template,
"key_name": key_name
})
logging.info(f"Need {needed_count} more stability test results for {key_name}")
# Clean up existing results, keeping only valid scores
run_data["stability_test_results"][key_name] = valid_results
if not items_to_process:
logging.info("All stability test items already have complete results")
return
with ThreadPoolExecutor(max_workers=num_threads) as exec_:
# Maps each submitted future back to the work item it was created for.
futures_to_items = {}
# Launch futures for all needed retries
for item in items_to_process:
if should_exit:
break
future = exec_.submit(
process_stability_test_item,
item["model"], item["iteration"], item["item_id"],
item["item_text"], item["prompt_template"],
judge_model
)
futures_to_items[future] = item
# Collect results as they complete
for future in tqdm(concurrent.futures.as_completed(futures_to_items.keys()),
total=len(futures_to_items),
desc="Stability Test"):
if should_exit:
break
try:
score = future.result()
item = futures_to_items[future]
key_name = item["key_name"]
# Only store actual valid scores
if isinstance(score, (int, float)) and score > 0.0:
with lock:
run_data["stability_test_results"][key_name].append(score)
# Persist after every valid score.
save_json_file(runs, runs_file)
else:
# Failed requests come back as 0.0; they are not resubmitted within this call.
logging.warning(f"Got invalid score for stability item {key_name}, will need retry")
except Exception as exc:
logging.error(f"Exception in stability test: {exc}")
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+125
View File
@@ -0,0 +1,125 @@
import sys
import signal
import argparse
import logging
import time
from utils.logging_setup import setup_logging, get_verbosity
from utils.file_io import load_json_file
from core.benchmark import run_judgemark_v2
from utils.api import API_KEY
from utils.state import should_exit, executor
def signal_handler(signum, frame):
"""
Handle interrupt signals (SIGINT, SIGTERM).

Sets the module-level should_exit flag, shuts down the shared executor
(without waiting) if one is set, and exits the process with status 1.
"""
global executor, should_exit
print(f"\n[DEBUG] Signal {signum} caught!")
logging.warning("Signal handler called")
# NOTE(review): should_exit and executor are imported by name from utils.state,
# so this assignment rebinds this module's global only. Presumably other modules
# that imported the flag the same way keep their own binding; verify that
# worker code actually observes the change.
should_exit = True
time.sleep(0.1) # Give workers a moment to see the flag
if executor:
logging.info("Shutting down executor from signal handler")
# wait=False: return immediately instead of blocking on running tasks.
executor.shutdown(wait=False)
logging.info("Executor shutdown complete")
sys.exit(1)
def parse_args():
    """
    Parse the benchmark's command-line options from ``sys.argv``.

    Returns:
        argparse.Namespace with judge_model, samples_file, prompts_file,
        runs_file, run_id, threads, verbosity, num_runs and
        save_raw_judge_output.
    """
    # (flag, add_argument keyword arguments), in the order shown by --help.
    option_specs = [
        ('--judge-model', dict(
            required=True,
            help='Judge model identifier (e.g., openai/gpt-4)',
        )),
        ('--samples-file', dict(
            default="data/judgemark_v2.1_samples.json",
            help='JSON file containing pre-generated samples from various writer models',
        )),
        ('--prompts-file', dict(
            default="data/judge_prompts.json",
            help='JSON file containing the partial judge prompts to be filled with test responses',
        )),
        ('--runs-file', dict(
            default="judgemark_v2_runs.json",
            help='Path to store the Judgemark run results',
        )),
        ('--run-id', dict(
            help='Resume (or create) a run using this base ID, to be combined with the judge model name',
        )),
        ('--threads', dict(
            type=int,
            default=6,
            help='Number of threads to use',
        )),
        ('--verbosity', dict(
            choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
            help='Set the logging verbosity level',
        )),
        ('--num-runs', dict(
            type=int,
            default=1,
            help='Number of benchmark runs to execute',
        )),
        ('--save-raw-judge-output', dict(
            action='store_true',
            default=False,
            help='If set, store the raw judge model output in the results JSON (default: false)',
        )),
    ]

    parser = argparse.ArgumentParser(description='Run Judgemark-v2 Benchmark')
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser.parse_args()
if __name__ == "__main__":
    # Route both interrupt and termination signals through the same handler.
    for handled_signal in (signal.SIGINT, signal.SIGTERM):
        signal.signal(handled_signal, signal_handler)

    # Start with the exit sentinel cleared.
    should_exit = False

    args = parse_args()

    # Logging level comes from --verbosity, falling back to the environment.
    setup_logging(get_verbosity(args.verbosity))
    logging.debug("Logging initialized")

    # The judge model cannot be called without an API key.
    if not API_KEY:
        logging.critical("No OPENAI_API_KEY found in environment variables.")
        raise ValueError("OPENAI_API_KEY not found in environment variables.")

    # Execute the requested number of runs, stopping early on an exit request.
    run_ids = []
    for run_number in range(1, args.num_runs + 1):
        if should_exit:
            break
        logging.info(f"Starting Judgemark-v2 run {run_number} of {args.num_runs}")
        run_ids.append(
            run_judgemark_v2(
                judge_model=args.judge_model,
                samples_file=args.samples_file,
                prompts_file=args.prompts_file,
                runs_file=args.runs_file,
                num_threads=args.threads,
                run_id=args.run_id,
                save_raw_judge_output=args.save_raw_judge_output
            )
        )

    # Report the final score of every run, both to the log and to stdout.
    runs = load_json_file(args.runs_file)
    logging.info("\nAll Judgemark-v2 runs completed:")
    print("\nAll Judgemark-v2 runs completed:")
    for completed_id in run_ids:
        final_score = runs.get(completed_id, {}).get("final_judgemark_score", "N/A")
        logging.info(f"Run ID: {completed_id}, Final Judgemark Score: {final_score}")
        print(f"Run ID: {completed_id}")
        print(f"Final Judgemark-v2 Score: {final_score}")
+3
View File
@@ -0,0 +1,3 @@
matplotlib>=3.7
transformers>=4.26
scipy>=1.10
Binary file not shown.

After

Width:  |  Height:  |  Size: 298 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 298 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 293 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 299 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 295 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 298 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 293 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 299 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 297 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 292 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 296 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 291 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 301 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 297 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

+16
View File
@@ -0,0 +1,16 @@
model,judgemark_score,judgemark_score_raw,kendall_tau_bootstrapped,std_dev,kw_stat,ci99_overlap_magnitude_sum_norm,calibrated_score_range_norm,modulated_ci95,emd_norm
ministral/Ministral-3b-instruct,7.5,15.06,0.0,0.099,0.013,0.094,0.858,0.012,0.363
gpt-4o-mini,44.05,44.35,0.628,0.438,0.298,0.324,3.149,0.109,1.187
deepseek-ai/deepseek-r1,76.97,71.77,0.863,0.821,0.905,0.683,6.098,0.6,2.195
claude-3.5-haiku-20241022,77.64,76.88,0.898,0.832,0.891,0.665,5.975,0.575,2.229
claude-3.5-sonnet-20240620,78.84,74.72,0.905,0.83,0.902,0.664,6.255,0.591,2.232
microsoft/wizardlm-2-8x22b,55.42,52.86,0.749,0.566,0.463,0.417,4.62,0.198,1.53
gpt-4o-2024-11-20,82.51,77.75,0.89,0.837,0.985,0.75,6.233,0.814,2.228
gemini-2.0-flash-thinking-exp-1219,71.79,66.1,0.818,0.765,0.817,0.631,5.519,0.453,2.057
deepseek-ai/deepseek-r1-distill-llama-70b,59.67,56.86,0.764,0.642,0.6,0.474,4.485,0.243,1.726
mistralai/mistral-large-instruct-2411,65.63,63.25,0.794,0.732,0.636,0.505,5.873,0.296,1.935
meta-llama/llama-3.1-8b-instruct,35.72,35.67,0.46,0.401,0.201,0.207,3.232,0.063,1.075
meta-llama/llama-3.1-405b-instruct,60.09,58.76,0.791,0.633,0.582,0.471,4.515,0.243,1.712
deepseek/deepseek-r1-distill-qwen-14b,,,,,,,,,
mistralai/mistral-small-24b-instruct-2501,63.64,61.23,0.808,0.67,0.618,0.504,5.194,0.286,1.802
liquid/lfm-7b,10.24,14.33,0.139,0.192,0.027,0.0,1.702,0.0,0.584
1 model judgemark_score judgemark_score_raw kendall_tau_bootstrapped std_dev kw_stat ci99_overlap_magnitude_sum_norm calibrated_score_range_norm modulated_ci95 emd_norm
2 ministral/Ministral-3b-instruct 7.5 15.06 0.0 0.099 0.013 0.094 0.858 0.012 0.363
3 gpt-4o-mini 44.05 44.35 0.628 0.438 0.298 0.324 3.149 0.109 1.187
4 deepseek-ai/deepseek-r1 76.97 71.77 0.863 0.821 0.905 0.683 6.098 0.6 2.195
5 claude-3.5-haiku-20241022 77.64 76.88 0.898 0.832 0.891 0.665 5.975 0.575 2.229
6 claude-3.5-sonnet-20240620 78.84 74.72 0.905 0.83 0.902 0.664 6.255 0.591 2.232
7 microsoft/wizardlm-2-8x22b 55.42 52.86 0.749 0.566 0.463 0.417 4.62 0.198 1.53
8 gpt-4o-2024-11-20 82.51 77.75 0.89 0.837 0.985 0.75 6.233 0.814 2.228
9 gemini-2.0-flash-thinking-exp-1219 71.79 66.1 0.818 0.765 0.817 0.631 5.519 0.453 2.057
10 deepseek-ai/deepseek-r1-distill-llama-70b 59.67 56.86 0.764 0.642 0.6 0.474 4.485 0.243 1.726
11 mistralai/mistral-large-instruct-2411 65.63 63.25 0.794 0.732 0.636 0.505 5.873 0.296 1.935
12 meta-llama/llama-3.1-8b-instruct 35.72 35.67 0.46 0.401 0.201 0.207 3.232 0.063 1.075
13 meta-llama/llama-3.1-405b-instruct 60.09 58.76 0.791 0.633 0.582 0.471 4.515 0.243 1.712
14 deepseek/deepseek-r1-distill-qwen-14b
15 mistralai/mistral-small-24b-instruct-2501 63.64 61.23 0.808 0.67 0.618 0.504 5.194 0.286 1.802
16 liquid/lfm-7b 10.24 14.33 0.139 0.192 0.027 0.0 1.702 0.0 0.584
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+51
View File
@@ -0,0 +1,51 @@
import os
import time
import logging
import requests
from typing import List, Dict
from dotenv import load_dotenv
# Load environment variables from .env if present
load_dotenv()
BASE_URL = os.getenv("OPENAI_API_URL", "https://openrouter.ai/api/v1/chat/completions")
API_KEY = os.getenv("OPENAI_API_KEY")
HEADERS = {
"Authorization": f"Bearer {API_KEY}",
"Content-Type": "application/json"
}
MAX_RETRIES = int(os.getenv("MAX_RETRIES", "3"))
RETRY_DELAY = int(os.getenv("RETRY_DELAY", "5"))
def send_to_judge_model(messages: List[Dict], judge_model: str, max_retries: int = MAX_RETRIES,
                        timeout: float = 600.0) -> str:
    """
    Send chat messages to the judge model, retrying on failure.

    Expects an OpenAI-compatible chat-completions endpoint at BASE_URL.

    Args:
        messages: Chat messages, e.g. [{"role": "user", "content": "..."}].
        judge_model: Model identifier placed in the request payload.
        max_retries: Maximum number of attempts.
        timeout: Seconds to wait on the HTTP request before it is treated as
            a failed attempt. Pass None to wait indefinitely (the previous
            behaviour). Without a timeout, a stalled connection would block
            the calling worker thread forever.

    Returns:
        The content of the first choice's message. An empty string is
        returned only when ``max_retries`` is less than 1.

    Raises:
        Exception: Whatever the final attempt raised (HTTP error, timeout,
            malformed response), after logging it.
    """
    # temp and top_k are set to produce diversity in judge outputs between runs,
    # but constrained to be near the model's best answer (since we are doing numerical scoring).
    # The payload does not change between attempts, so build it once.
    data = {
        "model": judge_model,
        "messages": messages,
        "temperature": 0.5,
        "top_k": 3,
        "max_tokens": 8096,
    }

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.post(BASE_URL, headers=HEADERS, json=data, timeout=timeout)
            response.raise_for_status()
            res_json = response.json()
            return res_json['choices'][0]['message']['content']
        except Exception as e:
            # Broad on purpose: this is the retry boundary for network, HTTP
            # and response-shape errors alike.
            logging.error(f"Error on attempt {attempt} for judge model {judge_model}: {e}")
            if attempt == max_retries:
                logging.critical(f"Max retries reached for judge model {judge_model}")
                raise
            time.sleep(RETRY_DELAY)
    return ""
+17
View File
@@ -0,0 +1,17 @@
import json
import logging
def load_json_file(file_path: str) -> dict:
    """
    Read and parse a JSON file.

    Returns:
        The parsed content, or an empty dict (with a warning logged) when the
        file does not exist. Other errors, such as invalid JSON, propagate.
    """
    try:
        handle = open(file_path, 'r', encoding='utf-8')
    except FileNotFoundError:
        logging.warning(f"File {file_path} not found, returning empty dict.")
        return {}
    with handle:
        return json.load(handle)
def save_json_file(data: dict, file_path: str):
    """
    Save a dict to disk as JSON (indent=2) without risking the existing file.

    The data is first written to a temporary file next to the target and then
    moved into place with os.replace. If serialization fails or the process
    is interrupted mid-write, the previous content of ``file_path`` stays
    intact instead of being left truncated.

    Raises:
        TypeError / ValueError: If ``data`` is not JSON-serializable.
        OSError: If the temporary file cannot be written or moved.
    """
    import os
    import threading

    # Unique per process and thread so concurrent savers never share a temp file.
    tmp_path = f"{file_path}.{os.getpid()}.{threading.get_ident()}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
        os.replace(tmp_path, file_path)
    except BaseException:
        # Do not leave a partial temp file behind; then surface the error.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
    logging.debug(f"Saved JSON data to {file_path}")
+24
View File
@@ -0,0 +1,24 @@
import os
import logging
def setup_logging(verbosity: str):
    """
    Configure the root logger for the given verbosity name.

    ``verbosity`` is matched case-insensitively against DEBUG, INFO, WARNING,
    ERROR and CRITICAL; any other name falls back to INFO.
    """
    level_names = ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL')
    levels_by_name = {name: getattr(logging, name) for name in level_names}
    chosen_level = levels_by_name.get(verbosity.upper(), logging.INFO)

    logging.basicConfig(
        level=chosen_level,
        format='%(asctime)s - %(levelname)s - %(threadName)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
def get_verbosity(args_verbosity: str) -> str:
    """Pick the log verbosity: the command-line value wins when it is truthy.

    Otherwise the LOG_VERBOSITY environment variable is used, defaulting to "INFO"
    when that variable is unset.
    """
    return args_verbosity or os.getenv("LOG_VERBOSITY", "INFO")
+3
View File
@@ -0,0 +1,3 @@
# Global control flags
# Module-level mutable state; this file only defines the initial values.
# NOTE(review): presumably a cooperative shutdown flag that workers poll and a
# signal handler sets to True — confirm against the modules that use it.
should_exit = False
# NOTE(review): presumably rebound to the active executor (e.g. a thread pool) once
# one is created so it can be shut down on exit — confirm against the caller.
executor = None
+11
View File
@@ -0,0 +1,11 @@
def clamp(x: float, low: float = 0.0, high: float = 1.0) -> float:
    """Restrict x to the range [low, high] (defaults to the unit interval)."""
    capped = min(x, high)
    return max(low, capped)
def normalize(val, min_val, max_val, bigger_is_better=True):
    """Rescale val linearly from [min_val, max_val] onto [0, 1].

    Returns 0.0 when the range is empty or inverted (max_val <= min_val). When
    bigger_is_better is False the scale is flipped, so min_val maps to 1 and
    max_val maps to 0. The result is passed through clamp before being returned.
    """
    if max_val <= min_val:
        return 0.0
    span = max_val - min_val
    fraction = (val - min_val) / span
    return clamp(fraction if bigger_is_better else 1.0 - fraction)
+240
View File
@@ -0,0 +1,240 @@
import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from typing import Dict
from scipy.stats import linregress
from scipy.stats import spearmanr, theilslopes
from config.constants import NEGATIVE_MARKERS, MODEL_NAME_REPLACEMENTS
def _collect_criterion_scores(results: Dict, model_names) -> Dict:
    """Return {model_name: [score, ...]} with every valid per-criterion score, negatives flipped.

    Walks results[model][iteration][item]["parsed_scores"]. Only numeric values in
    0..10 are kept; a value becomes ``10 - value`` when the lower-cased criterion name
    contains any entry of NEGATIVE_MARKERS. Non-dict levels are skipped.
    """
    scores_by_model = {m: [] for m in model_names}
    for model_name in model_names:
        iter_dict = results.get(model_name, {})
        for item_dict in iter_dict.values():
            if not isinstance(item_dict, dict):
                continue
            for item_info in item_dict.values():
                if not isinstance(item_info, dict):
                    continue
                parsed_scores = item_info.get("parsed_scores", {})
                if not isinstance(parsed_scores, dict):
                    continue
                for crit_name, val in parsed_scores.items():
                    if not (isinstance(val, (int, float)) and 0 <= val <= 10):
                        continue
                    crit_lower = crit_name.strip().lower()
                    is_negative = any(nm in crit_lower for nm in NEGATIVE_MARKERS)
                    scores_by_model[model_name].append(10 - val if is_negative else val)
    return scores_by_model


def _draw_score_bars(ax, model_names, means, cis, color: str, xlabel: str, title: str):
    """Draw one horizontal bar chart of per-model means with red 95% CI error bars."""
    y_pos = np.arange(len(model_names))
    ax.barh(y_pos, means, color=color, alpha=0.7)
    for i, (mean_val, ci95) in enumerate(zip(means, cis)):
        ax.errorbar(mean_val, i, xerr=ci95, color='red', capsize=5)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(model_names, fontsize=12)
    ax.invert_yaxis()  # first (best) model at the top
    ax.set_xlabel(xlabel, fontsize=14)
    ax.set_title(title, fontsize=16)
    ax.grid(True, alpha=0.3)
    ax.tick_params(axis='y', pad=10)


def _collect_length_score_pairs(model_name: str, results: Dict, samples_data: Dict):
    """Return (lengths, scores): text length and aggregated_score_raw for each scored item.

    The text is looked up at samples_data[model]["samples"][iteration][item]. Items
    with a non-numeric raw score or an empty/missing text are skipped.
    """
    length_vals = []
    score_vals = []
    model_res = results.get(model_name, {})
    for it_key, it_dict in model_res.items():
        if not isinstance(it_dict, dict):
            continue
        for item_id, item_info in it_dict.items():
            if not isinstance(item_info, dict):
                continue
            raw_score = item_info.get("aggregated_score_raw", None)
            if not isinstance(raw_score, (int, float)):
                continue
            text = (samples_data
                    .get(model_name, {})
                    .get("samples", {})
                    .get(it_key, {})
                    .get(item_id, ""))
            text_len = len(text)
            if text_len > 0:
                length_vals.append(text_len)
                score_vals.append(raw_score)
    return length_vals, score_vals


def _draw_length_scatter(ax, model_name: str, length_vals, score_vals):
    """Scatter length vs. raw score for one model, with a Theil-Sen line and Spearman stats."""
    ax.set_title(model_name, fontsize=12)
    ax.set_xlabel("Length")
    ax.set_ylabel("Raw Score")
    if len(length_vals) > 1:
        ax.scatter(length_vals, score_vals, alpha=0.4, color='blue')
        # Rank-based correlation (Spearman)
        rho, p_value = spearmanr(length_vals, score_vals)
        # Robust linear fit (Theil-Sen); the slope confidence bounds are not plotted.
        slope, intercept, _, _ = theilslopes(score_vals, length_vals, alpha=0.95)
        xline = np.linspace(min(length_vals), max(length_vals), 200)
        yline = slope * xline + intercept
        ax.plot(xline, yline, color='red', linewidth=2,
                label=f"Spearman ρ={rho:.2f}, p={p_value:.2g}")
        ax.legend(loc="best")
    else:
        ax.text(0.5, 0.5, "Not enough data",
                ha='center', va='center', color='red', transform=ax.transAxes)


def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_data: Dict):
    """
    Render and save two PNG figures summarising a judge run.

    Figure 1 has three subplots side by side:
      (1) Raw scores bar chart (+ 95% CI)
      (2) Calibrated scores bar chart (+ 95% CI)
      (3) Heatmap of the per-criterion score distribution for each model
          (10 - score for criteria matching NEGATIVE_MARKERS), in ten unit-wide bins.
    Figure 2 is a 4x4 grid of scatter plots, one per model (at most 16), showing
    item length in characters vs. aggregated_score_raw, with a Theil-Sen fit line
    and the Spearman correlation when a model has more than one point.

    Models are ordered by calibrated mean, descending.

    Args:
        run_data: Must contain "raw_model_stats" and "calibrated_model_stats"
            (per-model dicts with "mean" and "ci95"); "results" is optional.
        judge_model: Judge identifier; mapped through MODEL_NAME_REPLACEMENTS for
            display and used (sanitised) in the output file names.
        samples_data: Per-model sample texts, read at
            samples_data[model]["samples"][iteration][item].

    Side effects:
        Creates results/charts/ if needed, writes judgemark_3chart_<judge>.png and
        judgemark_scattergrid_<judge>.png there, and sets the global matplotlib
        rcParams font size to 14.

    Raises:
        KeyError: If the stats keys are missing, or a model in raw_model_stats has no
            entry in calibrated_model_stats.
    """
    import os  # local import: this module's top-level imports do not include os

    # -------------------------------------------------------------------
    # 1) The main (raw / calibrated / heatmap) figure
    # -------------------------------------------------------------------
    raw_stats = run_data["raw_model_stats"]
    cal_stats = run_data["calibrated_model_stats"]
    if judge_model in MODEL_NAME_REPLACEMENTS:
        judge_model = MODEL_NAME_REPLACEMENTS[judge_model]

    model_names = list(raw_stats.keys())
    raw_means = [raw_stats[m]["mean"] for m in model_names]
    cal_means = [cal_stats[m]["mean"] for m in model_names]
    raw_cis = [raw_stats[m]["ci95"] for m in model_names]
    cal_cis = [cal_stats[m]["ci95"] for m in model_names]

    # Sort everything by calibrated score, descending
    sorted_indices = np.argsort(cal_means)[::-1]
    model_names = [model_names[i] for i in sorted_indices]
    raw_means = [raw_means[i] for i in sorted_indices]
    cal_means = [cal_means[i] for i in sorted_indices]
    raw_cis = [raw_cis[i] for i in sorted_indices]
    cal_cis = [cal_cis[i] for i in sorted_indices]

    # 1.A) Heatmap data: per-model percentage of criterion scores in each bin
    results = run_data.get("results", {})
    all_scores_by_model = _collect_criterion_scores(results, model_names)

    bins = np.linspace(0, 10, 11)  # 11 edges -> 10 bins; the last bin includes 10
    heatmap_rows = []
    for m in model_names:
        scores = all_scores_by_model[m]
        if scores:
            counts, _ = np.histogram(scores, bins=bins)
            pct = (counts / len(scores)) * 100.0
        else:
            pct = np.zeros(len(bins) - 1, dtype=float)
        heatmap_rows.append(pct)
    heatmap_data = np.array(heatmap_rows, dtype=float)

    # 1.B) Plot the main figure with 3 subplots
    fig1, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 7))
    # Increase the base font size for all text elements (global matplotlib state)
    plt.rcParams.update({'font.size': 14})

    _draw_score_bars(ax1, model_names, raw_means, raw_cis,
                     color='skyblue', xlabel="Raw Scores",
                     title="Raw Model Scores (95% CI)")
    _draw_score_bars(ax2, model_names, cal_means, cal_cis,
                     color='lightgreen', xlabel="Calibrated Scores",
                     title="Calibrated Model Scores (95% CI)")

    # (C) Heatmap; each column is labelled with the lower edge of its bin
    bin_edges = bins[:-1]
    ax3.set_xticks(np.arange(len(bin_edges)))
    ax3.set_xticklabels([str(int(be)) for be in bin_edges])
    im = ax3.imshow(heatmap_data, aspect='auto', origin='upper', cmap='plasma')
    ax3.set_yticks(np.arange(len(model_names)))
    ax3.set_yticklabels(model_names, fontsize=12)
    ax3.set_xlabel("Score Bin (0-10)", fontsize=14)
    ax3.set_title("Per-Criterion Score Distribution (Heatmap)", fontsize=16)
    ax3.tick_params(axis='y', pad=10)

    cbar = fig1.colorbar(im, ax=ax3)
    cbar.set_label("% of Criteria in Bin", fontsize=14)
    cbar.ax.tick_params(labelsize=12)
    cbar.formatter = ticker.PercentFormatter(decimals=1)
    cbar.update_ticks()

    # "/" becomes "__", then anything that is not a word character or "-" becomes "-"
    sanitized_judge = re.sub(r"[^\w\-]", "-", judge_model.replace("/", "__"))
    fig1.suptitle(f"Judgemark: Raw/Calibrated/Heatmap - Judge: {judge_model}", fontsize=20)

    charts_dir = "results/charts"
    os.makedirs(charts_dir, exist_ok=True)  # savefig does not create missing directories

    # Leave room for the suptitle
    fig1.tight_layout(rect=[0, 0.03, 1, 0.95])
    fig1.savefig(f"{charts_dir}/judgemark_3chart_{sanitized_judge}.png",
                 bbox_inches='tight',
                 dpi=150,
                 pad_inches=0.5)
    plt.close(fig1)

    # -------------------------------------------------------------------
    # 2) Second figure: a 4x4 grid of scatter plots (per model),
    #    where X = item length, Y = aggregated_score_raw.
    # -------------------------------------------------------------------
    # The grid holds 16 models: drop the excluded ones, then keep the top 16.
    excluded_models = {"gemini-1.5-pro-001"}
    model_list_for_scatter = [m for m in model_names if m not in excluded_models][:16]

    fig2, axes2 = plt.subplots(4, 4, figsize=(20, 20))
    fig2.suptitle(f"Judgemark: Per-Model Length vs. Score - Judge: {judge_model}", fontsize=18)

    grid_axes = list(axes2.flat)  # row-major order, matching (idx // 4, idx % 4)
    for ax, mname in zip(grid_axes, model_list_for_scatter):
        length_vals, score_vals = _collect_length_score_pairs(mname, results, samples_data)
        _draw_length_scatter(ax, mname, length_vals, score_vals)

    # If we have fewer than 16 models, blank out the remaining subplots
    for ax in grid_axes[len(model_list_for_scatter):]:
        ax.axis("off")

    fig2.tight_layout()
    fig2.savefig(f"{charts_dir}/judgemark_scattergrid_{sanitized_judge}.png",
                 bbox_inches='tight', dpi=200)
    plt.close(fig2)