first commit
@@ -0,0 +1,8 @@
|
||||
.env
|
||||
.vscode
|
||||
backup
|
||||
judgemark_v2_runs.json
|
||||
judgemark_v2.1_runs.json
|
||||
ministral-repetition-results.json
|
||||
*.pyc
|
||||
dev
|
||||
@@ -0,0 +1,55 @@
|
||||
"""Global constants and reference scores for the Judgemark-v2 benchmark."""
|
||||
|
||||
# Stability test configuration.
# Each entry is a 3-tuple of strings. Presumably (model name, iteration key,
# item id) identifying one sample to be re-judged repeatedly -- TODO confirm
# against the stability test that consumes this list.
STABILITY_ITEMS = [
    ("gemma-2b-it", "2", "28"),
    ("Llama-3-70b-chat-hf", "1", "42"),
    ("DeepSeek-R1", "1", "2"),
]
# Presumably the number of repeated judgings per stability item -- TODO confirm.
STABILITY_REPS = 100
|
||||
|
||||
# Reference scores for correlation.
# Maps a test-model name to its reference score. The scoring code pairs each
# model's per-iteration mean judge score with the value here and computes
# Pearson / Kendall correlations; models missing from this table are left out
# of that correlation. Entries that are commented out are excluded the same way.
REFERENCE_MODEL_SCORES = {
    "DeepSeek-R1": 1430,  # TODO: placeholder value, needs to be updated
    "gpt-4o-2024-11-20": 1402,
    "gemini-1.5-pro-002": 1333,
    "gemini-1.5-pro-001": 1276,
    "claude-3-5-sonnet-20240620": 1243,
    "Mistral-Large-Instruct-2411": 1246,
    "claude-3-opus-20240229": 1240,
    #"c4ai-command-r-plus-08-2024": 1236,
    "Llama-3-70b-chat-hf": 1214,
    "claude-3-haiku-20240307": 1163,
    "c4ai-command-r-08-2024": 1159,
    "Mixtral-8x22B-Instruct-v0.1": 1147,
    "Mixtral-8x7B-Instruct-v0.1": 1114,
    "databricks/dbrx-instruct": 1102,
    #"openchat-3.5-1210": 1127,
    "gpt-3.5-turbo-0125": 1099,
    "Llama-2-13b-chat-hf": 1050,
    "gemma-7b-it": 1029,
    "gemma-2b-it": 989,
}
|
||||
|
||||
# Negative criteria markers for score computation.
# Lower-case substrings naming criteria where a HIGH judge score is bad.
# NOTE(review): core scoring's compute_raw_score carries its own copy of this
# exact list; keep the two in sync (or have it import this constant).
NEGATIVE_MARKERS = [
    "melodramatic", "shallow resolution", "unearned resolution",
    "simplistic moralizing", "shallow optimism", "forced optimism",
    "trite", "overwrought", "amateurish", "contrived", "uninspiring",
    "characters are too good", "incongruent ending positivity",
    "unearned transformations", "profundity over-reach",
    "amateurish descriptives", "clunky asides", "stilted dialogue",
    "tit-for-tat dialogue", "purple prose", "uncreative", "tell-don't-show",
    "weak dialogue", "meandering"
]
|
||||
|
||||
# Maps a provider-prefixed model identifier (left) to a replacement name (right).
# NOTE(review): presumably applied to judge-model names for display/reporting;
# the consumer is not visible from this module -- confirm before relying on it.
# NOTE(review): the replacement for the Claude 3.5 Sonnet entry is spelled
# "claude-3.5-sonnet-20240620" while REFERENCE_MODEL_SCORES uses
# "claude-3-5-sonnet-20240620" -- verify whether these are meant to match.
MODEL_NAME_REPLACEMENTS = {
    "mistralai/ministral-3b": "ministral/Ministral-3b-instruct",
    "openai/gpt-4o-mini": "gpt-4o-mini",
    "deepseek/deepseek-r1": "deepseek-ai/deepseek-r1",
    "anthropic/claude-3.5-haiku-20241022": "claude-3.5-haiku-20241022",
    "anthropic/claude-3.5-sonnet-20240620": "claude-3.5-sonnet-20240620",
    "openai/gpt-4o-2024-11-20": "gpt-4o-2024-11-20",
    "deepseek/deepseek-r1-distill-llama-70b": "deepseek-ai/deepseek-r1-distill-llama-70b",
    "mistralai/mistral-large-2411": "mistralai/mistral-large-instruct-2411",
}
|
||||
@@ -0,0 +1,514 @@
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
import time
|
||||
import signal
|
||||
import logging
|
||||
import threading
|
||||
import concurrent.futures
|
||||
from datetime import datetime
|
||||
from typing import Dict, List
|
||||
from tqdm import tqdm
|
||||
from collections import defaultdict
|
||||
|
||||
from utils.file_io import load_json_file, save_json_file
|
||||
from utils.api import send_to_judge_model
|
||||
from utils.visualization import create_side_by_side_score_charts
|
||||
import statistics
|
||||
from core.scoring import (
|
||||
parse_scores, compute_raw_score, compute_detailed_distribution,
|
||||
compute_model_level_stats, compute_cross_model_stats,
|
||||
build_landmark_calibration_config, apply_landmark_calibration,
|
||||
log_score_summary, confidence_interval_95
|
||||
)
|
||||
from core.scoring import compute_detailed_distribution, compute_detailed_distribution # etc
|
||||
from core.separability import compute_separability_metrics
|
||||
from core.stability import run_stability_test, compute_iteration_stability, compute_randomized_iteration_rank_stability_by_item
|
||||
from utils.stats import clamp
|
||||
from utils.state import should_exit, executor
|
||||
from utils.stats import normalize
|
||||
|
||||
def process_sample(model_name: str, iteration_key: str, item_id: str, item_text: str,
                   prompt_template: str, run_key: str, runs: Dict, runs_file: str,
                   lock: threading.Lock, judge_model: str, save_raw_judge_output: bool):
    """Judge one sample and persist the outcome into ``runs`` / ``runs_file``.

    The sample is skipped when the module-level ``should_exit`` flag is set, or
    when a usable result is already stored (an ``aggregated_score_raw`` greater
    than 0.0 together with at least 10 parsed scores). Otherwise the judge
    prompt is built from ``prompt_template``, sent to ``judge_model``, parsed
    and stored under ``runs[run_key]["results"][model_name][iteration_key][item_id]``.

    Args:
        model_name: Name of the model whose output is being judged.
        iteration_key: Key of the iteration the sample belongs to.
        item_id: Identifier of the sample inside the iteration.
        item_text: Text of the model output to be judged.
        prompt_template: Judge prompt containing the ``[TEST MODEL RESPONSE]``
            placeholder.
        run_key: Key of the current run inside ``runs``.
        runs: Shared, mutable dictionary holding every run; mutated in place.
        runs_file: Path handed to ``save_json_file`` after each update.
        lock: Lock guarding every access to ``runs`` and to ``runs_file``.
        judge_model: Identifier of the judge model.
        save_raw_judge_output: When true, the raw judge response is stored too.

    Returns:
        None. Any exception is logged and recorded both on the item and in
        ``runs[run_key]["errors"]`` rather than propagated.
    """
    if should_exit:
        return

    text_len = len(item_text)

    # Navigate/create the nested result dicts only while holding the lock.
    # Other workers call save_json_file(runs, ...) under this same lock, and
    # inserting keys into dicts that are concurrently being written out
    # (presumably iterated by a JSON dump) can fail or persist a torn state.
    with lock:
        run_data = runs.get(run_key, {})
        results = run_data.get("results", {})
        model_dict = results.setdefault(model_name, {})
        iteration_dict = model_dict.setdefault(iteration_key, {})
        existing_item = iteration_dict.get(item_id, {})

    # A complete, non-zero result already exists: nothing to redo.
    if (existing_item and
            "aggregated_score_raw" in existing_item and
            existing_item.get("parsed_scores") and
            len(existing_item["parsed_scores"]) >= 10 and
            existing_item["aggregated_score_raw"] > 0.0):
        return

    try:
        final_prompt = prompt_template.replace("[TEST MODEL RESPONSE]", item_text)
        final_prompt = final_prompt.replace("[TEST MODEL RESPONSE END]", "")

        messages = [{"role": "user", "content": final_prompt}]
        judge_response = send_to_judge_model(messages, judge_model=judge_model)

        extracted_scores = parse_scores(judge_response)
        # None when too few valid scores could be parsed.
        raw_score = compute_raw_score(extracted_scores)

        with lock:
            storage_dict = {
                "parsed_scores": extracted_scores,
                "timestamp": datetime.now().isoformat(),
                "text_length": text_len
            }
            if raw_score is not None:
                storage_dict["aggregated_score_raw"] = raw_score
            if save_raw_judge_output:
                storage_dict["judge_response"] = judge_response

            iteration_dict[item_id] = storage_dict
            runs[run_key]["results"][model_name][iteration_key] = iteration_dict
            save_json_file(runs, runs_file)

        if raw_score is not None:
            logging.debug(f"Processed {model_name}/{iteration_key}/{item_id}, raw score: {raw_score:.2f}")
        else:
            logging.warning(f"Failed to parse enough scores for {model_name}/{iteration_key}/{item_id}")

    except Exception as e:
        # Best-effort: record the failure so the item is retried on the next run.
        logging.error(f"Error processing item {model_name}/{iteration_key}/{item_id}: {str(e)}")
        with lock:
            iteration_dict[item_id] = {
                "error": str(e),
                "timestamp": datetime.now().isoformat()
            }
            if "errors" not in runs[run_key]:
                runs[run_key]["errors"] = []
            runs[run_key]["errors"].append({
                "model": model_name,
                "iteration": iteration_key,
                "item_id": item_id,
                "error": str(e)
            })
            save_json_file(runs, runs_file)
|
||||
|
||||
def _compute_final_judgemark(run_data: dict, label: str, randomized_tau: float) -> float:
    """Fold one score family's normalized components into a final Judgemark score.

    Args:
        run_data: Run dictionary; must already hold ``f"{label}_cross_model_stats"``,
            ``f"{label}_model_stats"`` and ``separability_metrics[label]``.
        label: ``"raw"`` or ``"calibrated"``.
        randomized_tau: Randomized iteration rank-stability Kendall's tau for
            this score family.

    Returns:
        The final score. As side effects, ``run_data[f"{label}_score_range"]``
        and ``run_data[f"final_judgemark_score_elements_{label}"]`` are written
        and three entries are added to the family's ``normalized_components``.
    """
    norm = run_data[f"{label}_cross_model_stats"]["normalized_components"]
    separability_metrics = run_data["separability_metrics"][label]
    model_stats = run_data[f"{label}_model_stats"]

    emd_norm = normalize(separability_metrics["emd"]["average"], 0, 4)
    # The overlap metric is the only one normalized with a 4th positional
    # argument of False (presumably "lower is better" -- confirm in normalize).
    norm["ci99_overlap_magnitude_sum_norm"] = normalize(
        separability_metrics["ci99_overlap_magnitude_sum"], 0, 26, False
    )

    # Range of the per-model mean scores.
    model_means = [stats["mean"] for stats in model_stats.values()]
    score_range = max(model_means) - min(model_means)
    run_data[f"{label}_score_range"] = score_range
    norm[f"{label}_score_range_norm"] = normalize(score_range, 0, 8)

    # Kendall's tau from the randomization-based stability measure.
    norm["kendall_tau_bootstrapped"] = normalize(randomized_tau, 0.4, 1.0)

    # Aggregated separability metric: plain average of six components.
    separability = (
        norm["std_dev"]                              # std deviation *between* models
        + norm["kw_stat"]                            # kruskal-wallis
        + norm["ci99_overlap_magnitude_sum_norm"]    # CI overlap between adjacently ranked models
        + norm[f"{label}_score_range_norm"]          # range of assigned scores
        + separability_metrics["modulated_ci95"]     # average ci95 per model scored
        + emd_norm                                   # earth-movers distance
    ) / 6.0

    final_score = (
        norm["kendall_tau_bootstrapped"]   # correlation between iterations (ranking stability)
        + norm["kendall_tau"]              # correlation with lmsys arena score (corr to human pref)
        + 4 * separability                 # aggregate of separability metrics
    ) / 6.0

    run_data[f"final_judgemark_score_elements_{label}"] = {
        "norm_stability_between_iterations": norm["kendall_tau_bootstrapped"],
        "norm_correlation_with_lmsys_arena": norm["kendall_tau"],
        "norm_std_dev_between_models": norm["std_dev"],
        "norm_kruskall_wallis": norm["kw_stat"],
        "norm_ci99_adjacent_overlap": norm["ci99_overlap_magnitude_sum_norm"],
        "norm_score_range": norm[f"{label}_score_range_norm"],
        "norm_intra_model_ci95": separability_metrics["modulated_ci95"],
        # Fix: the calibrated branch used to store the whole ``norm`` dict here.
        "norm_earth_movers_distance": emd_norm
    }
    return final_score


def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data: dict):
    """
    Compute metrics for both raw and calibrated scores, including stability
    measures, normalized components, and detailed distributions.

    Everything is written into ``runs[run_key]`` in place: score distributions,
    the calibration config, per-item ``aggregated_score_calibrated`` values,
    model-level and cross-model statistics, separability metrics, and the final
    Judgemark score for BOTH raw (``final_judgemark_score_raw``) and calibrated
    (``final_judgemark_score``) statistics. Charts are created and a summary is
    logged at the end.

    Args:
        runs: Dictionary of all runs.
        run_key: Key of the run to finalize; must exist in ``runs``.
        samples_data: Sample texts, read as
            ``samples_data[model]["samples"][iteration][item_id]`` to obtain the
            text length of every scored item.

    Returns:
        None.
    """
    run_data = runs[run_key]
    results = run_data.get("results", {})

    # 1. Collect raw scores, compute calibration, store calibrated values
    raw_scores_by_model_all = defaultdict(list)
    raw_scores_by_model_by_iter = defaultdict(lambda: defaultdict(list))
    calibrated_scores_by_model_all = defaultdict(list)
    calibrated_scores_by_model_by_iter = defaultdict(lambda: defaultdict(list))
    lengths_by_model = {}

    # -- Collect raw scores
    for model_name, iteration_data in results.items():
        if not isinstance(iteration_data, dict):
            continue

        lengths = []
        for it_key, it_val in iteration_data.items():
            if it_key == "__model_stats__":
                continue
            if not isinstance(it_val, dict):
                continue

            for item_id, item_info in it_val.items():
                if (isinstance(item_info, dict) and
                        "aggregated_score_raw" in item_info):
                    raw_score = item_info["aggregated_score_raw"]

                    # Collect raw score globally and by iteration
                    raw_scores_by_model_all[model_name].append(raw_score)
                    raw_scores_by_model_by_iter[model_name][it_key].append(raw_score)

                    # Track text length for the length-correlation analysis
                    text = (samples_data.get(model_name, {})
                            .get("samples", {})
                            .get(it_key, {})
                            .get(item_id, ""))
                    lengths.append(len(text))

        # Membership test rather than indexing: indexing a defaultdict would
        # insert an empty list for a model without any scored item, and the
        # statistics below cannot be computed on an empty list.
        if model_name in raw_scores_by_model_all:
            lengths_by_model[model_name] = lengths

    # 2. Distribution + calibration
    all_raw_scores = [s for scores in raw_scores_by_model_all.values() for s in scores]
    run_data["raw_score_distribution"] = compute_detailed_distribution(all_raw_scores)

    calibration_config = build_landmark_calibration_config(all_raw_scores, [0, 3, 5, 7, 10])
    run_data["calibration_config"] = calibration_config

    # Apply calibration
    for model_name, iteration_data in results.items():
        if not isinstance(iteration_data, dict):
            continue
        if model_name not in raw_scores_by_model_all:
            # No scored items: skip so no empty entries reach the stats.
            continue

        # Re-walk iteration_data in the same order as the collection pass so
        # each calibrated value lands on the item its raw score came from.
        for it_key, it_val in iteration_data.items():
            if it_key == "__model_stats__":
                continue
            if not isinstance(it_val, dict):
                continue
            for item_id, item_info in it_val.items():
                if (isinstance(item_info, dict) and
                        "aggregated_score_raw" in item_info):
                    calibrated_score = apply_landmark_calibration(
                        item_info["aggregated_score_raw"], calibration_config
                    )
                    item_info["aggregated_score_calibrated"] = calibrated_score
                    calibrated_scores_by_model_all[model_name].append(calibrated_score)
                    calibrated_scores_by_model_by_iter[model_name][it_key].append(calibrated_score)

    # 3. Calibrated distributions
    all_calibrated_scores = [
        s for scores in calibrated_scores_by_model_all.values() for s in scores
    ]
    run_data["calibrated_score_distribution"] = compute_detailed_distribution(all_calibrated_scores)

    # 4. Model-level stats
    run_data["raw_model_stats"] = compute_model_level_stats(raw_scores_by_model_all, lengths_by_model)
    run_data["calibrated_model_stats"] = compute_model_level_stats(calibrated_scores_by_model_all, lengths_by_model)

    # 5. Cross-model stats
    run_data["raw_cross_model_stats"] = compute_cross_model_stats(
        scores_by_model_all=raw_scores_by_model_all,
        scores_by_model_by_iter=raw_scores_by_model_by_iter
    )
    run_data["calibrated_cross_model_stats"] = compute_cross_model_stats(
        scores_by_model_all=calibrated_scores_by_model_all,
        scores_by_model_by_iter=calibrated_scores_by_model_by_iter
    )

    # 6. Separability metrics
    compute_separability_metrics(run_data, raw_scores_by_model_all, label="raw")
    compute_separability_metrics(run_data, calibrated_scores_by_model_all, label="calibrated")

    # 7. Compute iteration stability for raw & calibrated
    compute_iteration_stability(run_data, label="raw")
    compute_iteration_stability(run_data, label="calibrated")
    random_tau_raw = compute_randomized_iteration_rank_stability_by_item(run_data, label="raw", n_shuffles=1000)
    random_tau_cal = compute_randomized_iteration_rank_stability_by_item(run_data, label="calibrated", n_shuffles=1000)
    logging.info("Score stability (RAW)")
    logging.info(f"Randomized average Kendall's tau (raw): {random_tau_raw:.3f}")
    logging.info("Score stability (CALIBRATED)")
    logging.info(f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} "
                 f"({run_data['calibrated_cross_model_stats']['kendall_tau']})")

    # 8. Compute the final Judgemark scores (one using raw stats, one using calibrated)
    final_score_raw = _compute_final_judgemark(run_data, "raw", random_tau_raw)
    run_data["final_judgemark_score_raw"] = final_score_raw

    final_score_calibrated = _compute_final_judgemark(run_data, "calibrated", random_tau_cal)
    run_data["final_judgemark_score"] = final_score_calibrated

    # 9. Create visualizations + logs
    create_side_by_side_score_charts(run_data, run_data["judge_model"], samples_data)

    log_score_summary(
        "RAW SCORES",
        run_data["raw_cross_model_stats"],
        run_data["raw_model_stats"]
    )
    log_score_summary(
        "CALIBRATED SCORES",
        run_data["calibrated_cross_model_stats"],
        run_data["calibrated_model_stats"]
    )

    logging.info(f"Final Judgemark (raw) = {final_score_raw:.3f}")
    logging.info(f"Final Judgemark (cal) = {final_score_calibrated:.3f}")
|
||||
|
||||
|
||||
def sanitize_model_name(name: str) -> str:
    """Return ``name`` made safe for use inside a run key.

    Every run of characters other than ASCII letters, digits, ``_`` and ``-``
    is collapsed into a single underscore.
    """
    disallowed_run = re.compile(r'[^a-zA-Z0-9_-]+')
    return disallowed_run.sub('_', name)
|
||||
|
||||
def run_judgemark_v2(
    judge_model: str,
    samples_file: str,
    prompts_file: str,
    runs_file: str,
    num_threads: int,
    run_id: str = None,
    save_raw_judge_output: bool = False
) -> str:
    """Run (or resume) a Judgemark-v2 evaluation and return its run key.

    Loads the runs, samples and judge prompts from their JSON files, works out
    which samples still lack a usable result, judges those on a thread pool via
    ``process_sample`` and, unless interrupted, computes the final statistics
    with ``finalize_scores_and_compute_judgemark``. The runs file is saved when
    a new run is created and again at the end.

    Args:
        judge_model: Identifier of the judge model.
        samples_file: Path of the JSON file with the model outputs to judge.
        prompts_file: Path of the JSON file with judge prompts keyed by item id.
        runs_file: Path of the JSON file holding all runs (read and written).
        num_threads: Maximum number of worker threads.
        run_id: Optional id of a run to resume; a random UUID is used when falsy.
        save_raw_judge_output: Forwarded to ``process_sample``.

    Returns:
        The run key, ``"<run id>__<sanitized judge model>"``.
    """
    # NOTE(review): both names are imported with ``from utils.state import ...``;
    # rebinding them here changes this module's globals only -- confirm that is
    # what the code setting/reading them elsewhere expects.
    global executor, should_exit

    logging.info(f"Starting Judgemark-v2 using judge model: {judge_model}")
    runs = load_json_file(runs_file)

    # Form the run key using run_id + "__" + sanitized judge model
    sanitized_jm = sanitize_model_name(judge_model)
    base_id = run_id if run_id else str(uuid.uuid4())
    run_key = f"{base_id}__{sanitized_jm}"

    # Load data files
    samples_data = load_json_file(samples_file)
    judge_prompts = load_json_file(prompts_file)

    # Initialize or get existing run data
    if run_key not in runs:
        runs[run_key] = {
            "judge_model": judge_model,
            "start_time": datetime.now().isoformat(),
            "status": "running",
            "samples_file": samples_file,
            "prompts_file": prompts_file,
            "results": {}
        }
        save_json_file(runs, runs_file)

    run_data = runs[run_key]
    items_to_process = []

    # If run exists, scan for items needing retry.
    # A run created just above always has a "results" key, so this branch also
    # handles brand-new runs (every item then counts as needing processing).
    if "results" in run_data:
        results = run_data.get("results", {})

        # Scan all possible items
        for model_name, model_info in samples_data.items():
            samples_dict = model_info.get("samples", {})
            for iteration_key, iteration_items in samples_dict.items():
                for item_id, item_text in iteration_items.items():
                    # Check if this item needs processing
                    existing_result = (results.get(model_name, {})
                                       .get(iteration_key, {})
                                       .get(item_id, {}))

                    # Missing, errored, under-parsed (< 10 scores) or
                    # zero-scored results are all judged again.
                    needs_retry = (
                        not existing_result or
                        not existing_result.get("parsed_scores") or
                        len(existing_result.get("parsed_scores", {})) < 10 or
                        existing_result.get("aggregated_score_raw", 0.0) == 0.0 or
                        "error" in existing_result
                    )

                    if needs_retry:
                        items_to_process.append({
                            "model_name": model_name,
                            "iteration_key": iteration_key,
                            "item_id": item_id,
                            "item_text": item_text,
                            # An item without a prompt gets an empty template.
                            "prompt_template": judge_prompts.get(item_id, "")
                        })

        if items_to_process:
            logging.info(f"Found {len(items_to_process)} items to process in existing run {run_key}")
        else:
            logging.info(f"No items to process in existing run {run_key}")

    else:
        # New run - process all items
        # Only reached for a pre-existing run entry that has no "results" key.
        for model_name, model_info in samples_data.items():
            samples_dict = model_info.get("samples", {})
            for iteration_key, iteration_items in samples_dict.items():
                print(iteration_key)
                for item_id, item_text in iteration_items.items():
                    items_to_process.append({
                        "model_name": model_name,
                        "iteration_key": iteration_key,
                        "item_id": item_id,
                        "item_text": item_text,
                        "prompt_template": judge_prompts.get(item_id, "")
                    })

    # Ensure concurrency lock (shared by every worker for ``runs`` / file writes)
    lock = threading.Lock()

    # Process any items that need retrying
    all_futures = []
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as exec_:
            # Published in the module global so it can be shut down below.
            executor = exec_

            if items_to_process:
                # Process all items (either retries or new run)
                for item in items_to_process:
                    if should_exit:
                        break

                    all_futures.append(
                        executor.submit(
                            process_sample,
                            item["model_name"],
                            item["iteration_key"],
                            item["item_id"],
                            item["item_text"],
                            item["prompt_template"],
                            run_key,
                            runs,
                            runs_file,
                            lock,
                            judge_model,
                            save_raw_judge_output
                        )
                    )

                # Display progress bar for tasks
                for f in tqdm(concurrent.futures.as_completed(all_futures),
                              total=len(all_futures), desc="Judging", leave=True):
                    if should_exit:
                        break
                    try:
                        f.result()
                    except Exception as exc:
                        logging.error(f"Exception in worker thread: {exc}")

    except KeyboardInterrupt:
        logging.warning("KeyboardInterrupt caught in main thread.")
        should_exit = True
        time.sleep(0.1)
    finally:
        # Mark run as interrupted or completed
        status = "interrupted" if should_exit else "completed"
        runs[run_key]["status"] = status
        runs[run_key]["end_time"] = datetime.now().isoformat()

        if not should_exit:
            # Run stability test (currently disabled by the constant condition)
            if False:
                run_stability_test(
                    run_data, judge_model,
                    judge_prompts, samples_data,
                    runs, runs_file,
                    lock, num_threads
                )
            # Compute final stats
            finalize_scores_and_compute_judgemark(runs, run_key, samples_data)

        # Save final
        save_json_file(runs, runs_file)

        if executor:
            logging.info("Shutting down executor")
            executor.shutdown(wait=False)
            executor = None

    logging.info(f"Judgemark-v2 run {run_key} ended with status: {status}")
    return run_key
|
||||
@@ -0,0 +1,275 @@
|
||||
import math
|
||||
import statistics
|
||||
import numpy as np
|
||||
import scipy.stats
|
||||
import logging
|
||||
from typing import Dict, List
|
||||
import re
|
||||
from config.constants import REFERENCE_MODEL_SCORES
|
||||
from utils.stats import normalize
|
||||
|
||||
def parse_scores(judge_model_response: str) -> Dict[str,float]:
    """
    Pull every ``<metric name>: <score>`` pair out of a judge response.

    A pair is some text, a colon, optional whitespace, an optional literal
    ``Score`` followed by whitespace, and then a possibly negative integer or
    decimal number. The metric name is the stripped text before the colon and
    cannot span lines. Example matches:
        "Realism Score: 7.5"
        "Melodramatic: 2"

    Returns a dict of metric name -> float; an empty dict when nothing matches.
    When a metric name occurs more than once, the last occurrence wins.
    """
    score_line = r'(.*?):\s*(?:Score\s+)?(-?\d+(?:\.\d+)?)'
    return {
        found.group(1).strip(): float(found.group(2))
        for found in re.finditer(score_line, judge_model_response)
    }
|
||||
|
||||
def compute_raw_score(scores: Dict[str,float]) -> float:
    """
    Collapse a dict of {criteria: numeric score} into one raw score on a 0-10 scale.

    Scores outside the inclusive range 0..10 are discarded. If fewer than 10
    scores remain, None is returned. Criteria whose lower-cased name contains
    one of the negative markers are inverted (10 - score) before averaging; the
    average is rounded to two decimals.
    """
    inverted_markers = (
        "melodramatic", "shallow resolution", "unearned resolution",
        "simplistic moralizing", "shallow optimism", "forced optimism",
        "trite", "overwrought", "amateurish", "contrived", "uninspiring",
        "characters are too good", "incongruent ending positivity",
        "unearned transformations", "profundity over-reach",
        "amateurish descriptives", "clunky asides", "stilted dialogue",
        "tit-for-tat dialogue", "purple prose", "uncreative", "tell-don't-show",
        "weak dialogue", "meandering",
    )

    in_range = [(name, value) for name, value in scores.items() if 0 <= value <= 10]
    if len(in_range) < 10:
        return None

    total = 0.0
    for name, value in in_range:
        label = name.lower().strip()
        is_negative = any(marker in label for marker in inverted_markers)
        total += (10 - value) if is_negative else value

    return round(total / len(in_range), 2)
|
||||
|
||||
def confidence_interval_95(data: List[float]) -> float:
    """
    Return the half-width of the 95% confidence interval for the mean of ``data``.

    Uses the normal approximation CI95 = 1.96 * (stdev / sqrt(n)) with the
    sample standard deviation, which is reasonable for n > 30 or so.

    Args:
        data: Observations.

    Returns:
        The half-width, or 0.0 when there are fewer than two observations
        (the sample standard deviation is undefined in that case).
    """
    n = len(data)
    if n < 2:
        return 0.0
    # n >= 2 is guaranteed here, so the sample stdev is always defined; the
    # former population-stdev fallback for n == 1 could never be reached.
    return 1.96 * (statistics.stdev(data) / math.sqrt(n))
|
||||
|
||||
def compute_detailed_distribution(scores):
    """Summarise ``scores`` as count, extremes, centre, spread and percentiles.

    Returns an empty dict for empty input. Every numeric value is rounded to
    three decimals; the standard deviation is the sample stdev, or 0.0 when
    there is only one score.
    """
    if not scores:
        return {}

    def rounded_percentile(q):
        # One numpy percentile lookup, rounded like every other field.
        return round(float(np.percentile(scores, q)), 3)

    spread = statistics.stdev(scores) if len(scores) > 1 else 0.0

    summary = {
        "count": len(scores),
        "min": round(min(scores), 3),
        "max": round(max(scores), 3),
        "mean": round(statistics.mean(scores), 3),
        "median": round(statistics.median(scores), 3),
        "stdev": round(spread, 3),
    }
    for key, q in (("p10", 10), ("p25", 25), ("p75", 75), ("p90", 90)):
        summary[key] = rounded_percentile(q)
    return summary
|
||||
|
||||
def compute_model_level_stats(scores_by_model, lengths_by_model):
    """Compute descriptive statistics for every model's list of scores.

    Args:
        scores_by_model: Mapping of model name -> list of scores.
        lengths_by_model: Mapping of model name -> list of text lengths, meant
            to be aligned item-by-item with that model's scores.

    Returns:
        Mapping of model name -> dict with ``count``, ``mean``, ``median``,
        ``stdev`` (sample stdev, 0.0 for a single score), ``ci95``, ``min`` and
        ``max``. ``length_correlation`` (Pearson r between text length and
        score) is added only when lengths and scores have the same size and
        there are at least two of them. Models without any score are skipped
        with a warning, since none of the statistics is defined for them.
    """
    model_stats = {}
    for model_name, scores in scores_by_model.items():
        if not scores:
            logging.warning("Skipping model-level stats for %s: no scores", model_name)
            continue

        # A model may legitimately have no recorded lengths; treat as "none".
        lengths = lengths_by_model.get(model_name, [])
        stats = {
            "count": len(scores),
            "mean": statistics.mean(scores),
            "median": statistics.median(scores),
            "stdev": statistics.stdev(scores) if len(scores) > 1 else 0.0,
            "ci95": confidence_interval_95(scores),
            "min": min(scores),
            "max": max(scores)
        }

        # Length correlation -- pearsonr rejects inputs shorter than two points.
        if len(lengths) == len(scores) and len(scores) >= 2:
            corr, _ = scipy.stats.pearsonr(lengths, scores)
            stats["length_correlation"] = corr

        model_stats[model_name] = stats
    return model_stats
|
||||
|
||||
def compute_cross_model_stats(scores_by_model_all, scores_by_model_by_iter):
    """
    Compute statistics that compare models with each other.

    One-way ANOVA, Kruskal-Wallis and the population standard deviation of the
    model means are computed over all scores of every model. Pearson and
    Kendall correlations against REFERENCE_MODEL_SCORES are computed once per
    iteration (from each model's mean score in that iteration) and then
    averaged; an iteration with fewer than two models that have a reference
    score contributes 0.0 to both averages.

    Returns a dict with the raw statistics plus a ``normalized_components``
    sub-dict holding their normalized counterparts.
    """
    # ANOVA / Kruskal-Wallis over the complete score list of every model
    per_model_scores = list(scores_by_model_all.values())
    f_stat, f_p = scipy.stats.f_oneway(*per_model_scores)
    kw_stat, kw_p = scipy.stats.kruskal(*per_model_scores)

    # Spread of the model means (over the entire distribution)
    std_across_models = statistics.pstdev(
        [statistics.mean(model_scores) for model_scores in per_model_scores]
    )

    # Union of the iteration keys seen for any model
    all_iterations = set()
    for per_iter in scores_by_model_by_iter.values():
        all_iterations.update(per_iter.keys())

    pearson_per_iter = []
    kendall_per_iter = []
    for iteration in all_iterations:
        # (iteration mean, reference score) for each model that has scores in
        # this iteration and a known reference score
        paired = [
            (statistics.mean(per_iter[iteration]), REFERENCE_MODEL_SCORES[model])
            for model, per_iter in scores_by_model_by_iter.items()
            if iteration in per_iter
            and len(per_iter[iteration]) > 0
            and model in REFERENCE_MODEL_SCORES
        ]

        if len(paired) < 2:
            # Not enough models for a correlation in this iteration
            iter_pearson, iter_kendall = (0.0, 0.0)
        else:
            judge_means, reference_scores = zip(*paired)
            iter_pearson, _ = scipy.stats.pearsonr(judge_means, reference_scores)
            iter_kendall, _ = scipy.stats.kendalltau(judge_means, reference_scores)

        pearson_per_iter.append(iter_pearson)
        kendall_per_iter.append(iter_kendall)

    # Final correlation = average across the iteration-level correlations
    pearson_r = statistics.mean(pearson_per_iter) if pearson_per_iter else 0.0
    kendall_tau = statistics.mean(kendall_per_iter) if kendall_per_iter else 0.0

    normalized_components = {
        "pearson_r": normalize(pearson_r, 0.7, 1.0),
        "kendall_tau": normalize(kendall_tau, 0.1, 1.0),
        "anova_f": normalize(f_stat, 0.0, 350.0),
        "kw_stat": normalize(kw_stat, 0.0, 1500.0),
        "std_dev": normalize(std_across_models, 0.0, 2.2)
    }
    return {
        "anova_f": f_stat,
        "anova_p": f_p,
        "kw_stat": kw_stat,
        "kw_p": kw_p,
        "std_dev_across_models": std_across_models,
        "pearson_r": pearson_r,
        "kendall_tau": kendall_tau,
        "normalized_components": normalized_components
    }
|
||||
|
||||
def build_landmark_calibration_config(scores, desired_points=None):
    """
    Build a piecewise-linear calibration config from a raw score distribution.

    The five input landmarks [min, Q1, median, Q3, max] of ``scores`` are
    paired with ``desired_points`` (default [0, 3, 5, 7, 10]) as output
    landmarks. With fewer than two scores there is no meaningful distribution,
    and a config with empty landmark lists is returned instead.
    """
    config = {"method": "piecewise_landmark"}

    if not scores or len(scores) < 2:
        # Degenerate case: no meaningful distribution
        config["in_landmarks"] = []
        config["out_landmarks"] = []
        return config

    targets = [0, 3, 5, 7, 10] if desired_points is None else desired_points

    lower_quartile = float(np.percentile(scores, 25))
    middle = float(statistics.median(scores))
    upper_quartile = float(np.percentile(scores, 75))

    config["in_landmarks"] = [min(scores), lower_quartile, middle, upper_quartile, max(scores)]
    config["out_landmarks"] = targets
    return config
|
||||
|
||||
def apply_landmark_calibration(x, config):
    """
    Map ``x`` through the piecewise-linear transform described by ``config``.

    ``config["in_landmarks"]`` is [min, q1, median, q3, max] of the raw scale
    and ``config["out_landmarks"]`` holds the five matching target values.
    Values below the minimum or above the maximum are extrapolated along the
    first or last segment respectively.

    Args:
        x: Raw value to transform.
        config: Dict holding the two landmark lists.

    Returns:
        The calibrated value, or ``x`` unchanged when either landmark list
        does not contain exactly five entries.
    """
    src = config.get("in_landmarks", [])
    dst = config.get("out_landmarks", [])
    if not (len(src) == 5 and len(dst) == 5):
        # Invalid or degenerate config: pass the value through untouched.
        return x

    # Segment k spans src[k] -> src[k + 1]. The first interior landmark that
    # is >= x selects the segment; anything above q3 uses the last segment.
    segment = next((k for k in range(3) if x <= src[k + 1]), 3)

    src_lo, src_hi = src[segment], src[segment + 1]
    dst_lo, dst_hi = dst[segment], dst[segment + 1]

    width = src_hi - src_lo
    if abs(width) < 1e-12:
        # Collapsed segment (equal landmarks): avoid dividing by ~zero.
        return dst_lo
    fraction = (x - src_lo) / width
    return dst_lo + fraction * (dst_hi - dst_lo)
|
||||
|
||||
def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict):
    """
    Emit a human-readable summary of score statistics at INFO level.

    Args:
        score_type: Label placed in the summary header.
        cross_stats: Dict providing "anova_f", "anova_p", "kw_stat", "kw_p",
            "pearson_r", "kendall_tau" and "std_dev_across_models".
        model_stats: Dict of model name -> dict providing "mean" and "ci95".
    """
    emit = logging.info

    emit(f"\n------- {score_type} Summary -------")
    emit(f"ANOVA F-value: {cross_stats['anova_f']:.4f}, p={cross_stats['anova_p']:.4f}")
    emit(f"Kruskal-Wallis: {cross_stats['kw_stat']:.4f}, p={cross_stats['kw_p']:.4f}")
    emit(f"Pearson r={cross_stats['pearson_r']:.4f}")
    emit(f"Kendall τ={cross_stats['kendall_tau']:.4f}")
    emit(f"Std.Dev across models: {cross_stats['std_dev_across_models']:.4f}")

    emit("\nModel Scores:")
    # Best mean first; the sort is stable, so equal means keep input order.
    ranked_names = sorted(
        model_stats,
        key=lambda name: model_stats[name]["mean"],
        reverse=True,
    )
    for model in ranked_names:
        stats = model_stats[model]
        emit(f"{model:.<40} {stats['mean']:.3f} ±{stats['ci95']:.3f}")
    emit("------------------------------------")
|
||||
@@ -0,0 +1,233 @@
|
||||
|
||||
import logging
|
||||
import math
|
||||
import statistics
|
||||
import numpy as np
|
||||
import scipy.stats
|
||||
from typing import Dict, List
|
||||
from utils.stats import normalize
|
||||
|
||||
try:
|
||||
from scipy.stats import wasserstein_distance
|
||||
HAS_WASSERSTEIN = True
|
||||
except ImportError:
|
||||
HAS_WASSERSTEIN = False
|
||||
|
||||
def cohen_d(scores1: List[float], scores2: List[float]) -> float:
    """
    Compute Cohen's d effect size between two sets of scores.

    d = (mean2 - mean1) / pooled_stdev, where the pooled variance is the
    (n - 1)-weighted average of the two sample variances.

    Args:
        scores1: First sample of scores.
        scores2: Second sample of scores.

    Returns:
        The effect size, positive when ``scores2`` has the larger mean.
        Returns 0.0 when either sample has fewer than two values or when the
        pooled variance is effectively zero.
    """
    n1, n2 = len(scores1), len(scores2)
    if n1 < 2 or n2 < 2:
        return 0.0

    mean1, mean2 = statistics.mean(scores1), statistics.mean(scores2)
    # The pooled formula below weights by (n - 1) and divides by
    # (n1 + n2 - 2), so it needs the Bessel-corrected *sample* variances.
    # Feeding it population variances would shrink the denominator and
    # inflate d.
    var1, var2 = statistics.variance(scores1), statistics.variance(scores2)
    pooled_var = ((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2)
    if pooled_var <= 1e-12:
        # Both samples are (nearly) constant: no scale to standardise by.
        return 0.0
    return (mean2 - mean1) / math.sqrt(pooled_var)
|
||||
|
||||
def ci_interval(scores: List[float], ci_level=0.99) -> tuple[float, float]:
    """
    Return the normal-approximation confidence interval for the mean.

    The interval is mean +/- z * (stdev / sqrt(n)), where z is the two-sided
    standard-normal quantile for ``ci_level`` (about 2.576 for 99%).

    Args:
        scores: Sample values.
        ci_level: Two-sided confidence level in (0, 1).

    Returns:
        (low, high). With a single score both ends equal that score; with no
        scores both ends are 0.0.
    """
    count = len(scores)
    if count < 2:
        # No spread can be estimated from zero or one observation.
        centre = statistics.mean(scores) if count == 1 else 0.0
        return (centre, centre)

    centre = statistics.mean(scores)
    spread = statistics.stdev(scores)
    z_value = scipy.stats.norm.ppf(0.5 + ci_level/2.0)
    margin = z_value * (spread / math.sqrt(count))
    return (centre - margin, centre + margin)
|
||||
|
||||
def ci_intervals_overlap(ci1: tuple[float, float], ci2: tuple[float, float]) -> bool:
    """
    Report whether two (low, high) intervals share at least one point.

    Intervals that merely touch at an endpoint count as overlapping.
    """
    # ci1 lies entirely below ci2.
    if ci1[1] < ci2[0]:
        return False
    # ci2 lies entirely below ci1.
    if ci2[1] < ci1[0]:
        return False
    return True
|
||||
|
||||
def compute_distributions_distance(scores_by_model: Dict[str, List[float]]):
    """
    Compute the Wasserstein (earth mover's) distance for every pair of models.

    A pair gets the sentinel distance -1.0 when the Wasserstein function is
    unavailable (``HAS_WASSERSTEIN`` is false) or either score list is empty.
    Sentinel pairs are excluded from the average.

    Args:
        scores_by_model: Dict of model name -> list of scores.

    Returns:
        {"average": mean distance over usable pairs (0.0 if none),
         "pairs": {"<modelA>__<modelB>": distance}}
    """
    names = list(scores_by_model)
    pair_distances = {}
    usable = []

    for idx, first in enumerate(names):
        for second in names[idx + 1:]:
            if HAS_WASSERSTEIN and scores_by_model[first] and scores_by_model[second]:
                dist = wasserstein_distance(scores_by_model[first], scores_by_model[second])
            else:
                dist = -1.0
            pair_distances[f"{first}__{second}"] = dist
            if dist >= 0.0:
                usable.append(dist)

    average = (sum(usable, 0.0) / len(usable)) if usable else 0.0
    return {
        "average": average,
        "pairs": pair_distances
    }
|
||||
|
||||
def compute_average_ci95(model_scores: Dict[str, List[float]]) -> float:
    """
    Compute the average 95% confidence-interval half-width across models.

    Each model's half-width is 1.96 * stdev / sqrt(n). Models with fewer than
    two scores contribute a half-width of 0.0 to the average.

    Args:
        model_scores: Dict of model name -> list of scores.

    Returns:
        The mean half-width, or 0.0 when ``model_scores`` is empty.
    """
    if not model_scores:
        return 0.0

    z95 = 1.96  # two-sided 95% standard-normal quantile
    half_widths = [
        z95 * (statistics.stdev(scores) / math.sqrt(len(scores)))
        if len(scores) >= 2
        else 0.0
        for scores in model_scores.values()
    ]
    return statistics.mean(half_widths) if half_widths else 0.0
|
||||
|
||||
def compute_separability_metrics(
    run_data: dict,
    scores_by_model: Dict[str, List[float]],
    label: str = "raw",
    scale_factor: float = 1.5,
) -> None:
    """
    Compute separability metrics and store them in
    ``run_data["separability_metrics"][label]`` (replacing any previous entry
    for that label), then log a short summary.

    Models are ranked by mean score (descending). For each pair of neighbours
    in that ranking the function records:
      * whether their 99% confidence intervals overlap,
      * the length of the overlap after widening both intervals by
        ``scale_factor``,
      * the absolute Cohen's d.
    It also stores the pairwise EMD data, the average CI95 half-width, and a
    "modulated" CI95 value: normalized CI95 multiplied by normalized Cohen's d.

    Args:
        run_data: Dictionary that receives the results.
        scores_by_model: Dict of model -> list of scores.
        label: Key under which these results are grouped in run_data.
        scale_factor: Multiplier applied to each interval's half-width before
            measuring overlap magnitude (1.5 widens the half-width by 50%).
    """
    run_data.setdefault("separability_metrics", {})[label] = {}

    def widen(ci: tuple[float, float], factor: float) -> tuple[float, float]:
        """Scale an interval's half-width by ``factor`` about its midpoint."""
        low, high = ci
        mid = (low + high) / 2.0
        half_width = (high - low) / 2.0
        new_half = factor * half_width
        return (mid - new_half, mid + new_half)

    def overlap_length(ciA: tuple[float, float], ciB: tuple[float, float]) -> float:
        """Length of the intersection of two intervals (0.0 if disjoint)."""
        return max(0.0, min(ciA[1], ciB[1]) - max(ciA[0], ciB[0]))

    # 1) Per-model mean and 99% CI; models without scores get zeros.
    model_means = {}
    model_ci99 = {}
    for name, values in scores_by_model.items():
        if values:
            model_means[name] = statistics.mean(values)
            model_ci99[name] = ci_interval(values, ci_level=0.99)
        else:
            model_means[name] = 0.0
            model_ci99[name] = (0.0, 0.0)

    ranked = sorted(model_means, key=model_means.get, reverse=True)
    neighbours = list(zip(ranked, ranked[1:]))

    # 2-4) One pass over adjacent pairs gathers the overlap flag (unscaled),
    #      the scaled overlap magnitude and |Cohen's d|.
    adjacent_overlap = {}
    adjacent_overlap_magnitude = {}
    overlap_count = 0
    sum_overlap_magnitude = 0.0
    d_vals = []
    for upper, lower in neighbours:
        pair_key = f"{upper}__{lower}"

        overlaps = ci_intervals_overlap(model_ci99[upper], model_ci99[lower])
        adjacent_overlap[pair_key] = overlaps
        if overlaps:
            overlap_count += 1

        magnitude = overlap_length(
            widen(model_ci99[upper], scale_factor),
            widen(model_ci99[lower], scale_factor),
        )
        adjacent_overlap_magnitude[pair_key] = magnitude
        sum_overlap_magnitude += magnitude

        d_vals.append(abs(cohen_d(scores_by_model[upper], scores_by_model[lower])))

    adj_frac_overlap = overlap_count / len(neighbours) if neighbours else 0.0
    avg_cohens_d = sum(d_vals) / len(d_vals) if d_vals else 0.0

    # 5) EMD across all pairs.
    emd_data = compute_distributions_distance(scores_by_model)

    # 6) Average CI95 half-width, modulated by the normalized effect size.
    avg_ci95 = compute_average_ci95(scores_by_model)
    norm_ci95 = normalize(avg_ci95, 0.15, 0.45, False)
    norm_cohens_d = normalize(avg_cohens_d, 0, 0.4)
    modulated_ci95 = norm_ci95 * norm_cohens_d

    # Store results (key order matches the order they are listed here).
    run_data["separability_metrics"][label].update({
        "ci99_overlap_adjacent": adjacent_overlap,
        "adjacent_overlap_fraction": adj_frac_overlap,
        "ci99_overlap_magnitude_adjacent": adjacent_overlap_magnitude,
        "ci99_overlap_magnitude_sum": sum_overlap_magnitude,
        "ci99_overlap_scale_factor": scale_factor,
        "average_cohens_d_adjacent": avg_cohens_d,
        "emd": emd_data,
        "average_ci95": avg_ci95,
        "modulated_ci95": modulated_ci95,
    })

    # Logging summary
    logging.info(f"\n--- {label.upper()} SEPARABILITY METRICS ---")
    logging.info(f"Adjacent 99% CI Overlap fraction: {adj_frac_overlap:.3f}")
    logging.info(f"Sum of adjacent 99% CI Overlap magnitude (scale={scale_factor}): "
                 f"{sum_overlap_magnitude:.3f}")
    logging.info(f"Avg. |Cohen's d| for adjacent pairs: {avg_cohens_d:.3f}")
    logging.info(f"Average EMD across all pairs: {emd_data['average']:.3f}")
    logging.info(f"Avg. CI95 half-width: {avg_ci95:.3f} (modulated: {modulated_ci95:.3f})")
|
||||
@@ -0,0 +1,448 @@
|
||||
import logging
|
||||
import concurrent.futures
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from tqdm import tqdm
|
||||
from typing import Dict
|
||||
|
||||
from utils.api import send_to_judge_model
|
||||
from utils.file_io import save_json_file
|
||||
from core.scoring import parse_scores, compute_raw_score
|
||||
from config.constants import STABILITY_ITEMS, STABILITY_REPS
|
||||
from utils.state import should_exit, executor
|
||||
from collections import defaultdict
|
||||
import statistics
|
||||
import math
|
||||
import scipy.stats
|
||||
import random
|
||||
import statistics
|
||||
|
||||
def extract_model_item_scores(run_data: dict, label: str = "raw"):
    """
    Collect per-item scores from ``run_data["results"]``.

    Builds model_item_scores[model][item_id] = {iteration_key: score}, reading
    "aggregated_score_raw" when ``label`` is "raw" and
    "aggregated_score_calibrated" for any other label. Entries that are not
    dicts, and scores that are not int/float, are skipped.

    Returns:
        (model_item_scores, sorted_iteration_keys). The key list contains
        every iteration key whose value is a dict, even if no valid score was
        found under it.
    """
    score_field = (
        "aggregated_score_raw" if label == "raw" else "aggregated_score_calibrated"
    )
    collected = defaultdict(lambda: defaultdict(dict))
    seen_iterations = set()

    for model_name, per_iteration in run_data.get("results", {}).items():
        if not isinstance(per_iteration, dict):
            continue
        for it_key, per_item in per_iteration.items():
            if not isinstance(per_item, dict):
                continue
            seen_iterations.add(it_key)
            for item_id, item_info in per_item.items():
                if not isinstance(item_info, dict):
                    continue
                value = item_info.get(score_field, None)
                if isinstance(value, (int, float)):
                    collected[model_name][item_id][it_key] = value

    return collected, sorted(seen_iterations)
|
||||
|
||||
|
||||
def shuffle_iteration_assignments_by_item(
    model_item_scores,
    iteration_keys
):
    """
    Randomly reassign each item's scores across the iteration keys.

    For every (model, item) whose score map has at least
    ``len(iteration_keys)`` entries, one random permutation is drawn and the
    score originally stored under iteration_keys[perm[k]] is placed under
    iteration_keys[k]. Each iteration key therefore still receives exactly
    one score per item; only which original iteration supplied it changes.
    Items with fewer entries are skipped.

    Args:
        model_item_scores: model -> item_id -> {iteration_key: score}.
        iteration_keys: Ordered list of iteration keys to shuffle across.

    Returns:
        {iteration_key: {model: [scores...]}} as nested defaultdicts; callers
        are expected to reduce each list to a mean afterwards.
    """
    reassigned = defaultdict(lambda: defaultdict(list))
    slot_count = len(iteration_keys)

    for model_name, items in model_item_scores.items():
        for it_score_map in items.values():
            # Incomplete items cannot fill every slot, so leave them out.
            if len(it_score_map) < slot_count:
                continue

            # ordered[i] is the score originally recorded for iteration_keys[i].
            ordered = [it_score_map[key] for key in iteration_keys]

            # source_slots[k] names the original slot whose score moves to slot k.
            source_slots = random.sample(range(slot_count), k=slot_count)
            for target_key, source_idx in zip(iteration_keys, source_slots):
                reassigned[target_key][model_name].append(ordered[source_idx])

    return reassigned
|
||||
|
||||
|
||||
def compute_average_iteration_averages(iteration_averages):
    """
    Reduce {iteration_key: {model: [scores...]}} to
    {iteration_key: {model: mean_score}}.

    A model whose score list is empty gets 0.0.
    """
    return {
        it_key: {
            model: (statistics.mean(score_list) if score_list else 0.0)
            for model, score_list in per_model.items()
        }
        for it_key, per_model in iteration_averages.items()
    }
|
||||
|
||||
|
||||
def compute_kendall_tau_for_iterations(iteration_averages):
    """
    Average Kendall's tau between the model rankings of every iteration pair.

    Args:
        iteration_averages: {iteration_key: {model_name: avg_score}}.

    Returns:
        The mean tau over all iteration pairs that share at least two models
        and yield a non-NaN tau; 0.0 when no such pair exists.
    """
    all_iter_keys = sorted(iteration_averages.keys())

    # Rank positions (0 = highest score) are computed once per iteration, so
    # each pair comparison is a dictionary lookup rather than a list search.
    rank_position = {}
    for it_key in all_iter_keys:
        ranked = sorted(
            iteration_averages[it_key].items(),
            key=lambda kv: kv[1],
            reverse=True,
        )
        rank_position[it_key] = {
            model: position for position, (model, _score) in enumerate(ranked)
        }

    pairwise_taus = []
    for idx, first_key in enumerate(all_iter_keys):
        pos_i = rank_position[first_key]
        for second_key in all_iter_keys[idx + 1:]:
            pos_j = rank_position[second_key]
            # Only models ranked in both iterations can be compared.
            common_models = pos_i.keys() & pos_j.keys()
            if len(common_models) < 2:
                continue
            x = [pos_i[m] for m in common_models]
            y = [pos_j[m] for m in common_models]
            tau, _pval = scipy.stats.kendalltau(x, y)
            if not math.isnan(tau):
                pairwise_taus.append(tau)

    return statistics.mean(pairwise_taus) if pairwise_taus else 0.0
|
||||
|
||||
|
||||
def compute_randomized_iteration_rank_stability_by_item(
    run_data: dict,
    label: str = "raw",
    n_shuffles: int = 1000
) -> float:
    """
    Estimate ranking stability under random reassignment of item scores.

    On each of ``n_shuffles`` rounds, every item's scores are randomly
    permuted across the iteration keys (each iteration keeps one score per
    item), per-iteration model means are recomputed, and the average
    Kendall's tau across iteration pairs is taken. The mean of those
    per-round values is the result.

    The result is stored in
    run_data["iteration_stability"][label]["randomized_average_kendall_tau_by_item"].

    Args:
        run_data: Run dictionary holding "results"; also receives the output.
        label: "raw" or another label selecting the calibrated scores.
        n_shuffles: Number of shuffle rounds.

    Returns:
        The averaged tau. When fewer than two iteration keys exist, 0.0 is
        returned immediately and nothing is written to ``run_data``.
    """
    model_item_scores, iteration_keys = extract_model_item_scores(run_data, label=label)

    # Rank stability needs at least two iterations to compare.
    if len(iteration_keys) < 2:
        return 0.0

    tau_values = []
    for _ in range(n_shuffles):
        shuffled = shuffle_iteration_assignments_by_item(model_item_scores, iteration_keys)
        per_iteration_means = compute_average_iteration_averages(shuffled)
        tau_values.append(compute_kendall_tau_for_iterations(per_iteration_means))

    randomized_avg_tau = statistics.mean(tau_values) if tau_values else 0.0

    # Store alongside any existing stability data for this label.
    label_slot = run_data.setdefault("iteration_stability", {}).setdefault(label, {})
    label_slot["randomized_average_kendall_tau_by_item"] = randomized_avg_tau

    return randomized_avg_tau
|
||||
|
||||
|
||||
def compute_iteration_stability(run_data: dict, label="raw"):
    """
    Compute scoring and ranking stability across iterations.

    Steps:
      1) Average each model's item scores within every iteration.
      2) Scoring stability: per model, the mean and population standard
         deviation of its iteration averages.
      3) Ranking stability: Kendall's tau between the model rankings of every
         pair of iterations, plus the average tau over all pairs.

    Results are written to run_data["iteration_stability"][label]; whatever
    was previously stored under that label is discarded first.

    Args:
        run_data: Run dictionary holding "results" as
            model -> iteration_key -> item_id -> item_info.
        label: "raw" reads "aggregated_score_raw"; any other value reads
            "aggregated_score_calibrated".
    """
    stability_root = run_data.setdefault("iteration_stability", {})
    stability_root[label] = {}
    label_results = stability_root[label]

    results = run_data.get("results", {})
    score_field = (
        "aggregated_score_raw" if label == "raw" else "aggregated_score_calibrated"
    )

    # 1) iteration_averages[iter_key][model] = mean item score.
    #    An iteration key only appears once some model has a valid score in it.
    iteration_averages = defaultdict(dict)
    for model_name, model_data in results.items():
        if not isinstance(model_data, dict):
            continue
        for iter_key, item_dict in model_data.items():
            if not isinstance(item_dict, dict):
                continue
            scores = []
            for item_info in item_dict.values():
                if not isinstance(item_info, dict):
                    continue
                val = item_info.get(score_field, None)
                if isinstance(val, (int, float)):
                    scores.append(val)
            if scores:
                iteration_averages[iter_key][model_name] = statistics.mean(scores)

    all_iter_keys = sorted(iteration_averages.keys())

    # 2) Scoring stability. Every model in results gets an entry, including
    #    those with no usable iteration averages.
    model_stability = {}
    for model_name in results.keys():
        vals = [
            iteration_averages[it_key][model_name]
            for it_key in all_iter_keys
            if model_name in iteration_averages[it_key]
        ]
        if len(vals) > 1:
            mean_score = statistics.mean(vals)
            spread = statistics.pstdev(vals)
        else:
            mean_score = vals[0] if vals else 0.0
            spread = 0.0
        model_stability[model_name] = {
            "mean_iter_score": mean_score,
            "iteration_count": len(vals),
            "stdev_across_iters": spread
        }

    label_results["scoring_stability"] = model_stability

    # 3) Ranking stability. Rank positions (0 = highest average) are built
    #    once per iteration so pair comparisons are plain dict lookups.
    rank_position = {}
    for iter_key in all_iter_keys:
        ranked = sorted(
            iteration_averages[iter_key].items(),
            key=lambda kv: kv[1],
            reverse=True,
        )
        rank_position[iter_key] = {
            model: position for position, (model, _score) in enumerate(ranked)
        }

    pairwise_rank_corr = {}
    for idx, ik in enumerate(all_iter_keys):
        pos_i = rank_position[ik]
        for jk in all_iter_keys[idx + 1:]:
            pos_j = rank_position[jk]
            # Models missing from either iteration are left out of the pair.
            common_models = pos_i.keys() & pos_j.keys()
            x = [pos_i[m] for m in common_models]
            y = [pos_j[m] for m in common_models]

            if len(x) > 1:
                tau, pval = scipy.stats.kendalltau(x, y)
            else:
                # Fewer than two shared models: no correlation can be computed.
                tau, pval = (0.0, 1.0)

            pairwise_rank_corr[f"{ik}__vs__{jk}"] = {
                "common_model_count": len(common_models),
                "kendall_tau": tau,
                "p_value": pval
            }

    label_results["ranking_stability"] = {
        "pairwise_correlation": pairwise_rank_corr
    }

    # Average tau over all pairs, ignoring NaN values.
    if pairwise_rank_corr:
        all_taus = [
            entry["kendall_tau"]
            for entry in pairwise_rank_corr.values()
            if not math.isnan(entry["kendall_tau"])
        ]
        label_results["ranking_stability"]["average_kendall_tau"] = (
            statistics.mean(all_taus) if all_taus else 0.0
        )
|
||||
|
||||
def process_stability_test_item(model_name, iteration_key, item_id, item_text, prompt_template, judge_model) -> float:
    """
    Send one stability-test request to the judge and return its score.

    The prompt is built by substituting ``item_text`` for
    "[TEST MODEL RESPONSE]" in ``prompt_template`` and removing the
    "[TEST MODEL RESPONSE END]" marker.

    Args:
        model_name, iteration_key, item_id: Identify the item; used only in
            the error log message.
        item_text: The writer model's response to be judged.
        prompt_template: Judge prompt containing the placeholder markers.
        judge_model: Judge model identifier passed to the API helper.

    Returns:
        The raw aggregated score when it is a number greater than 0.0.
        Returns 0.0 when the exit flag is set, the score is invalid, or any
        exception is raised (the exception is logged, not propagated).
    """
    # NOTE(review): should_exit appears to be bound via
    # `from utils.state import should_exit`; a later rebinding in another
    # module would presumably not be visible here. Confirm how it is set.
    if should_exit:
        return 0.0

    try:
        prompt = (
            prompt_template
            .replace("[TEST MODEL RESPONSE]", item_text)
            .replace("[TEST MODEL RESPONSE END]", "")
        )
        reply = send_to_judge_model(
            [{"role": "user", "content": prompt}],
            judge_model=judge_model,
        )
        item_score = compute_raw_score(parse_scores(reply))

        # Only hand back real positive scores; everything else counts as failure.
        if isinstance(item_score, (int, float)) and item_score > 0.0:
            return item_score
        return 0.0
    except Exception as e:
        logging.error(f"Error in stability test item {model_name}/{iteration_key}/{item_id}: {str(e)}")
        return 0.0
|
||||
|
||||
def run_stability_test(run_data, judge_model, judge_prompts, samples_data, runs, runs_file, lock, num_threads):
    """
    Run the stability test, topping up each item to STABILITY_REPS valid scores.

    For every (model, iteration, item_id) in STABILITY_ITEMS, the existing
    results are filtered down to valid scores (numbers > 0.0) and enough judge
    requests are scheduled to reach STABILITY_REPS. Requests run on a thread
    pool; each valid score is appended to
    run_data["stability_test_results"][key] and the runs file is saved, both
    under ``lock``.

    Args:
        run_data: Dict for the current run; receives "stability_test_results".
        judge_model: Judge model identifier.
        judge_prompts: Dict of item_id -> judge prompt template.
        samples_data: Dict read as
            samples_data[model]["samples"][iteration][item_id] -> text.
        runs: Object persisted via save_json_file after each valid result.
        runs_file: Path passed to save_json_file.
        lock: Lock guarding updates to run_data and the save.
        num_threads: Thread-pool size.

    Items whose sample text or judge prompt is missing are skipped with a
    warning instead of being sent to the judge with an empty prompt.
    """
    logging.info("Running stability test for selected items...")

    stability_results = run_data.setdefault("stability_test_results", {})

    items_to_process = []
    skipped_keys = []
    for (model, iteration, item_id) in STABILITY_ITEMS:
        key_name = f"{model}-{iteration}-{item_id}"
        existing_results = stability_results.get(key_name, [])

        # Drop failed results (0.0 scores, None values) so they get retried.
        valid_results = [score for score in existing_results
                         if isinstance(score, (int, float)) and score > 0.0]
        # Store the cleaned list now so the key exists before workers append.
        stability_results[key_name] = valid_results

        needed_count = STABILITY_REPS - len(valid_results)
        if needed_count <= 0:
            continue

        item_text = samples_data.get(model, {}).get("samples", {}).get(iteration, {}).get(item_id, "")
        prompt_template = judge_prompts.get(item_id, "")

        if not item_text or not prompt_template:
            # Without both pieces the judge would be asked to score an empty
            # prompt/response needed_count times; skip rather than do that.
            logging.warning(
                f"Skipping stability item {key_name}: missing sample text or judge prompt"
            )
            skipped_keys.append(key_name)
            continue

        for _ in range(needed_count):
            items_to_process.append({
                "model": model,
                "iteration": iteration,
                "item_id": item_id,
                "item_text": item_text,
                "prompt_template": prompt_template,
                "key_name": key_name
            })

        logging.info(f"Need {needed_count} more stability test results for {key_name}")

    if not items_to_process:
        if skipped_keys:
            logging.warning(
                f"No stability test requests scheduled; items with missing data: {skipped_keys}"
            )
        else:
            logging.info("All stability test items already have complete results")
        return

    with ThreadPoolExecutor(max_workers=num_threads) as exec_:
        futures_to_items = {}

        # Launch futures for all needed retries
        for item in items_to_process:
            if should_exit:
                break
            future = exec_.submit(
                process_stability_test_item,
                item["model"], item["iteration"], item["item_id"],
                item["item_text"], item["prompt_template"],
                judge_model
            )
            futures_to_items[future] = item

        # Collect results as they complete
        for future in tqdm(concurrent.futures.as_completed(futures_to_items.keys()),
                           total=len(futures_to_items),
                           desc="Stability Test"):
            if should_exit:
                break
            try:
                score = future.result()
                key_name = futures_to_items[future]["key_name"]

                # Only store actual valid scores
                if isinstance(score, (int, float)) and score > 0.0:
                    with lock:
                        stability_results[key_name].append(score)
                        save_json_file(runs, runs_file)
                else:
                    logging.warning(f"Got invalid score for stability item {key_name}, will need retry")
            except Exception as exc:
                logging.error(f"Exception in stability test: {exc}")
|
||||
@@ -0,0 +1,125 @@
|
||||
import sys
|
||||
import signal
|
||||
import argparse
|
||||
import logging
|
||||
import time
|
||||
|
||||
|
||||
from utils.logging_setup import setup_logging, get_verbosity
|
||||
from utils.file_io import load_json_file
|
||||
from core.benchmark import run_judgemark_v2
|
||||
from utils.api import API_KEY
|
||||
from utils.state import should_exit, executor
|
||||
|
||||
|
||||
def signal_handler(signum, frame):
    """
    Handle interrupt signals (SIGINT, SIGTERM).

    Sets this module's ``should_exit`` flag, pauses briefly, shuts down the
    executor (if one is set) without waiting, and exits with status 1.

    Args:
        signum: Number of the signal received.
        frame: Current stack frame (unused).
    """
    # NOTE(review): the assignment below rebinds this module's global only;
    # whether other modules observe it depends on how they import the flag.
    # Confirm against utils.state.
    global executor, should_exit

    print(f"\n[DEBUG] Signal {signum} caught!")
    logging.warning("Signal handler called")

    should_exit = True
    time.sleep(0.1)  # Give workers a moment to see the flag

    pool = executor
    if pool:
        logging.info("Shutting down executor from signal handler")
        pool.shutdown(wait=False)
        logging.info("Executor shutdown complete")

    sys.exit(1)
|
||||
|
||||
def parse_args():
    """Parse the benchmark's command-line options from ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` holding ``judge_model`` (required),
        ``samples_file``, ``prompts_file``, ``runs_file``, ``run_id``,
        ``threads``, ``verbosity``, ``num_runs`` and ``save_raw_judge_output``.
    """
    # One (flag, add_argument keyword options) entry per option, in the order
    # they should appear in the generated help text.
    option_table = (
        ('--judge-model', dict(
            required=True,
            help='Judge model identifier (e.g., openai/gpt-4)',
        )),
        ('--samples-file', dict(
            default="data/judgemark_v2.1_samples.json",
            help='JSON file containing pre-generated samples from various writer models',
        )),
        ('--prompts-file', dict(
            default="data/judge_prompts.json",
            help='JSON file containing the partial judge prompts to be filled with test responses',
        )),
        ('--runs-file', dict(
            default="judgemark_v2_runs.json",
            help='Path to store the Judgemark run results',
        )),
        ('--run-id', dict(
            help='Resume (or create) a run using this base ID, to be combined with the judge model name',
        )),
        ('--threads', dict(
            type=int,
            default=6,
            help='Number of threads to use',
        )),
        ('--verbosity', dict(
            choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
            help='Set the logging verbosity level',
        )),
        ('--num-runs', dict(
            type=int,
            default=1,
            help='Number of benchmark runs to execute',
        )),
        ('--save-raw-judge-output', dict(
            action='store_true',
            default=False,
            help='If set, store the raw judge model output in the results JSON (default: false)',
        )),
    )

    cli = argparse.ArgumentParser(description='Run Judgemark-v2 Benchmark')
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args()
|
||||
|
||||
if __name__ == "__main__":
    # Route both Ctrl-C and termination requests through the same handler.
    for handled_signal in (signal.SIGINT, signal.SIGTERM):
        signal.signal(handled_signal, signal_handler)

    # Start every invocation with the exit sentinel cleared.
    should_exit = False

    # Command-line options drive everything below.
    args = parse_args()

    # Logging level comes from the CLI when given, otherwise from get_verbosity's fallback.
    verbosity = get_verbosity(args.verbosity)
    setup_logging(verbosity)
    logging.debug("Logging initialized")

    # The judge model cannot be called without an API key, so fail fast.
    if not API_KEY:
        logging.critical("No OPENAI_API_KEY found in environment variables.")
        raise ValueError("OPENAI_API_KEY not found in environment variables.")

    # The same settings are used for every requested run.
    run_settings = dict(
        judge_model=args.judge_model,
        samples_file=args.samples_file,
        prompts_file=args.prompts_file,
        runs_file=args.runs_file,
        num_threads=args.threads,
        run_id=args.run_id,
        save_raw_judge_output=args.save_raw_judge_output,
    )

    run_ids = []
    for i in range(1, args.num_runs + 1):
        # Stop launching further runs once an interrupt has been flagged.
        if should_exit:
            break
        logging.info(f"Starting Judgemark-v2 run {i} of {args.num_runs}")
        run_ids.append(run_judgemark_v2(**run_settings))

    # Summarise each run that was started, using what is stored in the runs file.
    runs = load_json_file(args.runs_file)
    logging.info("\nAll Judgemark-v2 runs completed:")
    print("\nAll Judgemark-v2 runs completed:")
    for rid in run_ids:
        final_score = runs.get(rid, {}).get("final_judgemark_score", "N/A")
        logging.info(f"Run ID: {rid}, Final Judgemark Score: {final_score}")
        print(f"Run ID: {rid}")
        print(f"Final Judgemark-v2 Score: {final_score}")
|
||||
@@ -0,0 +1,3 @@
|
||||
matplotlib>=3.7
|
||||
transformers>=4.26
|
||||
scipy>=1.10
|
||||
|
After Width: | Height: | Size: 298 KiB |
|
After Width: | Height: | Size: 298 KiB |
|
After Width: | Height: | Size: 293 KiB |
|
After Width: | Height: | Size: 299 KiB |
|
After Width: | Height: | Size: 295 KiB |
|
After Width: | Height: | Size: 298 KiB |
|
After Width: | Height: | Size: 293 KiB |
|
After Width: | Height: | Size: 299 KiB |
|
After Width: | Height: | Size: 297 KiB |
|
After Width: | Height: | Size: 292 KiB |
|
After Width: | Height: | Size: 296 KiB |
|
After Width: | Height: | Size: 291 KiB |
|
After Width: | Height: | Size: 301 KiB |
|
After Width: | Height: | Size: 297 KiB |
|
After Width: | Height: | Size: 1.1 MiB |
|
After Width: | Height: | Size: 1.1 MiB |
|
After Width: | Height: | Size: 1.1 MiB |
|
After Width: | Height: | Size: 1.1 MiB |
|
After Width: | Height: | Size: 1.1 MiB |
|
After Width: | Height: | Size: 1.1 MiB |
|
After Width: | Height: | Size: 1.1 MiB |
|
After Width: | Height: | Size: 1.0 MiB |
|
After Width: | Height: | Size: 1.1 MiB |
|
After Width: | Height: | Size: 1.1 MiB |
|
After Width: | Height: | Size: 1.1 MiB |
|
After Width: | Height: | Size: 1.1 MiB |
|
After Width: | Height: | Size: 1.1 MiB |
|
After Width: | Height: | Size: 1.0 MiB |
@@ -0,0 +1,16 @@
|
||||
model,judgemark_score,judgemark_score_raw,kendall_tau_bootstrapped,std_dev,kw_stat,ci99_overlap_magnitude_sum_norm,calibrated_score_range_norm,modulated_ci95,emd_norm
|
||||
ministral/Ministral-3b-instruct,7.5,15.06,0.0,0.099,0.013,0.094,0.858,0.012,0.363
|
||||
gpt-4o-mini,44.05,44.35,0.628,0.438,0.298,0.324,3.149,0.109,1.187
|
||||
deepseek-ai/deepseek-r1,76.97,71.77,0.863,0.821,0.905,0.683,6.098,0.6,2.195
|
||||
claude-3.5-haiku-20241022,77.64,76.88,0.898,0.832,0.891,0.665,5.975,0.575,2.229
|
||||
claude-3.5-sonnet-20240620,78.84,74.72,0.905,0.83,0.902,0.664,6.255,0.591,2.232
|
||||
microsoft/wizardlm-2-8x22b,55.42,52.86,0.749,0.566,0.463,0.417,4.62,0.198,1.53
|
||||
gpt-4o-2024-11-20,82.51,77.75,0.89,0.837,0.985,0.75,6.233,0.814,2.228
|
||||
gemini-2.0-flash-thinking-exp-1219,71.79,66.1,0.818,0.765,0.817,0.631,5.519,0.453,2.057
|
||||
deepseek-ai/deepseek-r1-distill-llama-70b,59.67,56.86,0.764,0.642,0.6,0.474,4.485,0.243,1.726
|
||||
mistralai/mistral-large-instruct-2411,65.63,63.25,0.794,0.732,0.636,0.505,5.873,0.296,1.935
|
||||
meta-llama/llama-3.1-8b-instruct,35.72,35.67,0.46,0.401,0.201,0.207,3.232,0.063,1.075
|
||||
meta-llama/llama-3.1-405b-instruct,60.09,58.76,0.791,0.633,0.582,0.471,4.515,0.243,1.712
|
||||
deepseek/deepseek-r1-distill-qwen-14b,,,,,,,,,
|
||||
mistralai/mistral-small-24b-instruct-2501,63.64,61.23,0.808,0.67,0.618,0.504,5.194,0.286,1.802
|
||||
liquid/lfm-7b,10.24,14.33,0.139,0.192,0.027,0.0,1.702,0.0,0.584
|
||||
|
@@ -0,0 +1,51 @@
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
import requests
|
||||
from typing import List, Dict
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Merge variables from a local .env file (when present) before reading the environment.
load_dotenv()

# Chat-completions endpoint; overridable through OPENAI_API_URL.
BASE_URL = os.environ.get("OPENAI_API_URL", "https://openrouter.ai/api/v1/chat/completions")
# Bearer token for the endpoint; None when OPENAI_API_KEY is not set.
API_KEY = os.environ.get("OPENAI_API_KEY")
# HTTP headers sent with every judge request.
HEADERS = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json",
}
# Retry policy for judge requests: attempt count and pause (seconds) between attempts.
MAX_RETRIES = int(os.environ.get("MAX_RETRIES", "3"))
RETRY_DELAY = int(os.environ.get("RETRY_DELAY", "5"))
|
||||
|
||||
def send_to_judge_model(messages: List[Dict], judge_model: str, max_retries: int = MAX_RETRIES,
                        timeout: float = 600.0) -> str:
    """
    Sends user messages to the judge model with basic retry logic.
    Expects an OpenAI-compatible endpoint.

    Args:
        messages: Chat messages, sent as the request's ``messages`` field.
        judge_model: Model identifier, sent as the request's ``model`` field.
        max_retries: Maximum number of attempts before giving up.
        timeout: Seconds ``requests`` waits for the connection and for each
            chunk of the response before the attempt fails. Without it a
            stalled connection would block the calling thread indefinitely.
            Pass ``None`` to wait forever (the previous behaviour).

    Returns:
        The ``content`` of the first choice in the response, or ``""`` when
        ``max_retries`` is less than 1 and no request is made.

    Raises:
        Exception: Whatever the final attempt raised (HTTP error, timeout,
            malformed response, ...), after all retries are exhausted.
    """
    # temp and top_k are set to produce diversity in judge outputs between runs,
    # but constrained to be near the model's best answer (since we are doing numerical scoring).
    # The payload does not change between attempts, so it is built once.
    data = {
        "model": judge_model,
        "messages": messages,
        "temperature": 0.5,
        "top_k": 3,
        "max_tokens": 8096,
        #"provider": {
        #    "order": [
        #        "DeepSeek"
        #    ]
        #}
    }
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.post(BASE_URL, headers=HEADERS, json=data, timeout=timeout)
            response.raise_for_status()
            res_json = response.json()
            return res_json['choices'][0]['message']['content']
        except Exception as e:
            logging.error(f"Error on attempt {attempt} for judge model {judge_model}: {e}")
            if attempt == max_retries:
                logging.critical(f"Max retries reached for judge model {judge_model}")
                raise
            time.sleep(RETRY_DELAY)
    # Only reachable when the loop body never ran (max_retries < 1).
    return ""
|
||||
@@ -0,0 +1,17 @@
|
||||
import json
|
||||
import logging
|
||||
|
||||
def load_json_file(file_path: str) -> dict:
    """Read and parse the UTF-8 JSON document at ``file_path``.

    A missing file is not treated as an error: a warning is logged and an
    empty dict is returned. Any other failure (e.g. invalid JSON) propagates.
    """
    try:
        handle = open(file_path, 'r', encoding='utf-8')
    except FileNotFoundError:
        logging.warning(f"File {file_path} not found, returning empty dict.")
        return {}
    with handle:
        return json.load(handle)
|
||||
|
||||
def save_json_file(data: dict, file_path: str):
    """Saves a dict to disk as JSON without ever leaving a half-written file.

    The JSON is first written to a temporary sibling file and then moved over
    ``file_path`` with ``os.replace``. If serialization fails or the process is
    interrupted mid-write, the previous contents of ``file_path`` are left
    untouched (opening the target directly with ``'w'`` would truncate it
    before the new content is complete).

    Args:
        data: JSON-serializable object to store.
        file_path: Destination path; its directory must already exist.

    Raises:
        TypeError / ValueError: If ``data`` cannot be serialized by ``json``.
        OSError: If the temporary file cannot be written or moved into place.
    """
    import os
    import threading

    # Sibling temp name (same directory => same filesystem, so the replace is a
    # plain rename); pid + thread id keep concurrent writers from colliding.
    tmp_path = f"{file_path}.{os.getpid()}.{threading.get_ident()}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
        os.replace(tmp_path, file_path)
    except BaseException:
        # Also covers SystemExit/KeyboardInterrupt raised mid-write; drop the
        # partial temp file and let the original error propagate.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
    logging.debug(f"Saved JSON data to {file_path}")
|
||||
@@ -0,0 +1,24 @@
|
||||
import os
|
||||
import logging
|
||||
|
||||
def setup_logging(verbosity: str):
    """Configure logging via ``logging.basicConfig`` for the given verbosity name.

    ``verbosity`` is matched case-insensitively against DEBUG, INFO, WARNING,
    ERROR and CRITICAL; any other value falls back to INFO.
    """
    recognised = ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL')
    level_by_name = {name: getattr(logging, name) for name in recognised}

    requested = verbosity.upper()
    if requested in level_by_name:
        chosen_level = level_by_name[requested]
    else:
        chosen_level = logging.INFO

    logging.basicConfig(
        level=chosen_level,
        format='%(asctime)s - %(levelname)s - %(threadName)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )
|
||||
|
||||
def get_verbosity(args_verbosity: str) -> str:
    """Pick the verbosity name: the CLI value if non-empty, else ``LOG_VERBOSITY``, else "INFO"."""
    return args_verbosity or os.getenv("LOG_VERBOSITY", "INFO")
|
||||
@@ -0,0 +1,3 @@
|
||||
# Global control flags
#
# NOTE(review): prefer accessing these as attributes of this module (for
# example ``state.should_exit``). ``from <this module> import should_exit``
# copies the current binding into the importer, so rebinding the name there
# does not update the value stored here.

# Starts False; presumably set to True to ask running work to stop -- confirm against writers.
should_exit = False
# Starts as None; presumably holds the active executor during a run -- TODO confirm.
executor = None
|
||||
@@ -0,0 +1,11 @@
|
||||
def clamp(x: float, low: float = 0.0, high: float = 1.0) -> float:
    """Restrict ``x`` to the closed interval [``low``, ``high``]."""
    capped_above = min(x, high)
    return max(low, capped_above)
|
||||
|
||||
def normalize(val, min_val, max_val, bigger_is_better=True):
    """Rescale ``val`` from [``min_val``, ``max_val``] onto [0, 1] and clamp it.

    Returns 0.0 when the range is empty or inverted (``max_val <= min_val``).
    When ``bigger_is_better`` is false the scale is flipped, so ``min_val``
    maps to 1.0 and ``max_val`` to 0.0.
    """
    if max_val <= min_val:
        return 0.0
    fraction = (val - min_val) / (max_val - min_val)
    return clamp(fraction if bigger_is_better else 1.0 - fraction)
|
||||
@@ -0,0 +1,240 @@
|
||||
import re
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.ticker as ticker
|
||||
from typing import Dict
|
||||
from scipy.stats import linregress
|
||||
from scipy.stats import spearmanr, theilslopes
|
||||
from config.constants import NEGATIVE_MARKERS, MODEL_NAME_REPLACEMENTS
|
||||
|
||||
|
||||
def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_data: Dict) -> None:
    """
    Render two PNG charts for one judge run and save them under results/charts/.

    Figure #1 (judgemark_3chart_<judge>.png) has three subplots side-by-side:
      (1) Raw mean score per model as horizontal bars (+ 95% CI error bars)
      (2) Calibrated mean score per model as horizontal bars (+ 95% CI error bars)
      (3) Heatmap of the per-criterion score distribution for each model:
          the percentage of a model's criterion scores falling in each unit-wide
          bin from 0 to 10, with scores for negative criteria flipped to 10 - score.
    Models are ordered by calibrated mean, highest first, in all three subplots.

    Figure #2 (judgemark_scattergrid_<judge>.png) is a 4×4 grid of scatter plots,
    one per model (up to 16), showing item length (chars) vs. aggregated_score_raw.
    When a model has more than one point, a Theil-Sen fitted line is drawn and
    labelled with the Spearman rank correlation and its p-value.

    Args:
        run_data: Run record. Keys read here: "raw_model_stats" and
            "calibrated_model_stats" (model -> {"mean", "ci95"}), and "results"
            (model -> iteration -> item id -> dict with "parsed_scores" and
            "aggregated_score_raw").
        judge_model: Judge identifier; replaced by its display name when listed
            in MODEL_NAME_REPLACEMENTS, then used in titles and file names.
        samples_data: model -> {"samples": {iteration -> {item id -> text}}};
            only the length of each text is used.

    Returns:
        None. The figures are written to disk and closed.
    """
    # -------------------------------------------------------------------
    # 1) The main (raw / calibrated / heatmap) figure
    # -------------------------------------------------------------------
    raw_stats = run_data["raw_model_stats"]
    cal_stats = run_data["calibrated_model_stats"]

    # Swap in the display name for the judge, if one is configured
    if judge_model in MODEL_NAME_REPLACEMENTS:
        judge_model = MODEL_NAME_REPLACEMENTS[judge_model]

    # All model names in raw_stats
    model_names = list(raw_stats.keys())

    # Convert to arrays for sorting
    # (every model in raw_stats must also be present in cal_stats)
    raw_means = [raw_stats[m]["mean"] for m in model_names]
    cal_means = [cal_stats[m]["mean"] for m in model_names]
    raw_cis = [raw_stats[m]["ci95"] for m in model_names]
    cal_cis = [cal_stats[m]["ci95"] for m in model_names]

    # Sort by calibrated score descending; the same order is applied to every list
    sorted_indices = np.argsort(cal_means)[::-1]
    model_names = [model_names[i] for i in sorted_indices]
    raw_means = [raw_means[i] for i in sorted_indices]
    cal_means = [cal_means[i] for i in sorted_indices]
    raw_cis = [raw_cis[i] for i in sorted_indices]
    cal_cis = [cal_cis[i] for i in sorted_indices]

    # 1.A) Build data for the heatmap: per-criterion scores (with negative flips)
    # We gather them from run_data "parsed_scores"
    # Then we convert them to 0..10 bins and store percentage distribution.
    all_scores_by_model = {m: [] for m in model_names}
    results = run_data.get("results", {})

    for model_name in model_names:
        iter_dict = results.get(model_name, {})
        for iteration_key, item_dict in iter_dict.items():
            # Skip anything that is not an iteration -> items mapping
            if not isinstance(item_dict, dict):
                continue
            for item_id, item_info in item_dict.items():
                if not isinstance(item_info, dict):
                    continue
                parsed_scores = item_info.get("parsed_scores", {})
                if not isinstance(parsed_scores, dict):
                    continue
                for crit_name, val in parsed_scores.items():
                    # Only numeric scores inside the 0..10 scale are counted
                    if isinstance(val, (int, float)) and 0 <= val <= 10:
                        crit_lower = crit_name.strip().lower()
                        # Flip negative criteria (matched by substring) so that
                        # higher is always better
                        if any(nm in crit_lower for nm in NEGATIVE_MARKERS):
                            final_val = 10 - val
                        else:
                            final_val = val
                        all_scores_by_model[model_name].append(final_val)

    # Convert to a 2D array for the heatmap (rows = models, columns = bins)
    # 11 edges 0, 1, ..., 10 give ten unit-wide bins.
    bins = np.linspace(0, 10, 11)
    heatmap_rows = []
    for m in model_names:
        scores = all_scores_by_model[m]
        if scores:
            counts, _ = np.histogram(scores, bins=bins)
            # Percentage of this model's criterion scores in each bin
            pct = (counts / len(scores)) * 100.0
        else:
            # No scores collected for this model: an all-zero row
            pct = np.zeros(len(bins)-1, dtype=float)
        heatmap_rows.append(pct)
    heatmap_data = np.array(heatmap_rows, dtype=float)

    # 1.B) Plot the main figure with 3 subplots
    fig1, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 7))  # Increased height

    # Increase the base font size for all text elements
    # NOTE(review): this updates matplotlib's rcParams after the figure has been
    # created and is not restored afterwards -- confirm that is intended.
    plt.rcParams.update({'font.size': 14})

    # (A) Raw bar chart
    y_pos = np.arange(len(model_names))
    ax1.barh(y_pos, raw_means, color='skyblue', alpha=0.7)
    for i, (mean_val, ci95) in enumerate(zip(raw_means, raw_cis)):
        # One horizontal error bar per model, centred on the bar's end
        ax1.errorbar(mean_val, i, xerr=ci95, color='red', capsize=5)
    ax1.set_yticks(y_pos)
    ax1.set_yticklabels(model_names, fontsize=12)
    ax1.invert_yaxis()  # first (highest calibrated score) model at the top
    ax1.set_xlabel("Raw Scores", fontsize=14)
    ax1.set_title("Raw Model Scores (95% CI)", fontsize=16)
    ax1.grid(True, alpha=0.3)
    ax1.tick_params(axis='y', pad=10)

    # (B) Calibrated bar chart
    ax2.barh(y_pos, cal_means, color='lightgreen', alpha=0.7)
    for i, (mean_val, ci95) in enumerate(zip(cal_means, cal_cis)):
        ax2.errorbar(mean_val, i, xerr=ci95, color='red', capsize=5)
    ax2.set_yticks(y_pos)
    ax2.set_yticklabels(model_names, fontsize=12)
    ax2.invert_yaxis()
    ax2.set_xlabel("Calibrated Scores", fontsize=14)
    ax2.set_title("Calibrated Model Scores (95% CI)", fontsize=16)
    ax2.grid(True, alpha=0.3)
    ax2.tick_params(axis='y', pad=10)

    # (C) Heatmap
    # Each column is labelled with the left edge of its bin (0..9)
    bin_edges = bins[:-1]
    ax3.set_xticks(np.arange(len(bin_edges)))
    ax3.set_xticklabels([str(int(be)) for be in bin_edges])
    im = ax3.imshow(heatmap_data, aspect='auto', origin='upper', cmap='plasma')
    #ax3.set_xticks(np.arange(len(bin_centers)))
    #ax3.set_xticklabels([f"{bc:.0f}" for bc in bin_centers], fontsize=12)
    ax3.set_yticks(np.arange(len(model_names)))
    ax3.set_yticklabels(model_names, fontsize=12)
    ax3.set_xlabel("Score Bin (0–10)", fontsize=14)
    ax3.set_title("Per-Criterion Score Distribution (Heatmap)", fontsize=16)
    ax3.tick_params(axis='y', pad=10)

    cbar = plt.colorbar(im, ax=ax3)
    cbar.set_label("% of Criteria in Bin", fontsize=14)
    cbar.ax.tick_params(labelsize=12)
    # NOTE(review): the formatter configured on the next line is replaced by the
    # PercentFormatter immediately afterwards, so this call looks redundant.
    cbar.formatter.set_powerlimits((0, 0))
    cbar.formatter = ticker.PercentFormatter(decimals=1)
    cbar.update_ticks()

    # Overall title

    # File-name-safe form of the judge name: "/" becomes "__" and every other
    # character that is not a word character or "-" becomes "-"
    sanitized_judge = re.sub(r"[^\w\-]", "-", judge_model.replace("/", "__"))
    fig1.suptitle(f"Judgemark: Raw/Calibrated/Heatmap - Judge: {judge_model}", fontsize=20)

    # Adjust layout with more space
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])

    # NOTE(review): nothing in this function creates results/charts/ -- it
    # presumably has to exist before this call; confirm.
    plt.savefig(f"results/charts/judgemark_3chart_{sanitized_judge}.png",
                bbox_inches='tight',
                dpi=150,
                pad_inches=0.5)
    plt.close(fig1)

    # -------------------------------------------------------------------
    # 2) Second Figure: A 4×4 grid of scatter plots (per-model),
    #    where X = item length, Y = aggregated_score_raw.
    #    We'll gather text straight from samples_data, compute len(...).
    # -------------------------------------------------------------------
    # If you want exactly 16 models, skip any extra or exclude some.
    # Example: exclude "gemini-1.5-pro-001"
    excluded_models = {"gemini-1.5-pro-001"}
    model_list_for_scatter = [m for m in model_names if m not in excluded_models]

    # If it's still longer than 16, slice it to 16
    # (model_names is sorted, so the 16 highest calibrated scores are kept)
    if len(model_list_for_scatter) > 16:
        model_list_for_scatter = model_list_for_scatter[:16]

    # Build figure and subplots: 4x4
    fig2, axes2 = plt.subplots(4, 4, figsize=(20, 20))  # each cell is a scatter
    fig2.suptitle(f"Judgemark: Per-Model Length vs. Score - Judge: {judge_model}", fontsize=18)

    # We might have fewer than 16 models. We'll track them by row & col.
    for idx, mname in enumerate(model_list_for_scatter):
        row = idx // 4
        col = idx % 4
        ax = axes2[row, col]

        # Collect all (length, raw_score) for this model
        length_vals = []
        score_vals = []

        # For each (iteration_key, item_id), find the text in samples_data,
        # find aggregated_score_raw in run_data, then store pairs
        model_res = run_data["results"].get(mname, {})
        for it_key, it_dict in model_res.items():
            if not isinstance(it_dict, dict):
                continue
            for item_id, item_info in it_dict.items():
                if not isinstance(item_info, dict):
                    continue
                raw_score = item_info.get("aggregated_score_raw", None)
                # Items without a numeric aggregated score are left out
                if not isinstance(raw_score, (int, float)):
                    continue

                # Look up the text in samples_data:
                # (a missing model / iteration / item yields "" and is dropped below)
                text = (samples_data
                        .get(mname, {})
                        .get("samples", {})
                        .get(it_key, {})
                        .get(item_id, ""))
                text_len = len(text)

                # If it's non-empty text
                if text_len > 0:
                    length_vals.append(text_len)
                    score_vals.append(raw_score)

        ax.set_title(mname, fontsize=12)
        ax.set_xlabel("Length")
        ax.set_ylabel("Raw Score")

        # At least two points are needed for the correlation and the fitted line
        if len(length_vals) > 1:
            ax.scatter(length_vals, score_vals, alpha=0.4, color='blue')

            # -- Rank-based correlation (Spearman) --
            rho, p_value = spearmanr(length_vals, score_vals)

            # -- Robust linear fit (Theil-Sen) --
            # returns slope, intercept, lower_slope, upper_slope
            # (score is the dependent variable, length the independent one;
            # the slope bounds are not used below)
            slope, intercept, lo_slope, hi_slope = theilslopes(score_vals, length_vals, alpha=0.95)

            # Build the line across the observed length range
            xline = np.linspace(min(length_vals), max(length_vals), 200)
            yline = slope * xline + intercept
            ax.plot(xline, yline, color='red', linewidth=2,
                    label=f"Spearman ρ={rho:.2f}, p={p_value:.2g}")
            ax.legend(loc="best")
        else:
            ax.text(0.5, 0.5, "Not enough data",
                    ha='center', va='center', color='red', transform=ax.transAxes)

    # If we have fewer than 16 models, blank out remaining subplots
    total_subplots = 16
    for i in range(len(model_list_for_scatter), total_subplots):
        row = i // 4
        col = i % 4
        axes2[row, col].axis("off")

    plt.tight_layout()
    plt.savefig(f"results/charts/judgemark_scattergrid_{sanitized_judge}.png", bbox_inches='tight', dpi=200)
    plt.close(fig2)
|
||||