first commit

2026-06-27 16:10:14 +08:00 · 2025-01-31 18:03:33 +11:00
commit 7b9ed0746f
60 changed files with 111225 additions and 0 deletions
@@ -0,0 +1,8 @@
+.env
+.vscode
+backup
+judgemark_v2_runs.json
+judgemark_v2.1_runs.json
+ministral-repetition-results.json
+*.pyc
+dev
@@ -0,0 +1,55 @@
+"""Global constants and reference scores for the Judgemark-v2 benchmark."""
+
+# Stability test configuration.
+# Each entry is a 3-tuple of strings. Presumably (model name, iteration key,
+# item id) identifying one sample to re-judge repeatedly -- TODO confirm
+# against core.stability.run_stability_test, which consumes this elsewhere.
+STABILITY_ITEMS = [
+    ("gemma-2b-it", "2", "28"),
+    ("Llama-3-70b-chat-hf", "1", "42"),
+    ("DeepSeek-R1", "1", "2"),
+]
+# Presumably the number of repeated judgings per stability item -- verify
+# against the stability test implementation.
+STABILITY_REPS = 100
+
+# Reference scores for correlation.
+# Keyed by test-model name. The cross-model statistics pair each model's
+# per-iteration mean judge score with the value stored here and compute
+# Pearson / Kendall correlations; models missing from this table are left out
+# of the correlation. The scoring code's comments describe these values as
+# LMSYS arena scores.
+REFERENCE_MODEL_SCORES = {
+    "DeepSeek-R1": 1430, # TODO: placeholder value -- needs to be updated
+    "gpt-4o-2024-11-20": 1402,
+    "gemini-1.5-pro-002": 1333,
+    "gemini-1.5-pro-001": 1276,
+    "claude-3-5-sonnet-20240620": 1243,
+    "Mistral-Large-Instruct-2411": 1246,
+    "claude-3-opus-20240229": 1240,
+    #"c4ai-command-r-plus-08-2024": 1236,
+    "Llama-3-70b-chat-hf": 1214,
+    "claude-3-haiku-20240307": 1163,
+    "c4ai-command-r-08-2024": 1159,
+    "Mixtral-8x22B-Instruct-v0.1": 1147,
+    "Mixtral-8x7B-Instruct-v0.1": 1114,
+    "databricks/dbrx-instruct": 1102,
+    #"openchat-3.5-1210": 1127,
+    "gpt-3.5-turbo-0125": 1099,
+    "Llama-2-13b-chat-hf": 1050,
+    "gemma-7b-it": 1029,
+    "gemma-2b-it": 989,
+}
+
+# Negative criteria markers for score computation.
+# A judged criterion whose lower-cased name contains one of these substrings
+# is treated as "lower is better": the raw-score computation counts it as
+# (10 - score) instead of score.
+# NOTE(review): compute_raw_score in the scoring module carries its own
+# identical copy of this list rather than importing this constant; keep the
+# two in sync (or switch the scorer to import it).
+NEGATIVE_MARKERS = [
+    "melodramatic", "shallow resolution", "unearned resolution",
+    "simplistic moralizing", "shallow optimism", "forced optimism", 
+    "trite", "overwrought", "amateurish", "contrived", "uninspiring",
+    "characters are too good", "incongruent ending positivity",
+    "unearned transformations", "profundity over-reach",
+    "amateurish descriptives", "clunky asides", "stilted dialogue",
+    "tit-for-tat dialogue", "purple prose", "uncreative", "tell-don't-show",
+    "weak dialogue", "meandering"
+]
+
+# Maps provider-prefixed model identifiers (keys) to replacement names
+# (values).
+# NOTE(review): the consumer of this table is not part of this module;
+# presumably it normalises judge-model names for reporting -- confirm at the
+# call site.
+MODEL_NAME_REPLACEMENTS = {
+    "mistralai/ministral-3b": "ministral/Ministral-3b-instruct",
+    "openai/gpt-4o-mini": "gpt-4o-mini",
+    "deepseek/deepseek-r1": "deepseek-ai/deepseek-r1",
+    "anthropic/claude-3.5-haiku-20241022": "claude-3.5-haiku-20241022",
+    "anthropic/claude-3.5-sonnet-20240620": "claude-3.5-sonnet-20240620",
+    "openai/gpt-4o-2024-11-20": "gpt-4o-2024-11-20",
+    "deepseek/deepseek-r1-distill-llama-70b": "deepseek-ai/deepseek-r1-distill-llama-70b",
+    "mistralai/mistral-large-2411": "mistralai/mistral-large-instruct-2411",
+}
@@ -0,0 +1,514 @@
+import os
+import re
+import uuid
+import time
+import signal
+import logging
+import threading
+import concurrent.futures
+from datetime import datetime
+from typing import Dict, List
+from tqdm import tqdm
+from collections import defaultdict
+
+from utils.file_io import load_json_file, save_json_file
+from utils.api import send_to_judge_model
+from utils.visualization import create_side_by_side_score_charts
+import statistics
+from core.scoring import (
+    parse_scores, compute_raw_score, compute_detailed_distribution,
+    compute_model_level_stats, compute_cross_model_stats,
+    build_landmark_calibration_config, apply_landmark_calibration,
+    log_score_summary, confidence_interval_95
+)
+from core.scoring import compute_detailed_distribution, compute_detailed_distribution  # etc
+from core.separability import compute_separability_metrics
+from core.stability import run_stability_test, compute_iteration_stability, compute_randomized_iteration_rank_stability_by_item
+from utils.stats import clamp
+from utils.state import should_exit, executor
+from utils.stats import normalize
+
+def process_sample(model_name: str, iteration_key: str, item_id: str, item_text: str, 
+                  prompt_template: str, run_key: str, runs: Dict, runs_file: str,
+                  lock: threading.Lock, judge_model: str, save_raw_judge_output: bool):
+    """Judge a single sample and persist the outcome into ``runs``.
+
+    The sample is skipped when ``runs`` already holds a complete result for it
+    (at least 10 parsed scores and a positive ``aggregated_score_raw``).
+    Otherwise the judge prompt is built from ``prompt_template``, sent to
+    ``judge_model``, the reply is parsed, and the result is stored under
+    ``runs[run_key]["results"][model_name][iteration_key][item_id]`` and
+    written to ``runs_file``.
+
+    Any exception raised while judging is logged, recorded on the item as an
+    ``"error"`` entry and appended to ``runs[run_key]["errors"]``; it is not
+    re-raised.
+
+    Args:
+        model_name: Name of the model that produced ``item_text``.
+        iteration_key: Iteration the sample belongs to.
+        item_id: Identifier of the sample within the iteration.
+        item_text: The text to be judged.
+        prompt_template: Judge prompt containing the ``[TEST MODEL RESPONSE]``
+            placeholder.
+        run_key: Key of the current run inside ``runs``.
+        runs: Shared runs structure; mutated in place.
+        runs_file: Path the runs structure is saved to.
+        lock: Lock guarding every access to ``runs`` and ``runs_file``.
+        judge_model: Judge model identifier passed to the API helper.
+        save_raw_judge_output: When true, the raw judge reply is stored too.
+    """
+    # NOTE(review): ``should_exit`` is bound via ``from utils.state import
+    # should_exit``, so this reads this module's own copy of the flag. Updates
+    # made to ``utils.state.should_exit`` elsewhere would not be visible here
+    # -- confirm how the flag is set on shutdown.
+    global should_exit
+    if should_exit:
+        return
+
+    text_len = len(item_text)
+
+    # Touch the shared structure only while holding the lock: other workers
+    # serialise ``runs`` (save_json_file) under the same lock, and inserting
+    # keys while a dict is being iterated for dumping is not safe.
+    with lock:
+        # setdefault (rather than .get with a throwaway default) keeps the
+        # dicts we write into attached to ``runs``.
+        run_data = runs.setdefault(run_key, {})
+        results = run_data.setdefault("results", {})
+        model_dict = results.setdefault(model_name, {})
+        iteration_dict = model_dict.setdefault(iteration_key, {})
+
+        existing_item = iteration_dict.get(item_id, {})
+        if (existing_item and
+            "aggregated_score_raw" in existing_item and
+            existing_item.get("parsed_scores") and
+            len(existing_item["parsed_scores"]) >= 10 and
+            existing_item["aggregated_score_raw"] > 0.0):
+            # Already judged successfully; nothing to do.
+            return
+
+    try:
+        final_prompt = prompt_template.replace("[TEST MODEL RESPONSE]", item_text)
+        final_prompt = final_prompt.replace("[TEST MODEL RESPONSE END]", "")
+
+        messages = [{"role": "user", "content": final_prompt}]
+        # The (slow) judge call runs outside the lock so workers overlap.
+        judge_response = send_to_judge_model(messages, judge_model=judge_model)
+
+        extracted_scores = parse_scores(judge_response)
+        raw_score = compute_raw_score(extracted_scores)
+
+        with lock:
+            storage_dict = {
+                "parsed_scores": extracted_scores,
+                "timestamp": datetime.now().isoformat(),
+                "text_length": text_len
+            }
+            # No aggregated score is stored when too few scores were parsed;
+            # its absence marks the item as needing a retry.
+            if raw_score is not None:
+                storage_dict["aggregated_score_raw"] = raw_score
+            if save_raw_judge_output:
+                storage_dict["judge_response"] = judge_response
+
+            iteration_dict[item_id] = storage_dict
+            runs[run_key]["results"][model_name][iteration_key] = iteration_dict
+            save_json_file(runs, runs_file)
+
+        if raw_score is not None:
+            logging.debug(f"Processed {model_name}/{iteration_key}/{item_id}, raw score: {raw_score:.2f}")
+        else:
+            logging.warning(f"Failed to parse enough scores for {model_name}/{iteration_key}/{item_id}")
+
+    except Exception as e:
+        # Best-effort worker: record the failure so the item is retried on the
+        # next run instead of aborting the whole pool.
+        logging.error(f"Error processing item {model_name}/{iteration_key}/{item_id}: {str(e)}")
+        with lock:
+            iteration_dict[item_id] = {
+                "error": str(e),
+                "timestamp": datetime.now().isoformat()
+            }
+            runs.setdefault(run_key, {}).setdefault("errors", []).append({
+                "model": model_name,
+                "iteration": iteration_key,
+                "item_id": item_id,
+                "error": str(e)
+            })
+            save_json_file(runs, runs_file)
+
+def _iter_scored_items(iteration_data: dict):
+    """Yield (iteration_key, item_id, item_info) for each item holding a raw score."""
+    for it_key, it_val in iteration_data.items():
+        if it_key == "__model_stats__" or not isinstance(it_val, dict):
+            continue
+        for item_id, item_info in it_val.items():
+            if isinstance(item_info, dict) and "aggregated_score_raw" in item_info:
+                yield it_key, item_id, item_info
+
+
+def _compute_judgemark_for_label(run_data: dict, label: str, random_tau: float) -> float:
+    """Combine the normalized components for ``label`` into a final Judgemark score.
+
+    ``label`` is ``"raw"`` or ``"calibrated"``. Reads
+    ``run_data[f"{label}_cross_model_stats"]``, ``run_data[f"{label}_model_stats"]``
+    and ``run_data["separability_metrics"][label]``; writes
+    ``run_data[f"{label}_score_range"]`` and
+    ``run_data[f"final_judgemark_score_elements_{label}"]``, and adds the extra
+    normalized values to the cross-model ``normalized_components`` dict.
+    Returns the final score.
+    """
+    norm = run_data[f"{label}_cross_model_stats"]["normalized_components"]
+    separability_metrics = run_data["separability_metrics"][label]
+
+    emd_norm = normalize(separability_metrics["emd"]["average"], 0, 4)
+    # NOTE(review): the trailing False presumably flips the scale so that a
+    # smaller overlap scores higher -- confirm against utils.stats.normalize.
+    norm["ci99_overlap_magnitude_sum_norm"] = normalize(
+        separability_metrics["ci99_overlap_magnitude_sum"], 0, 26, False
+    )
+
+    # Range of per-model mean scores
+    model_means = [stats["mean"] for stats in run_data[f"{label}_model_stats"].values()]
+    score_range = max(model_means) - min(model_means)
+    run_data[f"{label}_score_range"] = score_range
+    range_key = f"{label}_score_range_norm"
+    norm[range_key] = normalize(score_range, 0, 8)
+
+    # Kendall's tau from the randomization-based stability measure
+    norm["kendall_tau_bootstrapped"] = normalize(random_tau, 0.4, 1.0)
+
+    # Aggregated separability metric
+    separability = (
+        norm["std_dev"] # std deviation *between* models (separability)
+        + norm["kw_stat"] # kruskal-wallis (separability)
+        + norm["ci99_overlap_magnitude_sum_norm"] # confidence interval overlap between adjacently ranked models (separability)
+        + norm[range_key] # range of assigned scores (separability)
+        + separability_metrics["modulated_ci95"] # average ci95 per model scored (score stability + separability)
+        + emd_norm # earth-movers distance (separability)
+    ) / 6.0
+
+    final_score = (
+        norm["kendall_tau_bootstrapped"] # correlation between iterations (ranking stability)
+        + norm["kendall_tau"] # correlation with lmsys arena score (corr to human pref)
+        + 4 * separability # aggregate of separability metrics
+    ) / 6.0
+
+    run_data[f"final_judgemark_score_elements_{label}"] = {
+        "norm_stability_between_iterations": norm["kendall_tau_bootstrapped"],
+        "norm_correlation_with_lmsys_arena": norm["kendall_tau"],
+        "norm_std_dev_between_models": norm["std_dev"],
+        "norm_kruskall_wallis": norm["kw_stat"],
+        "norm_ci99_adjacent_overlap": norm["ci99_overlap_magnitude_sum_norm"],
+        "norm_score_range": norm[range_key],
+        "norm_intra_model_ci95": separability_metrics["modulated_ci95"],
+        # The scalar EMD component (previously the calibrated branch stored
+        # the whole normalized-components dict here by mistake).
+        "norm_earth_movers_distance": emd_norm
+    }
+    return final_score
+
+
+def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data: dict):
+    """
+    Compute metrics for both raw and calibrated scores, including iteration
+    stability, normalized components, and detailed distributions.
+
+    Everything is written into ``runs[run_key]`` in place: score
+    distributions, the calibration config, per-item
+    ``aggregated_score_calibrated`` values, model-level and cross-model stats,
+    separability metrics, and the final scores
+    (``final_judgemark_score_raw`` from raw stats, ``final_judgemark_score``
+    from calibrated stats). Nothing is returned.
+
+    Models without any scored item are left out of all statistics. If fewer
+    than two models have scores, a warning is logged and no statistics are
+    computed, because the cross-model tests need at least two groups.
+    """
+    run_data = runs[run_key]
+    results = run_data.get("results", {})
+
+    # 1. Collect raw scores (overall and per iteration) and text lengths
+    raw_scores_by_model_all = defaultdict(list)
+    raw_scores_by_model_by_iter = defaultdict(lambda: defaultdict(list))
+    calibrated_scores_by_model_all = defaultdict(list)
+    calibrated_scores_by_model_by_iter = defaultdict(lambda: defaultdict(list))
+    lengths_by_model = {}
+
+    for model_name, iteration_data in results.items():
+        if not isinstance(iteration_data, dict):
+            continue
+
+        model_scores = []
+        scores_by_iter = defaultdict(list)
+        lengths = []
+        model_samples = samples_data.get(model_name, {}).get("samples", {})
+        for it_key, item_id, item_info in _iter_scored_items(iteration_data):
+            raw_score = item_info["aggregated_score_raw"]
+            model_scores.append(raw_score)
+            scores_by_iter[it_key].append(raw_score)
+            # Track text length for the length/score correlation
+            lengths.append(len(model_samples.get(it_key, {}).get(item_id, "")))
+
+        # Register only models that actually have scores; an empty entry
+        # would crash the per-model statistics further down.
+        if model_scores:
+            raw_scores_by_model_all[model_name] = model_scores
+            raw_scores_by_model_by_iter[model_name] = scores_by_iter
+            lengths_by_model[model_name] = lengths
+
+    if len(raw_scores_by_model_all) < 2:
+        logging.warning(
+            f"Need scored results for at least two models to compute Judgemark; "
+            f"found {len(raw_scores_by_model_all)}. Skipping final statistics."
+        )
+        return
+
+    # 2. Distribution + calibration
+    all_raw_scores = [s for scores in raw_scores_by_model_all.values() for s in scores]
+    run_data["raw_score_distribution"] = compute_detailed_distribution(all_raw_scores)
+
+    calibration_config = build_landmark_calibration_config(all_raw_scores, [0, 3, 5, 7, 10])
+    run_data["calibration_config"] = calibration_config
+
+    # Apply calibration item by item, walking the items in the same order as
+    # the collection pass so raw and calibrated lists stay aligned.
+    for model_name, iteration_data in results.items():
+        if not isinstance(iteration_data, dict):
+            continue
+        for it_key, _item_id, item_info in _iter_scored_items(iteration_data):
+            calibrated = apply_landmark_calibration(
+                item_info["aggregated_score_raw"], calibration_config
+            )
+            item_info["aggregated_score_calibrated"] = calibrated
+            calibrated_scores_by_model_all[model_name].append(calibrated)
+            calibrated_scores_by_model_by_iter[model_name][it_key].append(calibrated)
+
+    # 3. Calibrated distributions
+    all_calibrated_scores = [
+        s for scores in calibrated_scores_by_model_all.values() for s in scores
+    ]
+    run_data["calibrated_score_distribution"] = compute_detailed_distribution(all_calibrated_scores)
+
+    # 4. Model-level stats
+    run_data["raw_model_stats"] = compute_model_level_stats(raw_scores_by_model_all, lengths_by_model)
+    run_data["calibrated_model_stats"] = compute_model_level_stats(calibrated_scores_by_model_all, lengths_by_model)
+
+    # 5. Cross-model stats
+    run_data["raw_cross_model_stats"] = compute_cross_model_stats(
+        scores_by_model_all=raw_scores_by_model_all,
+        scores_by_model_by_iter=raw_scores_by_model_by_iter
+    )
+    run_data["calibrated_cross_model_stats"] = compute_cross_model_stats(
+        scores_by_model_all=calibrated_scores_by_model_all,
+        scores_by_model_by_iter=calibrated_scores_by_model_by_iter
+    )
+
+    # 6. Separability metrics
+    compute_separability_metrics(run_data, raw_scores_by_model_all, label="raw")
+    compute_separability_metrics(run_data, calibrated_scores_by_model_all, label="calibrated")
+
+    # 7. Iteration stability for raw & calibrated
+    compute_iteration_stability(run_data, label="raw")
+    compute_iteration_stability(run_data, label="calibrated")
+    random_tau_raw = compute_randomized_iteration_rank_stability_by_item(run_data, label="raw", n_shuffles=1000)
+    random_tau_cal = compute_randomized_iteration_rank_stability_by_item(run_data, label="calibrated", n_shuffles=1000)
+    logging.info("Score stability (RAW)")
+    logging.info(f"Randomized average Kendall's tau (raw): {random_tau_raw:.3f}")
+    logging.info("Score stability (CALIBRATED)")
+    logging.info(f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} "
+                 f"({run_data['calibrated_cross_model_stats']['kendall_tau']})")
+
+    # 8. Final Judgemark scores (one from raw stats, one from calibrated)
+    final_score_raw = _compute_judgemark_for_label(run_data, "raw", random_tau_raw)
+    run_data["final_judgemark_score_raw"] = final_score_raw
+
+    final_score_calibrated = _compute_judgemark_for_label(run_data, "calibrated", random_tau_cal)
+    run_data["final_judgemark_score"] = final_score_calibrated
+
+    # 9. Create visualizations + logs
+    create_side_by_side_score_charts(run_data, run_data["judge_model"], samples_data)
+
+    log_score_summary(
+        "RAW SCORES",
+        run_data["raw_cross_model_stats"],
+        run_data["raw_model_stats"]
+    )
+    log_score_summary(
+        "CALIBRATED SCORES",
+        run_data["calibrated_cross_model_stats"],
+        run_data["calibrated_model_stats"]
+    )
+
+    logging.info(f"Final Judgemark (raw)   = {final_score_raw:.3f}")
+    logging.info(f"Final Judgemark (cal)  = {final_score_calibrated:.3f}")
+
+
+def sanitize_model_name(name: str) -> str:
+    """Return ``name`` made safe for use inside a run key.
+
+    Each run of characters other than ASCII letters, digits, ``_`` and ``-``
+    is collapsed into a single underscore.
+    """
+    disallowed_run = r'[^a-zA-Z0-9_-]+'
+    return re.sub(disallowed_run, '_', name)
+
+def run_judgemark_v2(
+    judge_model: str,
+    samples_file: str,
+    prompts_file: str,
+    runs_file: str,
+    num_threads: int,
+    run_id: str = None,
+    save_raw_judge_output: bool = False
+) -> str:
+    """Run (or resume) a Judgemark-v2 evaluation and return its run key.
+
+    The run key is ``"<run_id>__<sanitized judge model>"``; a random UUID is
+    used when ``run_id`` is not given. Every sample in ``samples_file`` that
+    has no complete result yet under that run key is judged on a thread pool
+    of ``num_threads`` workers. Unless the run was interrupted, final
+    statistics are computed afterwards. The runs file is saved at the end in
+    all cases.
+
+    Args:
+        judge_model: Identifier of the judge model.
+        samples_file: JSON file with the samples to judge, keyed by model.
+        prompts_file: JSON file mapping item ids to judge prompt templates.
+        runs_file: JSON file holding all runs; read at start, updated in place.
+        num_threads: Number of worker threads.
+        run_id: Optional run identifier; reuse one to resume a previous run.
+        save_raw_judge_output: Store the raw judge replies alongside scores.
+
+    Returns:
+        The run key under which results are stored in ``runs_file``.
+    """
+    global executor, should_exit
+
+    logging.info(f"Starting Judgemark-v2 using judge model: {judge_model}")
+    runs = load_json_file(runs_file)
+
+    # Form the run key using run_id + "__" + sanitized judge model
+    sanitized_jm = sanitize_model_name(judge_model)
+    base_id = run_id if run_id else str(uuid.uuid4())
+    run_key = f"{base_id}__{sanitized_jm}"
+
+    # Load data files
+    samples_data = load_json_file(samples_file)
+    judge_prompts = load_json_file(prompts_file)
+
+    # Initialize or get existing run data
+    if run_key not in runs:
+        runs[run_key] = {
+            "judge_model": judge_model,
+            "start_time": datetime.now().isoformat(),
+            "status": "running",
+            "samples_file": samples_file,
+            "prompts_file": prompts_file,
+            "results": {}
+        }
+        save_json_file(runs, runs_file)
+
+    run_data = runs[run_key]
+    # A fresh run has empty results, so the scan below selects every item; a
+    # stored run lacking the key is repaired here so workers can write to it.
+    results = run_data.setdefault("results", {})
+
+    # Scan all possible items and queue those without a complete result
+    items_to_process = []
+    for model_name, model_info in samples_data.items():
+        samples_dict = model_info.get("samples", {})
+        for iteration_key, iteration_items in samples_dict.items():
+            for item_id, item_text in iteration_items.items():
+                existing_result = (results.get(model_name, {})
+                                        .get(iteration_key, {})
+                                        .get(item_id, {}))
+
+                needs_retry = (
+                    not existing_result or
+                    not existing_result.get("parsed_scores") or
+                    len(existing_result.get("parsed_scores", {})) < 10 or
+                    existing_result.get("aggregated_score_raw", 0.0) == 0.0 or
+                    "error" in existing_result
+                )
+
+                if needs_retry:
+                    items_to_process.append({
+                        "model_name": model_name,
+                        "iteration_key": iteration_key,
+                        "item_id": item_id,
+                        "item_text": item_text,
+                        "prompt_template": judge_prompts.get(item_id, "")
+                    })
+
+    if items_to_process:
+        logging.info(f"Found {len(items_to_process)} items to process in existing run {run_key}")
+    else:
+        logging.info(f"No items to process in existing run {run_key}")
+
+    # Lock shared by all workers for access to ``runs`` and the runs file
+    lock = threading.Lock()
+
+    # The stability test is switched off in this version; the call is kept so
+    # it can be re-enabled by flipping this flag.
+    stability_test_enabled = False
+
+    all_futures = []
+    try:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as exec_:
+            # Published as a module global, presumably so a signal handler
+            # can reach the pool -- verify against utils.state.
+            executor = exec_
+
+            if items_to_process:
+                for item in items_to_process:
+                    if should_exit:
+                        break
+
+                    all_futures.append(
+                        executor.submit(
+                            process_sample,
+                            item["model_name"],
+                            item["iteration_key"],
+                            item["item_id"],
+                            item["item_text"],
+                            item["prompt_template"],
+                            run_key,
+                            runs,
+                            runs_file,
+                            lock,
+                            judge_model,
+                            save_raw_judge_output
+                        )
+                    )
+
+                # Display progress bar for tasks
+                for f in tqdm(concurrent.futures.as_completed(all_futures),
+                              total=len(all_futures), desc="Judging", leave=True):
+                    if should_exit:
+                        break
+                    try:
+                        f.result()
+                    except Exception as exc:
+                        logging.error(f"Exception in worker thread: {exc}")
+
+    except KeyboardInterrupt:
+        logging.warning("KeyboardInterrupt caught in main thread.")
+        should_exit = True
+        time.sleep(0.1)
+    finally:
+        # Mark run as interrupted or completed
+        status = "interrupted" if should_exit else "completed"
+        runs[run_key]["status"] = status
+        runs[run_key]["end_time"] = datetime.now().isoformat()
+
+        try:
+            if not should_exit:
+                if stability_test_enabled:
+                    run_stability_test(
+                        run_data, judge_model,
+                        judge_prompts, samples_data,
+                        runs, runs_file,
+                        lock, num_threads
+                    )
+                # Compute final stats
+                finalize_scores_and_compute_judgemark(runs, run_key, samples_data)
+        finally:
+            # Save and clean up even if the statistics step raised, so judged
+            # results already collected are not lost.
+            save_json_file(runs, runs_file)
+
+            if executor:
+                logging.info("Shutting down executor")
+                executor.shutdown(wait=False)
+                executor = None
+
+    logging.info(f"Judgemark-v2 run {run_key} ended with status: {status}")
+    return run_key
@@ -0,0 +1,275 @@
+import math
+import statistics
+import numpy as np
+import scipy.stats
+import logging
+from typing import Dict, List
+import re
+from config.constants import REFERENCE_MODEL_SCORES
+from utils.stats import normalize
+
+def parse_scores(judge_model_response: str) -> Dict[str,float]:
+    """
+    Pull every "<metric name>: <number>" pair out of the judge's reply.
+
+    The metric name is the text preceding the colon on the same line, with
+    surrounding whitespace stripped. An optional literal "Score " may sit
+    between the colon and the number. The number may be a negative or
+    positive integer or decimal and is returned as a float.
+
+    Examples of fragments that match:
+      "Realism Score: 7.5"
+      "Melodramatic: 2"
+      "Depth: Score 4"
+
+    If a metric name occurs more than once, the last value wins. A reply
+    with no matches yields an empty dict.
+    """
+    pattern = r'(.*?):\s*(?:Score\s+)?(-?\d+(?:\.\d+)?)'
+    return {
+        found.group(1).strip(): float(found.group(2))
+        for found in re.finditer(pattern, judge_model_response)
+    }
+
+def compute_raw_score(scores: Dict[str,float]) -> float:
+    """
+    Collapse a {criterion: score} dict into one aggregate score.
+
+    Only scores within [0, 10] are considered. Criteria whose lower-cased name
+    contains a negative marker count as (10 - score); all others count as-is.
+    The result is the mean of those values rounded to 2 decimals.
+
+    Returns None when fewer than 10 in-range scores are available.
+    """
+    in_range = {name: value for name, value in scores.items() if 0 <= value <= 10}
+    if len(in_range) < 10:
+        return None
+
+    negative_markers = [
+        "melodramatic", "shallow resolution", "unearned resolution",
+        "simplistic moralizing", "shallow optimism", "forced optimism",
+        "trite", "overwrought", "amateurish", "contrived", "uninspiring",
+        "characters are too good", "incongruent ending positivity",
+        "unearned transformations", "profundity over-reach",
+        "amateurish descriptives", "clunky asides", "stilted dialogue",
+        "tit-for-tat dialogue", "purple prose", "uncreative", "tell-don't-show",
+        "weak dialogue", "meandering"
+    ]
+
+    total = 0.0
+    for name, value in in_range.items():
+        lowered = name.lower().strip()
+        is_negative = any(marker in lowered for marker in negative_markers)
+        # Negative criteria are "lower is better", so flip them on the 0-10 scale.
+        total += (10 - value) if is_negative else value
+
+    return round(total / len(in_range), 2)
+
+def confidence_interval_95(data: List[float]) -> float:
+    """
+    Return the half-width of the 95% confidence interval for the mean.
+
+    Uses the normal approximation 1.96 * (s / sqrt(n)), where s is the sample
+    standard deviation; the approximation is intended for n > 30 or so.
+    Returns 0.0 when fewer than two data points are given, since a sample
+    standard deviation is undefined there.
+    """
+    n = len(data)
+    if n < 2:
+        return 0.0
+    return 1.96 * (statistics.stdev(data) / math.sqrt(n))
+
+def compute_detailed_distribution(scores):
+    """
+    Summarise a list of scores: count, min, max, mean, median, sample
+    standard deviation (0.0 for a single score) and the 10th/25th/75th/90th
+    percentiles. Every value except the count is rounded to 3 decimals.
+    Returns an empty dict for empty input.
+    """
+    if not scores:
+        return {}
+
+    def r3(value):
+        return round(value, 3)
+
+    spread = statistics.stdev(scores) if len(scores) > 1 else 0.0
+    summary = {
+        "count": len(scores),
+        "min": r3(min(scores)),
+        "max": r3(max(scores)),
+        "mean": r3(statistics.mean(scores)),
+        "median": r3(statistics.median(scores)),
+        "stdev": r3(spread),
+    }
+    for pct in (10, 25, 75, 90):
+        summary[f"p{pct}"] = r3(float(np.percentile(scores, pct)))
+    return summary
+
+def compute_model_level_stats(scores_by_model, lengths_by_model):
+    """
+    Compute per-model summary statistics.
+
+    Args:
+        scores_by_model: mapping of model name -> list of scores.
+        lengths_by_model: mapping of model name -> list of text lengths,
+            expected to be aligned one-to-one with that model's scores.
+
+    Returns:
+        Mapping of model name -> dict with count, mean, median, stdev (0.0
+        for a single score), ci95, min and max. "length_correlation"
+        (Pearson r between lengths and scores) is added only when lengths
+        and scores have the same size and there are at least two points.
+        Models with no scores are omitted.
+    """
+    model_stats = {}
+    for model_name, scores in scores_by_model.items():
+        if not scores:
+            # mean/min/max are undefined for an empty sample
+            continue
+
+        lengths = lengths_by_model.get(model_name, [])
+        stats = {
+            "count": len(scores),
+            "mean": statistics.mean(scores),
+            "median": statistics.median(scores),
+            "stdev": statistics.stdev(scores) if len(scores) > 1 else 0.0,
+            "ci95": confidence_interval_95(scores),
+            "min": min(scores),
+            "max": max(scores)
+        }
+
+        # Length correlation; pearsonr needs at least two points
+        if len(scores) >= 2 and len(lengths) == len(scores):
+            corr, _ = scipy.stats.pearsonr(lengths, scores)
+            stats["length_correlation"] = corr
+
+        model_stats[model_name] = stats
+    return model_stats
+
+def compute_cross_model_stats(scores_by_model_all, scores_by_model_by_iter):
+    """
+    Compute statistics across models.
+
+    One-way ANOVA, Kruskal-Wallis and the population standard deviation of
+    the model means use every score of every model. Pearson and Kendall
+    correlations against REFERENCE_MODEL_SCORES are computed once per
+    iteration (on each model's mean score within that iteration) and then
+    averaged; an iteration with fewer than two reference-listed models
+    contributes 0.0 to both averages.
+
+    Returns a dict with the plain statistics plus a "normalized_components"
+    sub-dict holding their normalized counterparts.
+    """
+    # Tests over the complete score sets
+    groups = list(scores_by_model_all.values())
+    f_stat, f_p = scipy.stats.f_oneway(*groups)
+    kw_stat, kw_p = scipy.stats.kruskal(*groups)
+
+    # Spread of the per-model means
+    std_across_models = statistics.pstdev([statistics.mean(group) for group in groups])
+
+    # Union of iteration keys seen for any model
+    all_iterations = set()
+    for per_iter in scores_by_model_by_iter.values():
+        all_iterations.update(per_iter.keys())
+
+    pearsons = []
+    kendalls = []
+    for it_key in all_iterations:
+        # (iteration mean, reference score) for each model that has scores
+        # in this iteration and appears in the reference table
+        pairs = [
+            (statistics.mean(per_iter[it_key]), REFERENCE_MODEL_SCORES[model])
+            for model, per_iter in scores_by_model_by_iter.items()
+            if it_key in per_iter
+            and len(per_iter[it_key]) > 0
+            and model in REFERENCE_MODEL_SCORES
+        ]
+
+        if len(pairs) >= 2:
+            means, refs = zip(*pairs)
+            p_r, _ = scipy.stats.pearsonr(means, refs)
+            k_tau, _ = scipy.stats.kendalltau(means, refs)
+        else:
+            p_r, k_tau = (0.0, 0.0)
+
+        pearsons.append(p_r)
+        kendalls.append(k_tau)
+
+    # Final correlation = average of the per-iteration correlations
+    pearson_r = statistics.mean(pearsons) if pearsons else 0.0
+    kendall_tau = statistics.mean(kendalls) if kendalls else 0.0
+
+    normalized = {
+        "pearson_r": normalize(pearson_r, 0.7, 1.0),
+        "kendall_tau": normalize(kendall_tau, 0.1, 1.0),
+        "anova_f": normalize(f_stat, 0.0, 350.0),
+        "kw_stat": normalize(kw_stat, 0.0, 1500.0),
+        "std_dev": normalize(std_across_models, 0.0, 2.2)
+    }
+    return {
+        "anova_f": f_stat,
+        "anova_p": f_p,
+        "kw_stat": kw_stat,
+        "kw_p": kw_p,
+        "std_dev_across_models": std_across_models,
+        "pearson_r": pearson_r,
+        "kendall_tau": kendall_tau,
+        "normalized_components": normalized
+    }
+
+def build_landmark_calibration_config(scores, desired_points=None):
+    """
+    Creates a piecewise-linear calibration from these raw distribution 
+    landmarks: [min, Q1, median, Q3, max]
+    to the given desired_points, e.g. [0, 3, 5, 7, 10].
+    Returns a dict describing how to transform future scores.
+    """
+    if not scores or len(scores) < 2:
+        # Degenerate case: no meaningful distribution
+        return {
+            "method": "piecewise_landmark",
+            "in_landmarks": [],
+            "out_landmarks": []
+        }
+
+    if desired_points is None:
+        desired_points = [0, 3, 5, 7, 10]
+
+    in_min = min(scores)
+    in_q1 = float(np.percentile(scores, 25))
+    in_med = float(statistics.median(scores))
+    in_q3 = float(np.percentile(scores, 75))
+    in_max = max(scores)
+
+    return {
+        "method": "piecewise_landmark",
+        "in_landmarks": [in_min, in_q1, in_med, in_q3, in_max],
+        "out_landmarks": desired_points
+    }
+
+def apply_landmark_calibration(x, config):
+    """
+    Apply the piecewise-linear transform defined by config:
+      "in_landmarks" = [minVal, q1Val, medVal, q3Val, maxVal]
+      "out_landmarks" = [outMin, outQ1, outMed, outQ3, outMax].
+    If x is < min or > max, we extrapolate linearly beyond that segment.
+    """
+    inL = config.get("in_landmarks", [])
+    outL = config.get("out_landmarks", [])
+    if len(inL) != 5 or len(outL) != 5:
+        # Invalid or degenerate config => just return x unchanged
+        return x
+
+    in_min, in_q1, in_med, in_q3, in_max = inL
+    out_min, out_q1, out_med, out_q3, out_max = outL
+
+    def linear_map(val, old_lo, old_hi, new_lo, new_hi):
+        if abs(old_hi - old_lo) < 1e-12:
+            return new_lo
+        frac = (val - old_lo) / (old_hi - old_lo)
+        return new_lo + frac * (new_hi - new_lo)
+
+    # Determine which segment x belongs to:
+    if x <= in_q1:
+        # (in_min -> in_q1) -> (out_min -> out_q1), but possibly x < in_min => extrapolate
+        return linear_map(x, in_min, in_q1, out_min, out_q1)
+    elif x <= in_med:
+        return linear_map(x, in_q1, in_med, out_q1, out_med)
+    elif x <= in_q3:
+        return linear_map(x, in_med, in_q3, out_med, out_q3)
+    else:
+        # (in_q3 -> in_max) -> (out_q3 -> out_max), possibly x > in_max => extrapolate
+        return linear_map(x, in_q3, in_max, out_q3, out_max)
+
+def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict):
+    """Log a readable summary of score statistics."""
+    logging.info(f"\n------- {score_type} Summary -------")
+    logging.info(f"ANOVA F-value: {cross_stats['anova_f']:.4f}, p={cross_stats['anova_p']:.4f}")
+    logging.info(f"Kruskal-Wallis: {cross_stats['kw_stat']:.4f}, p={cross_stats['kw_p']:.4f}")
+    logging.info(f"Pearson r={cross_stats['pearson_r']:.4f}")
+    logging.info(f"Kendall τ={cross_stats['kendall_tau']:.4f}")
+    logging.info(f"Std.Dev across models: {cross_stats['std_dev_across_models']:.4f}")
+    
+    logging.info("\nModel Scores:")
+    sorted_models = sorted(
+        model_stats.items(),
+        key=lambda kv: kv[1]["mean"],
+        reverse=True
+    )
+    for model, stats in sorted_models:
+        line = f"{model:.<40} {stats['mean']:.3f} ±{stats['ci95']:.3f}"
+        logging.info(line)
+    logging.info("------------------------------------")
@@ -0,0 +1,233 @@
+
+import logging
+import math
+import statistics
+import numpy as np
+import scipy.stats
+from typing import Dict, List
+from utils.stats import normalize
+
+try:
+    from scipy.stats import wasserstein_distance
+    HAS_WASSERSTEIN = True
+except ImportError:
+    HAS_WASSERSTEIN = False
+
+def cohen_d(scores1: List[float], scores2: List[float]) -> float:
+    """
+    Compute Cohen's d for two sets of scores.
+    d = (mean2 - mean1) / pooled_stdev
+    """
+    if len(scores1) < 2 or len(scores2) < 2:
+        return 0.0
+    mean1, mean2 = statistics.mean(scores1), statistics.mean(scores2)
+    var1, var2 = statistics.pvariance(scores1), statistics.pvariance(scores2)
+    n1, n2 = len(scores1), len(scores2)
+    pooled_var = ((n1 - 1)*var1 + (n2 - 1)*var2) / (n1 + n2 - 2)
+    if pooled_var <= 1e-12:
+        return 0.0
+    d = (mean2 - mean1) / math.sqrt(pooled_var)
+    return d
+
+def ci_interval(scores: List[float], ci_level=0.99) -> tuple[float, float]:
+    """
+    Compute mean ± z*(stdev/sqrt(n)) for the specified CI level.
+    Returns (low, high).
+    """
+    if len(scores) < 2:
+        # trivial or empty
+        mean_ = statistics.mean(scores) if len(scores) == 1 else 0.0
+        return (mean_, mean_)
+    mean_ = statistics.mean(scores)
+    stdev_ = statistics.stdev(scores)
+    n = len(scores)
+    z = scipy.stats.norm.ppf(0.5 + ci_level/2.0)  # ~2.575 for 99% CI
+    half_width = z * (stdev_ / math.sqrt(n))
+    return (mean_ - half_width, mean_ + half_width)
+
+def ci_intervals_overlap(ci1: tuple[float, float], ci2: tuple[float, float]) -> bool:
+    """
+    Returns True if two confidence intervals overlap.
+    """
+    return not (ci1[1] < ci2[0] or ci2[1] < ci1[0])
+
+def compute_distributions_distance(scores_by_model: Dict[str, List[float]]):
+    """
+    Example EMD computation across all pairs, if you still want it.
+    Returns an average distance plus each pair's distance.
+    """
+    models = list(scores_by_model.keys())
+    distances = {}
+    sum_dist = 0.0
+    pair_count = 0
+
+    for i in range(len(models)):
+        for j in range(i+1, len(models)):
+            mA, mB = models[i], models[j]
+            d = -1.0
+            if HAS_WASSERSTEIN and scores_by_model[mA] and scores_by_model[mB]:
+                d = wasserstein_distance(scores_by_model[mA], scores_by_model[mB])
+            distances[f"{mA}__{mB}"] = d
+            if d >= 0.0:
+                sum_dist += d
+            pair_count += 1
+
+    avg_dist = (sum_dist / pair_count) if pair_count else 0.0
+    return {
+        "average": avg_dist,
+        "pairs": distances
+    }
+
+def compute_average_ci95(model_scores: Dict[str, List[float]]) -> float:
+    """
+    Compute the average 95% CI half-width across models.
+    """
+    if not model_scores:
+        return 0.0
+    half_widths = []
+    z95 = 1.96
+    for scores in model_scores.values():
+        if len(scores) < 2:
+            half_widths.append(0.0)
+            continue
+        stdev_ = statistics.stdev(scores)
+        mean_ = statistics.mean(scores)
+        n = len(scores)
+        hw = z95 * (stdev_ / math.sqrt(n))
+        half_widths.append(hw)
+    return statistics.mean(half_widths) if half_widths else 0.0
+
+def compute_separability_metrics(
+    run_data: dict,
+    scores_by_model: Dict[str, List[float]],
+    label: str = "raw",
+    scale_factor: float = 1.5,
+) -> None:
+    """
+    Compute a few custom “separability” metrics:
+     • 99% CI overlap only for adjacent models (fraction)
+     • The *magnitude* of 99% CI overlap between adjacent models (with optional scaling of intervals)
+     • Single summary measure of Cohen’s d (e.g., average of absolute values)
+     • EMD across pairs (optional)
+     • Weighted or “modulated” metric for average CI95
+
+    Args:
+        run_data: A dictionary to store results.
+        scores_by_model: Dict of model -> list of scores.
+        label: String label for grouping these results in run_data.
+        scale_factor: If >1.0, intervals are expanded by that factor when
+                      computing overlap magnitude. For example, 1.5 means
+                      you increase the half-width of each interval by 50%.
+    """
+    if "separability_metrics" not in run_data:
+        run_data["separability_metrics"] = {}
+    run_data["separability_metrics"][label] = {}
+
+    # ----------------------------------------------------------------
+    # 1) Basic stats: model means + 99% CI
+    # ----------------------------------------------------------------
+    model_means = {}
+    model_ci99 = {}
+    for m, sc in scores_by_model.items():
+        if sc:
+            model_means[m] = statistics.mean(sc)
+            model_ci99[m] = ci_interval(sc, ci_level=0.99)
+        else:
+            model_means[m] = 0.0
+            model_ci99[m] = (0.0, 0.0)
+
+    # Sort models by mean descending
+    models_sorted = sorted(model_means.keys(), key=lambda x: model_means[x], reverse=True)
+
+    # ----------------------------------------------------------------
+    # 2) Original “adjacent overlap fraction” (no scaling)
+    # ----------------------------------------------------------------
+    adjacent_overlap = {}
+    overlap_count = 0
+    for i in range(len(models_sorted) - 1):
+        mA, mB = models_sorted[i], models_sorted[i + 1]
+        overlap = ci_intervals_overlap(model_ci99[mA], model_ci99[mB])
+        adjacent_overlap[f"{mA}__{mB}"] = overlap
+        if overlap:
+            overlap_count += 1
+
+    adj_frac_overlap = overlap_count / (len(models_sorted) - 1) if len(models_sorted) > 1 else 0.0
+
+    # ----------------------------------------------------------------
+    # 3) “Magnitude” of 99% CI overlap between adjacent models
+    #    with optional scaling factor
+    # ----------------------------------------------------------------
+    def scale_interval(ci: tuple[float, float], factor: float) -> tuple[float, float]:
+        """
+        Given an interval (low, high), expand it about its midpoint by 'factor'.
+        E.g. if factor=1.5, the half-width becomes 1.5 * (original half-width).
+        """
+        low, high = ci
+        mid = (low + high) / 2.0
+        half_width = (high - low) / 2.0
+        new_half = factor * half_width
+        return (mid - new_half, mid + new_half)
+
+    def interval_overlap(ciA: tuple[float, float], ciB: tuple[float, float]) -> float:
+        """Return the length of the overlap between two intervals."""
+        return max(0.0, min(ciA[1], ciB[1]) - max(ciA[0], ciB[0]))
+
+    adjacent_overlap_magnitude = {}
+    sum_overlap_magnitude = 0.0
+    for i in range(len(models_sorted) - 1):
+        mA, mB = models_sorted[i], models_sorted[i + 1]
+        # Scale each interval before computing overlap
+        scaledA = scale_interval(model_ci99[mA], scale_factor)
+        scaledB = scale_interval(model_ci99[mB], scale_factor)
+        overlap_mag = interval_overlap(scaledA, scaledB)
+        adjacent_overlap_magnitude[f"{mA}__{mB}"] = overlap_mag
+        sum_overlap_magnitude += overlap_mag
+
+    # ----------------------------------------------------------------
+    # 4) Single measure for Cohen’s d (average of absolute Cohen’s d across adjacent pairs)
+    # ----------------------------------------------------------------
+    d_vals = []
+    for i in range(len(models_sorted) - 1):
+        mA, mB = models_sorted[i], models_sorted[i + 1]
+        d_val = cohen_d(scores_by_model[mA], scores_by_model[mB])
+        d_vals.append(abs(d_val))
+    avg_cohens_d = sum(d_vals) / len(d_vals) if d_vals else 0.0
+
+    # ----------------------------------------------------------------
+    # 5) Optional EMD across all pairs
+    # ----------------------------------------------------------------
+    emd_data = compute_distributions_distance(scores_by_model)
+
+    # ----------------------------------------------------------------
+    # 6) Weighted or modulated average CI95
+    # ----------------------------------------------------------------
+    avg_ci95 = compute_average_ci95(scores_by_model)
+    norm_ci95 = normalize(avg_ci95, 0.15, 0.45, False)
+    norm_cohens_d = normalize(avg_cohens_d, 0, 0.4)
+    modulated_ci95 = norm_ci95 * norm_cohens_d
+
+    # ----------------------------------------------------------------
+    # Store or log results
+    # ----------------------------------------------------------------
+    metrics_label = run_data["separability_metrics"][label]
+    metrics_label["ci99_overlap_adjacent"] = adjacent_overlap
+    metrics_label["adjacent_overlap_fraction"] = adj_frac_overlap
+
+    # New overlap magnitude stats (with scaling)
+    metrics_label["ci99_overlap_magnitude_adjacent"] = adjacent_overlap_magnitude
+    metrics_label["ci99_overlap_magnitude_sum"] = sum_overlap_magnitude
+    metrics_label["ci99_overlap_scale_factor"] = scale_factor
+
+    metrics_label["average_cohens_d_adjacent"] = avg_cohens_d
+    metrics_label["emd"] = emd_data
+    metrics_label["average_ci95"] = avg_ci95
+    metrics_label["modulated_ci95"] = modulated_ci95
+
+    # Logging summary
+    logging.info(f"\n--- {label.upper()} SEPARABILITY METRICS ---")
+    logging.info(f"Adjacent 99% CI Overlap fraction: {adj_frac_overlap:.3f}")
+    logging.info(f"Sum of adjacent 99% CI Overlap magnitude (scale={scale_factor}): "
+                 f"{sum_overlap_magnitude:.3f}")
+    logging.info(f"Avg. |Cohen's d| for adjacent pairs: {avg_cohens_d:.3f}")
+    logging.info(f"Average EMD across all pairs: {emd_data['average']:.3f}")
+    logging.info(f"Avg. CI95 half-width: {avg_ci95:.3f} (modulated: {modulated_ci95:.3f})")
@@ -0,0 +1,448 @@
+import logging
+import concurrent.futures
+from concurrent.futures import ThreadPoolExecutor
+from tqdm import tqdm
+from typing import Dict
+
+from utils.api import send_to_judge_model
+from utils.file_io import save_json_file
+from core.scoring import parse_scores, compute_raw_score
+from config.constants import STABILITY_ITEMS, STABILITY_REPS
+from utils.state import should_exit, executor
+from collections import defaultdict
+import statistics
+import math
+import scipy.stats
+import random
+import statistics
+
+def extract_model_item_scores(run_data: dict, label: str = "raw"):
+    """
+    Collect a dictionary:
+      model_item_scores[model][item_id] = { iteration_key: score }
+    for either aggregated_score_raw or aggregated_score_calibrated (controlled by 'label').
+
+    Also returns a sorted list of iteration_keys found in the data.
+    We assume (model, item_id) either has exactly one score per iteration or is skipped.
+    """
+    results = run_data.get("results", {})
+    model_item_scores = defaultdict(lambda: defaultdict(dict))
+    all_iteration_keys = set()
+
+    for model_name, model_dict in results.items():
+        if not isinstance(model_dict, dict):
+            continue
+        for it_key, item_dict in model_dict.items():
+            if not isinstance(item_dict, dict):
+                continue
+            all_iteration_keys.add(it_key)
+            for item_id, item_info in item_dict.items():
+                if not isinstance(item_info, dict):
+                    continue
+                if label == "raw":
+                    score_val = item_info.get("aggregated_score_raw", None)
+                else:
+                    score_val = item_info.get("aggregated_score_calibrated", None)
+                
+                if isinstance(score_val, (int, float)):
+                    model_item_scores[model_name][item_id][it_key] = score_val
+    
+    sorted_iteration_keys = sorted(list(all_iteration_keys))
+    return model_item_scores, sorted_iteration_keys
+
+
+def shuffle_iteration_assignments_by_item(
+    model_item_scores, 
+    iteration_keys
+):
+    """
+    For each model, for each item_id, we have a dict of {it_key: score}, one entry per iteration.
+    We do a random permutation among iteration_keys so that:
+
+      For k in range(len(iteration_keys)):
+        * The score originally from iteration_keys[perm[k]] is assigned to iteration_keys[k].
+
+    This ensures each iteration_key gets exactly one sample for each item_id,
+    but “which original iteration’s score” is placed there is random.
+
+    Returns: iteration_averages = { new_it_key: { model: [scores...] } }
+
+    We'll later convert those lists to mean(...) for each model.
+    """
+    # structure to gather new assignment
+    # iteration_averages[new_it_key][model] = list of assigned scores
+    from collections import defaultdict
+    iteration_averages = defaultdict(lambda: defaultdict(list))
+    n_iters = len(iteration_keys)
+    
+    for model_name, item_dict in model_item_scores.items():
+        for item_id, it_score_map in item_dict.items():
+            # We expect one score for each iteration key
+            # If item_id is missing an iteration, that item won't be shuffled properly.
+            # We can skip or handle partial if needed. Here, assume it's complete:
+            if len(it_score_map) < n_iters:
+                # skip incomplete
+                continue
+            
+            # Build an array of scores in sorted iteration_keys order
+            # so scores[i] corresponds to iteration_keys[i] in the original
+            scores_in_order = [it_score_map[k] for k in iteration_keys]
+            
+            # random permutation of [0..n_iters-1]
+            permutation = random.sample(range(n_iters), k=n_iters)
+            
+            # Now assign scores_in_order[permutation[k]] to iteration_keys[k].
+            for k in range(n_iters):
+                new_it_key = iteration_keys[k]
+                old_index = permutation[k]
+                assigned_score = scores_in_order[old_index]
+                iteration_averages[new_it_key][model_name].append(assigned_score)
+    
+    return iteration_averages
+
+
+def compute_average_iteration_averages(iteration_averages):
+    """
+    iteration_averages is { new_it_key: { model_name: [scores...] }}
+    Convert that to { new_it_key: { model_name: mean_score }}.
+    """
+    final_avg = {}
+    for it_key, model_dict in iteration_averages.items():
+        final_avg[it_key] = {}
+        for m, score_list in model_dict.items():
+            if score_list:
+                final_avg[it_key][m] = statistics.mean(score_list)
+            else:
+                final_avg[it_key][m] = 0.0
+    return final_avg
+
+
+def compute_kendall_tau_for_iterations(iteration_averages):
+    """
+    iteration_averages: { it_key: { model_name: avg_score } }
+    We'll build rank lists, then compare each pair of iteration_keys for Kendall’s τ.
+    Returns average τ across iteration pairs, or 0.0 if not enough data.
+    """
+    all_iter_keys = sorted(iteration_averages.keys())
+    
+    # Build rank ordering
+    iteration_rank = {}
+    for it_key in all_iter_keys:
+        items = list(iteration_averages[it_key].items())  # (model, avg_score)
+        # sort descending by score
+        items.sort(key=lambda x: x[1], reverse=True)
+        iteration_rank[it_key] = [m for (m, sc) in items]
+    
+    pairwise_taus = []
+    for i in range(len(all_iter_keys)):
+        for j in range(i+1, len(all_iter_keys)):
+            ik = all_iter_keys[i]
+            jk = all_iter_keys[j]
+            rank_i = iteration_rank[ik] 
+            rank_j = iteration_rank[jk]
+            common_models = set(rank_i).intersection(rank_j)
+            if len(common_models) < 2:
+                continue
+            pos_i = {m: rank_i.index(m) for m in common_models}
+            pos_j = {m: rank_j.index(m) for m in common_models}
+            x = []
+            y = []
+            for m in common_models:
+                x.append(pos_i[m])
+                y.append(pos_j[m])
+            if len(x) > 1:
+                tau, pval = scipy.stats.kendalltau(x, y)
+                if not math.isnan(tau):
+                    pairwise_taus.append(tau)
+    if pairwise_taus:
+        return statistics.mean(pairwise_taus)
+    else:
+        return 0.0
+
+
+def compute_randomized_iteration_rank_stability_by_item(
+    run_data: dict,
+    label: str = "raw",
+    n_shuffles: int = 1000
+) -> float:
+    """
+    For each item_id in each model, we shuffle which iteration key 
+    receives that item’s original score (a random permutation among 
+    the iteration_keys).
+    
+    This ensures that each iteration_key still has exactly one sample for each item
+    (thus preserving the structure of “N iterations, M items each”), 
+    but randomizes *which original iteration’s score* ended up in each iteration.
+    
+    Then we compute the average Kendall's τ across iteration pairs 
+    (ranking stability). We repeat n_shuffles times, returning the 
+    average of those τ values.
+    
+    We'll store the result in:
+      run_data["iteration_stability"][label]["randomized_average_kendall_tau_by_item"].
+    
+    This approach is more efficient than reconstructing big data structures 
+    or re-running a large function every time.
+    """
+    model_item_scores, iteration_keys = extract_model_item_scores(run_data, label=label)
+    
+    if len(iteration_keys) < 2:
+        return 0.0  # no real iteration comparison
+    
+    tau_values = []
+    for _ in range(n_shuffles):
+        # Step 1: Randomly assign each item’s scores to iteration_keys
+        iteration_averages_shuffled = shuffle_iteration_assignments_by_item(model_item_scores, iteration_keys)
+        # Step 2: Convert from [list of scores] to mean
+        final_avg = compute_average_iteration_averages(iteration_averages_shuffled)
+        # Step 3: Compute rank correlation across iteration_keys
+        tau = compute_kendall_tau_for_iterations(final_avg)
+        tau_values.append(tau)
+    
+    if tau_values:
+        randomized_avg_tau = statistics.mean(tau_values)
+    else:
+        randomized_avg_tau = 0.0
+    
+    # Store in run_data
+    if "iteration_stability" not in run_data:
+        run_data["iteration_stability"] = {}
+    if label not in run_data["iteration_stability"]:
+        run_data["iteration_stability"][label] = {}
+    run_data["iteration_stability"][label]["randomized_average_kendall_tau_by_item"] = randomized_avg_tau
+    
+    return randomized_avg_tau
+
+
+def compute_iteration_stability(run_data: dict, label="raw"):
+    """
+    Example function that:
+      1) Gathers iteration-average scores for each model.
+      2) Computes 'scoring stability' across iterations.
+      3) Computes 'ranking stability' across iterations.
+      4) Stores results in run_data["iteration_stability"][label].
+      
+    label could be "raw" or "calibrated"—adjust as needed.
+    """
+    
+    # Make a place to store results:
+    if "iteration_stability" not in run_data:
+        run_data["iteration_stability"] = {}
+    run_data["iteration_stability"][label] = {}
+    
+    results = run_data.get("results", {})
+    
+    # 1) Gather iteration_averages[iter_key][model] = average_score
+    iteration_averages = defaultdict(dict)  # iter_key -> {model -> avg_score}
+    
+    for model_name, model_data in results.items():
+        # model_data: iter_key -> item_dict
+        if not isinstance(model_data, dict):
+            continue
+        for iter_key, item_dict in model_data.items():
+            if not isinstance(item_dict, dict):
+                continue
+            
+            # Collect all items' scores for (this model, iter_key)
+            scores = []
+            for item_id, item_info in item_dict.items():
+                if not isinstance(item_info, dict):
+                    continue
+                if label == "raw":
+                    val = item_info.get("aggregated_score_raw", None)
+                else:
+                    val = item_info.get("aggregated_score_calibrated", None)
+                
+                if isinstance(val, (int, float)):
+                    scores.append(val)
+            
+            if scores:
+                iteration_averages[iter_key].setdefault(model_name, [])
+                iteration_averages[iter_key][model_name] = statistics.mean(scores)
+    
+    # 2) Scoring Stability
+    # We can do something like: for each model, gather all iteration_averages and store stdev
+    model_stability = {}
+    
+    # We also want to track how many times each model appears (some might not appear in all iter_keys)
+    all_iter_keys = sorted(iteration_averages.keys())
+    
+    for model_name in results.keys():
+        # gather the iteration means
+        vals = []
+        for it_key in all_iter_keys:
+            if model_name in iteration_averages[it_key]:
+                vals.append(iteration_averages[it_key][model_name])
+        if len(vals) > 1:
+            stdev_ = statistics.pstdev(vals) if len(vals) > 1 else 0.0
+            model_stability[model_name] = {
+                "mean_iter_score": statistics.mean(vals),
+                "iteration_count": len(vals),
+                "stdev_across_iters": stdev_
+            }
+        else:
+            model_stability[model_name] = {
+                "mean_iter_score": vals[0] if vals else 0.0,
+                "iteration_count": len(vals),
+                "stdev_across_iters": 0.0
+            }
+    
+    run_data["iteration_stability"][label]["scoring_stability"] = model_stability
+    
+    # 3) Ranking Stability
+    # For each iteration, produce a list of (model, iteration_average)
+    # Then create a rank ordering. We'll store them in a dictionary: iteration_rank[iter_key] = [model1, model2, ...]
+    
+    iteration_rank = {}
+    for iter_key in all_iter_keys:
+        # build a list of (model, avg_score) then sort descending
+        items = list(iteration_averages[iter_key].items())
+        items.sort(key=lambda x: x[1], reverse=True)
+        # The rank order is just model names in sorted order:
+        iteration_rank[iter_key] = [m for (m, sc) in items]
+    
+    # now for each pair (i, j) of iteration_keys, compute rank correlation
+    # you can pick Kendall or Spearman. We'll do Kendall as example:
+    
+    pairwise_rank_corr = {}
+    sorted_iter_keys = list(all_iter_keys)
+    
+    for i in range(len(sorted_iter_keys)):
+        for j in range(i+1, len(sorted_iter_keys)):
+            ik = sorted_iter_keys[i]
+            jk = sorted_iter_keys[j]
+
+            rank_i = iteration_rank[ik]  # not iteration_rank[i]
+            rank_j = iteration_rank[jk]  # not iteration_rank[j]
+            # build positions
+            # If a model is missing in either iteration, skip it
+            common_models = set(rank_i).intersection(rank_j)
+            # create index mappings
+            pos_i = {m: rank_i.index(m) for m in common_models}
+            pos_j = {m: rank_j.index(m) for m in common_models}
+            
+            # now build x/y from pos_i, pos_j
+            x = []
+            y = []
+            for m in common_models:
+                x.append(pos_i[m])
+                y.append(pos_j[m])
+            
+            if len(x) > 1:
+                tau, pval = scipy.stats.kendalltau(x, y)
+            else:
+                tau, pval = (0.0, 1.0)
+            
+            key_name = f"{ik}__vs__{jk}"
+            pairwise_rank_corr[key_name] = {
+                "common_model_count": len(common_models),
+                "kendall_tau": tau,
+                "p_value": pval
+            }
+    
+    run_data["iteration_stability"][label]["ranking_stability"] = {
+        "pairwise_correlation": pairwise_rank_corr
+    }
+    
+    # Optionally, you might compute an "average" or "median" rank correlation across all iteration pairs
+    if pairwise_rank_corr:
+        all_taus = [v["kendall_tau"] for v in pairwise_rank_corr.values() if not math.isnan(v["kendall_tau"])]
+        run_data["iteration_stability"][label]["ranking_stability"]["average_kendall_tau"] = statistics.mean(all_taus) if all_taus else 0.0
+
+def process_stability_test_item(model_name, iteration_key, item_id, item_text, prompt_template, judge_model) -> float:
+    """Process a single judge request for the stability test. Returns the aggregated score or 0.0 if failed."""
+    global should_exit
+    if should_exit:
+        return 0.0
+    
+    try:
+        final_prompt = prompt_template.replace("[TEST MODEL RESPONSE]", item_text)
+        final_prompt = final_prompt.replace("[TEST MODEL RESPONSE END]", "")
+        
+        messages = [{"role": "user", "content": final_prompt}]
+        judge_response = send_to_judge_model(messages, judge_model=judge_model)
+        
+        extracted_scores = parse_scores(judge_response)
+        item_score = compute_raw_score(extracted_scores)
+        
+        # Only return actual valid scores, never None
+        return item_score if isinstance(item_score, (int, float)) and item_score > 0.0 else 0.0
+    except Exception as e:
+        logging.error(f"Error in stability test item {model_name}/{iteration_key}/{item_id}: {str(e)}")
+        return 0.0
+
+def run_stability_test(run_data, judge_model, judge_prompts, samples_data, runs, runs_file, lock, num_threads):
+    """Run stability test, retrying any missing entries to reach STABILITY_REPS per item."""
+    logging.info("Running stability test for selected items...")
+    
+    if "stability_test_results" not in run_data:
+        run_data["stability_test_results"] = {}
+    
+    items_to_process = []
+    for (model, iteration, item_id) in STABILITY_ITEMS:
+        key_name = f"{model}-{iteration}-{item_id}"
+        existing_results = run_data["stability_test_results"].get(key_name, [])
+        
+        # Filter out failed results (0.0 scores, None values) from existing
+        valid_results = [score for score in existing_results 
+                        if isinstance(score, (int, float)) and score > 0.0]
+        needed_count = STABILITY_REPS - len(valid_results)
+        
+        if needed_count > 0:
+            item_text = samples_data.get(model, {}).get("samples", {}).get(iteration, {}).get(item_id, "")
+            prompt_template = judge_prompts.get(item_id, "")
+            
+            for _ in range(needed_count):
+                items_to_process.append({
+                    "model": model,
+                    "iteration": iteration,
+                    "item_id": item_id,
+                    "item_text": item_text,
+                    "prompt_template": prompt_template,
+                    "key_name": key_name
+                })
+            
+            logging.info(f"Need {needed_count} more stability test results for {key_name}")
+            
+            # Clean up existing results, keeping only valid scores
+            run_data["stability_test_results"][key_name] = valid_results
+    
+    if not items_to_process:
+        logging.info("All stability test items already have complete results")
+        return
+    
+    with ThreadPoolExecutor(max_workers=num_threads) as exec_:
+        futures_to_items = {}
+        
+        # Launch futures for all needed retries
+        for item in items_to_process:
+            if should_exit:
+                break
+            future = exec_.submit(
+                process_stability_test_item,
+                item["model"], item["iteration"], item["item_id"],
+                item["item_text"], item["prompt_template"],
+                judge_model
+            )
+            futures_to_items[future] = item
+        
+        # Collect results as they complete
+        for future in tqdm(concurrent.futures.as_completed(futures_to_items.keys()),
+                         total=len(futures_to_items),
+                         desc="Stability Test"):
+            if should_exit:
+                break
+            try:
+                score = future.result()
+                item = futures_to_items[future]
+                key_name = item["key_name"]
+                
+                # Only store actual valid scores
+                if isinstance(score, (int, float)) and score > 0.0:
+                    with lock:
+                        run_data["stability_test_results"][key_name].append(score)
+                        save_json_file(runs, runs_file)
+                else:
+                    logging.warning(f"Got invalid score for stability item {key_name}, will need retry")
+            except Exception as exc:
+                logging.error(f"Exception in stability test: {exc}")
@@ -0,0 +1,125 @@
+import sys
+import signal
+import argparse
+import logging
+import time
+
+
+from utils.logging_setup import setup_logging, get_verbosity
+from utils.file_io import load_json_file
+from core.benchmark import run_judgemark_v2
+from utils.api import API_KEY
+from utils.state import should_exit, executor
+
+
+def signal_handler(signum, frame):
+    """Handle interrupt signals (SIGINT, SIGTERM)."""
+    global executor, should_exit
+    print(f"\n[DEBUG] Signal {signum} caught!")
+    logging.warning("Signal handler called")
+    should_exit = True
+    time.sleep(0.1)  # Give workers a moment to see the flag
+    if executor:
+        logging.info("Shutting down executor from signal handler")
+        executor.shutdown(wait=False)
+        logging.info("Executor shutdown complete")
+    sys.exit(1)
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Run Judgemark-v2 Benchmark')
+    parser.add_argument(
+        '--judge-model',
+        required=True,
+        help='Judge model identifier (e.g., openai/gpt-4)'
+    )
+    parser.add_argument(
+        '--samples-file',
+        default="data/judgemark_v2.1_samples.json",
+        help='JSON file containing pre-generated samples from various writer models'
+    )
+    parser.add_argument(
+        '--prompts-file',
+        default="data/judge_prompts.json",
+        help='JSON file containing the partial judge prompts to be filled with test responses'
+    )
+    parser.add_argument(
+        '--runs-file',
+        default="judgemark_v2_runs.json",
+        help='Path to store the Judgemark run results'
+    )
+    parser.add_argument(
+        '--run-id',
+        help='Resume (or create) a run using this base ID, to be combined with the judge model name'
+    )
+    parser.add_argument(
+        '--threads',
+        type=int,
+        default=6,
+        help='Number of threads to use'
+    )
+    parser.add_argument(
+        '--verbosity',
+        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
+        help='Set the logging verbosity level'
+    )
+    parser.add_argument(
+        '--num-runs',
+        type=int,
+        default=1,
+        help='Number of benchmark runs to execute'
+    )
+    parser.add_argument(
+        '--save-raw-judge-output',
+        action='store_true',
+        default=False,
+        help='If set, store the raw judge model output in the results JSON (default: false)'
+    )
+    return parser.parse_args()
+
+if __name__ == "__main__":
+    # Script entry point: install signal handlers, validate configuration, execute
+    # the requested number of benchmark runs, then print a per-run score summary.
+
+    # Register signal handlers
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+    
+    # Reset sentinel
+    # NOTE(review): this rebinds only this module's `should_exit` name (copied in by
+    # `from utils.state import should_exit`); it does not assign utils.state.should_exit.
+    should_exit = False
+    
+    # Parse args
+    args = parse_args()
+    
+    # Setup logging
+    # The --verbosity flag wins; otherwise get_verbosity falls back to LOG_VERBOSITY / "INFO".
+    verbosity = get_verbosity(args.verbosity)
+    setup_logging(verbosity)
+    logging.debug("Logging initialized")
+    
+    # Check that we have an API key for the judge model
+    if not API_KEY:
+        logging.critical("No OPENAI_API_KEY found in environment variables.")
+        raise ValueError("OPENAI_API_KEY not found in environment variables.")
+    
+    # Run the benchmark num_runs times, collecting the run ID returned by each run.
+    # Every iteration passes the same args.run_id through to run_judgemark_v2.
+    run_ids = []
+    for i in range(1, args.num_runs + 1):
+        if should_exit:
+            break
+        logging.info(f"Starting Judgemark-v2 run {i} of {args.num_runs}")
+        rid = run_judgemark_v2(
+            judge_model=args.judge_model,
+            samples_file=args.samples_file,
+            prompts_file=args.prompts_file,
+            runs_file=args.runs_file,
+            num_threads=args.threads,
+            run_id=args.run_id,
+            save_raw_judge_output=args.save_raw_judge_output
+        )
+        run_ids.append(rid)
+    
+    # Finally, print summary
+    # Scores are re-read from the runs file rather than taken from return values.
+    # The "completed" banner is emitted even if the loop above stopped early.
+    runs = load_json_file(args.runs_file)
+    logging.info("\nAll Judgemark-v2 runs completed:")
+    print("\nAll Judgemark-v2 runs completed:")
+    for rid in run_ids:
+        # A run ID missing from the file, or lacking a final score, is reported as "N/A".
+        rd = runs.get(rid, {})
+        final_score = rd.get("final_judgemark_score", "N/A")
+        logging.info(f"Run ID: {rid}, Final Judgemark Score: {final_score}")
+        print(f"Run ID: {rid}")
+        print(f"Final Judgemark-v2 Score: {final_score}")
@@ -0,0 +1,3 @@
+matplotlib>=3.7
+transformers>=4.26
+scipy>=1.10
@@ -0,0 +1,16 @@
+model,judgemark_score,judgemark_score_raw,kendall_tau_bootstrapped,std_dev,kw_stat,ci99_overlap_magnitude_sum_norm,calibrated_score_range_norm,modulated_ci95,emd_norm
+ministral/Ministral-3b-instruct,7.5,15.06,0.0,0.099,0.013,0.094,0.858,0.012,0.363
+gpt-4o-mini,44.05,44.35,0.628,0.438,0.298,0.324,3.149,0.109,1.187
+deepseek-ai/deepseek-r1,76.97,71.77,0.863,0.821,0.905,0.683,6.098,0.6,2.195
+claude-3.5-haiku-20241022,77.64,76.88,0.898,0.832,0.891,0.665,5.975,0.575,2.229
+claude-3.5-sonnet-20240620,78.84,74.72,0.905,0.83,0.902,0.664,6.255,0.591,2.232
+microsoft/wizardlm-2-8x22b,55.42,52.86,0.749,0.566,0.463,0.417,4.62,0.198,1.53
+gpt-4o-2024-11-20,82.51,77.75,0.89,0.837,0.985,0.75,6.233,0.814,2.228
+gemini-2.0-flash-thinking-exp-1219,71.79,66.1,0.818,0.765,0.817,0.631,5.519,0.453,2.057
+deepseek-ai/deepseek-r1-distill-llama-70b,59.67,56.86,0.764,0.642,0.6,0.474,4.485,0.243,1.726
+mistralai/mistral-large-instruct-2411,65.63,63.25,0.794,0.732,0.636,0.505,5.873,0.296,1.935
+meta-llama/llama-3.1-8b-instruct,35.72,35.67,0.46,0.401,0.201,0.207,3.232,0.063,1.075
+meta-llama/llama-3.1-405b-instruct,60.09,58.76,0.791,0.633,0.582,0.471,4.515,0.243,1.712
+deepseek/deepseek-r1-distill-qwen-14b,,,,,,,,,
+mistralai/mistral-small-24b-instruct-2501,63.64,61.23,0.808,0.67,0.618,0.504,5.194,0.286,1.802
+liquid/lfm-7b,10.24,14.33,0.139,0.192,0.027,0.0,1.702,0.0,0.584
@@ -0,0 +1,51 @@
+import os
+import time
+import logging
+import requests
+from typing import List, Dict
+from dotenv import load_dotenv
+
+# Load environment variables from .env if present
+load_dotenv()
+
+# Chat-completions endpoint the judge requests are POSTed to; overridable via
+# OPENAI_API_URL, defaulting to OpenRouter.
+BASE_URL = os.getenv("OPENAI_API_URL", "https://openrouter.ai/api/v1/chat/completions")
+# Bearer token for the endpoint; None when OPENAI_API_KEY is not set.
+API_KEY = os.getenv("OPENAI_API_KEY")
+# Built once at import time.  If API_KEY is None the header value becomes the
+# literal string "Bearer None", so callers should check API_KEY before sending.
+HEADERS = {
+    "Authorization": f"Bearer {API_KEY}",
+    "Content-Type": "application/json"
+}
+# Default number of attempts per judge request (env MAX_RETRIES, default 3).
+MAX_RETRIES = int(os.getenv("MAX_RETRIES", "3"))
+# Seconds to sleep between failed attempts (env RETRY_DELAY, default 5).
+RETRY_DELAY = int(os.getenv("RETRY_DELAY", "5"))
+
+def send_to_judge_model(messages: List[Dict], judge_model: str, max_retries: int = MAX_RETRIES) -> str:
+    """
+    Sends user messages to the judge model with basic retry logic.
+    Expects an OpenAI-compatible endpoint.
+    """
+    for attempt in range(1, max_retries + 1):
+        try:
+            # temp and top_k are set to produce diversity in judge outputs between runs,
+            # but constrained to be near the model's best answer (since we are doing numerical scoring).
+            data = {
+                "model": judge_model,
+                "messages": messages,
+                "temperature": 0.5,
+                "top_k": 3,
+                "max_tokens": 8096,
+                #"provider": {
+                #    "order": [
+                #        "DeepSeek"
+                #   ]
+                #}
+            }
+            response = requests.post(BASE_URL, headers=HEADERS, json=data)
+            response.raise_for_status()
+            res_json = response.json()
+            return res_json['choices'][0]['message']['content']
+        except Exception as e:
+            logging.error(f"Error on attempt {attempt} for judge model {judge_model}: {e}")
+            if attempt == max_retries:
+                logging.critical(f"Max retries reached for judge model {judge_model}")
+                raise
+            time.sleep(RETRY_DELAY)
+    return ""
@@ -0,0 +1,17 @@
+import json
+import logging
+
+def load_json_file(file_path: str) -> dict:
+    """Loads a JSON file (returns empty if not found)."""
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    except FileNotFoundError:
+        logging.warning(f"File {file_path} not found, returning empty dict.")
+        return {}
+
+def save_json_file(data: dict, file_path: str):
+    """Saves a dict to disk as JSON."""
+    with open(file_path, 'w', encoding='utf-8') as f:
+        json.dump(data, f, indent=2)
+    logging.debug(f"Saved JSON data to {file_path}")
@@ -0,0 +1,24 @@
+import os
+import logging
+
+def setup_logging(verbosity: str):
+    """Set up logging based on verbosity level."""
+    log_levels = {
+        'DEBUG': logging.DEBUG,
+        'INFO': logging.INFO,
+        'WARNING': logging.WARNING,
+        'ERROR': logging.ERROR,
+        'CRITICAL': logging.CRITICAL
+    }
+    log_level = log_levels.get(verbosity.upper(), logging.INFO)
+    logging.basicConfig(
+        level=log_level,
+        format='%(asctime)s - %(levelname)s - %(threadName)s - %(message)s',
+        datefmt='%Y-%m-%d %H:%M:%S'
+    )
+
+def get_verbosity(args_verbosity: str) -> str:
+    """Determine the verbosity level from command-line or environment."""
+    if args_verbosity:
+        return args_verbosity
+    return os.getenv("LOG_VERBOSITY", "INFO")
@@ -0,0 +1,3 @@
+# Global control flags
+# NOTE(review): `from utils.state import should_exit, executor` copies the current
+# values into the importing module; later assignments made there (or here) are not
+# shared.  Use `import utils.state` and attribute access to share updates.
+
+# Set to True to ask running loops to stop early (e.g. after SIGINT/SIGTERM).
+should_exit = False
+# Executor for the signal handler to shut down on exit; None when none is registered.
+executor = None
@@ -0,0 +1,11 @@
+def clamp(x: float, low: float = 0.0, high: float = 1.0) -> float:
+    """Clamp a value between low and high."""
+    return max(low, min(x, high))
+
+def normalize(val, min_val, max_val, bigger_is_better=True):
+    if max_val <= min_val:
+        return 0.0
+    norm = (val - min_val) / (max_val - min_val)
+    if not bigger_is_better:
+        norm = 1.0 - norm
+    return clamp(norm)
@@ -0,0 +1,240 @@
+import re
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+from typing import Dict
+from scipy.stats import linregress
+from scipy.stats import spearmanr, theilslopes
+from config.constants import NEGATIVE_MARKERS, MODEL_NAME_REPLACEMENTS
+
+
+def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_data: Dict):
+    """
+    Render two PNG charts for one judge run and save them under results/charts/.
+
+    Produces two figures:
+      • Figure #1 (judgemark_3chart_<judge>.png) with three subplots side-by-side:
+        (1) Raw Scores bar chart (+ 95% CI)
+        (2) Calibrated Scores bar chart (+ 95% CI)
+        (3) Heatmap of all per-criterion scores across each model (10 - score for negative markers).
+      • Figure #2 (judgemark_scattergrid_<judge>.png): A 4×4 grid of mini scatter
+        plots, one per model (up to 16), showing item length (chars) vs.
+        aggregated_score_raw. For each model with more than one point, a
+        Theil-Sen fitted line is drawn and the Spearman rank correlation
+        (rho and p-value) is shown in the legend.
+
+    Args:
+        run_data: Run record. Reads "raw_model_stats" and "calibrated_model_stats"
+            (model -> dict with "mean" and "ci95") and "results"
+            (model -> iteration -> item id -> dict with "parsed_scores" and
+            "aggregated_score_raw").
+        judge_model: Judge identifier used in titles and file names; mapped
+            through MODEL_NAME_REPLACEMENTS when an entry exists.
+        samples_data: model -> {"samples": {iteration -> {item id -> text}}},
+            used only to measure text length for the scatter plots.
+
+    Returns:
+        None. The figures are written to disk and closed.
+    """
+    # -------------------------------------------------------------------
+    # 1) The main (raw / calibrated / heatmap) figure
+    # -------------------------------------------------------------------
+    raw_stats = run_data["raw_model_stats"]
+    cal_stats = run_data["calibrated_model_stats"]
+
+    # Use the display name for the judge if one is configured.
+    if judge_model in MODEL_NAME_REPLACEMENTS:
+        judge_model = MODEL_NAME_REPLACEMENTS[judge_model]
+    
+    # All model names in raw_stats
+    # (cal_stats is indexed with the same names below, so it must contain them too)
+    model_names = list(raw_stats.keys())
+
+    # Convert to arrays for sorting
+    raw_means = [raw_stats[m]["mean"] for m in model_names]
+    cal_means = [cal_stats[m]["mean"] for m in model_names]
+    raw_cis   = [raw_stats[m]["ci95"] for m in model_names]
+    cal_cis   = [cal_stats[m]["ci95"] for m in model_names]
+    
+    # Sort by calibrated score descending
+    # (the same ordering is applied to every parallel list so rows stay aligned)
+    sorted_indices = np.argsort(cal_means)[::-1]
+    model_names = [model_names[i] for i in sorted_indices]
+    raw_means   = [raw_means[i]   for i in sorted_indices]
+    cal_means   = [cal_means[i]   for i in sorted_indices]
+    raw_cis     = [raw_cis[i]     for i in sorted_indices]
+    cal_cis     = [cal_cis[i]     for i in sorted_indices]
+    
+    # 1.A) Build data for the heatmap: per-criterion scores (with negative flips)
+    #     We gather them from run_data "parsed_scores"
+    #     Then we convert them to 0..10 bins and store percentage distribution.
+    all_scores_by_model = {m: [] for m in model_names}
+    results = run_data.get("results", {})
+    
+    for model_name in model_names:
+        iter_dict = results.get(model_name, {})
+        for iteration_key, item_dict in iter_dict.items():
+            # Skip any non-dict entries stored alongside the per-iteration results.
+            if not isinstance(item_dict, dict):
+                continue
+            for item_id, item_info in item_dict.items():
+                if not isinstance(item_info, dict):
+                    continue
+                parsed_scores = item_info.get("parsed_scores", {})
+                if not isinstance(parsed_scores, dict):
+                    continue
+                for crit_name, val in parsed_scores.items():
+                    # Only numeric scores inside the 0..10 scale are counted.
+                    if isinstance(val, (int, float)) and 0 <= val <= 10:
+                        crit_lower = crit_name.strip().lower()
+                        # Flip negative
+                        # (substring match against NEGATIVE_MARKERS, so higher is always better)
+                        if any(nm in crit_lower for nm in NEGATIVE_MARKERS):
+                            final_val = 10 - val
+                        else:
+                            final_val = val
+                        all_scores_by_model[model_name].append(final_val)
+    
+    # Convert to a 2D array for the heatmap (rows = models, columns = bins)
+    # 11 edges -> 10 unit-wide bins; numpy's last bin is closed on the right,
+    # so a score of exactly 10 falls into the final (9-10) bin.
+    bins = np.linspace(0, 10, 11)
+    heatmap_rows = []
+    for m in model_names:
+        scores = all_scores_by_model[m]
+        if scores:
+            counts, _ = np.histogram(scores, bins=bins)
+            pct = (counts / len(scores)) * 100.0
+        else:
+            # No usable scores for this model: show an all-zero row.
+            pct = np.zeros(len(bins)-1, dtype=float)
+        heatmap_rows.append(pct)
+    heatmap_data = np.array(heatmap_rows, dtype=float)
+    
+    # 1.B) Plot the main figure with 3 subplots
+    fig1, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 7))  # Increased height
+
+    # Increase the base font size for all text elements
+    # NOTE(review): this mutates matplotlib's global rcParams and runs after the
+    # axes above were created; the setting is not restored afterwards.
+    plt.rcParams.update({'font.size': 14})
+    
+    # (A) Raw bar chart    
+    y_pos = np.arange(len(model_names))
+    ax1.barh(y_pos, raw_means, color='skyblue', alpha=0.7)
+    for i, (mean_val, ci95) in enumerate(zip(raw_means, raw_cis)):
+        ax1.errorbar(mean_val, i, xerr=ci95, color='red', capsize=5)
+    ax1.set_yticks(y_pos)
+    ax1.set_yticklabels(model_names, fontsize=12)
+    # Inverted so the first (highest calibrated) model is drawn at the top.
+    ax1.invert_yaxis()
+    ax1.set_xlabel("Raw Scores", fontsize=14)
+    ax1.set_title("Raw Model Scores (95% CI)", fontsize=16)
+    ax1.grid(True, alpha=0.3)
+    ax1.tick_params(axis='y', pad=10)
+
+    # (B) Calibrated bar chart
+    ax2.barh(y_pos, cal_means, color='lightgreen', alpha=0.7)
+    for i, (mean_val, ci95) in enumerate(zip(cal_means, cal_cis)):
+        ax2.errorbar(mean_val, i, xerr=ci95, color='red', capsize=5)
+    ax2.set_yticks(y_pos)
+    ax2.set_yticklabels(model_names, fontsize=12)
+    ax2.invert_yaxis()
+    ax2.set_xlabel("Calibrated Scores", fontsize=14)
+    ax2.set_title("Calibrated Model Scores (95% CI)", fontsize=16)
+    ax2.grid(True, alpha=0.3)
+    ax2.tick_params(axis='y', pad=10)
+
+    # (C) Heatmap
+    # Columns are labelled with each bin's left edge (0..9).
+    bin_edges = bins[:-1]
+    ax3.set_xticks(np.arange(len(bin_edges)))
+    ax3.set_xticklabels([str(int(be)) for be in bin_edges])
+    im = ax3.imshow(heatmap_data, aspect='auto', origin='upper', cmap='plasma')
+    #ax3.set_xticks(np.arange(len(bin_centers)))
+    #ax3.set_xticklabels([f"{bc:.0f}" for bc in bin_centers], fontsize=12)
+    ax3.set_yticks(np.arange(len(model_names)))
+    ax3.set_yticklabels(model_names, fontsize=12)
+    ax3.set_xlabel("Score Bin (0–10)", fontsize=14)
+    ax3.set_title("Per-Criterion Score Distribution (Heatmap)", fontsize=16)
+    ax3.tick_params(axis='y', pad=10)
+    
+    cbar = plt.colorbar(im, ax=ax3)
+    cbar.set_label("% of Criteria in Bin", fontsize=14)
+    cbar.ax.tick_params(labelsize=12)
+    # NOTE(review): the formatter configured on the next line is replaced by the
+    # PercentFormatter immediately afterwards, so set_powerlimits looks redundant.
+    cbar.formatter.set_powerlimits((0, 0))
+    cbar.formatter = ticker.PercentFormatter(decimals=1)
+    cbar.update_ticks()
+
+    # Overall title
+    
+    # File-name-safe judge name: "/" becomes "__", then anything that is not a
+    # word character or "-" becomes "-". The title keeps the unsanitized name.
+    sanitized_judge = re.sub(r"[^\w\-]", "-", judge_model.replace("/", "__"))
+    fig1.suptitle(f"Judgemark: Raw/Calibrated/Heatmap - Judge: {judge_model}", fontsize=20)
+    
+    # Adjust layout with more space
+    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
+    
+    # NOTE(review): assumes results/charts/ already exists — nothing in this
+    # function creates the directory.
+    plt.savefig(f"results/charts/judgemark_3chart_{sanitized_judge}.png", 
+                bbox_inches='tight', 
+                dpi=150,
+                pad_inches=0.5)
+    plt.close(fig1)
+
+    # -------------------------------------------------------------------
+    # 2) Second Figure: A 4×4 grid of scatter plots (per-model), 
+    #    where X = item length, Y = aggregated_score_raw.
+    #    We'll gather text straight from samples_data, compute len(...).
+    # -------------------------------------------------------------------
+    # If you want exactly 16 models, skip any extra or exclude some.
+    # Example: exclude "gemini-1.5-pro-001" 
+    excluded_models = {"gemini-1.5-pro-001"}
+    model_list_for_scatter = [m for m in model_names if m not in excluded_models]
+    
+    # If it's still longer than 16, slice it to 16
+    # (model_names is sorted by calibrated mean, so the lowest-ranked are dropped)
+    if len(model_list_for_scatter) > 16:
+        model_list_for_scatter = model_list_for_scatter[:16]
+
+    # Build figure and subplots: 4x4
+    fig2, axes2 = plt.subplots(4, 4, figsize=(20, 20))  # each cell is a scatter
+    fig2.suptitle(f"Judgemark: Per-Model Length vs. Score - Judge: {judge_model}", fontsize=18)
+    
+    # We might have fewer than 16 models. We'll track them by row & col.
+    for idx, mname in enumerate(model_list_for_scatter):
+        row = idx // 4
+        col = idx % 4
+        ax = axes2[row, col]
+        
+        # Collect all (length, raw_score) for this model
+        length_vals = []
+        score_vals  = []
+        
+        # For each (iteration_key, item_id), find the text in samples_data,
+        # find aggregated_score_raw in run_data, then store pairs
+        # NOTE: unlike section 1.A, "results" is indexed directly here, so a run
+        # without that key raises KeyError at this point.
+        model_res = run_data["results"].get(mname, {})
+        for it_key, it_dict in model_res.items():
+            if not isinstance(it_dict, dict):
+                continue
+            for item_id, item_info in it_dict.items():
+                if not isinstance(item_info, dict):
+                    continue
+                raw_score = item_info.get("aggregated_score_raw", None)
+                # Items without a numeric aggregate score are left out.
+                if not isinstance(raw_score, (int, float)):
+                    continue
+
+                # Look up the text in samples_data:
+                # (every level defaults to empty, so a missing sample yields "")
+                text = (samples_data
+                        .get(mname, {})
+                        .get("samples", {})
+                        .get(it_key, {})
+                        .get(item_id, "")) 
+                text_len = len(text)
+
+                # If it's non-empty text
+                if text_len > 0:
+                    length_vals.append(text_len)
+                    score_vals.append(raw_score)
+        
+        ax.set_title(mname, fontsize=12)
+        ax.set_xlabel("Length")
+        ax.set_ylabel("Raw Score")
+        
+        # At least two points are needed for a correlation and a fitted line.
+        if len(length_vals) > 1:
+            ax.scatter(length_vals, score_vals, alpha=0.4, color='blue')
+            
+            # -- Rank-based correlation (Spearman) --
+            rho, p_value = spearmanr(length_vals, score_vals)
+            
+            # -- Robust linear fit (Theil-Sen) --
+            # returns slope, intercept, lower_slope, upper_slope
+            # (y = score, x = length; the slope bounds are not used below)
+            slope, intercept, lo_slope, hi_slope = theilslopes(score_vals, length_vals, alpha=0.95)
+            
+            # Build the line
+            xline = np.linspace(min(length_vals), max(length_vals), 200)
+            yline = slope * xline + intercept
+            ax.plot(xline, yline, color='red', linewidth=2,
+                    label=f"Spearman ρ={rho:.2f}, p={p_value:.2g}")
+            ax.legend(loc="best")
+        else:
+            ax.text(0.5, 0.5, "Not enough data",
+                    ha='center', va='center', color='red', transform=ax.transAxes)
+    
+    # If we have fewer than 16 models, blank out remaining subplots
+    total_subplots = 16
+    for i in range(len(model_list_for_scatter), total_subplots):
+        row = i // 4
+        col = i % 4
+        axes2[row, col].axis("off")
+    
+    plt.tight_layout()
+    # NOTE(review): same assumption as above — results/charts/ must already exist.
+    plt.savefig(f"results/charts/judgemark_scattergrid_{sanitized_judge}.png", bbox_inches='tight', dpi=200)
+    plt.close(fig2)