# Judgemark-v2lp/core/benchmark.py

import os
import re
import uuid
import time
import signal
import logging
import threading
import concurrent.futures
from datetime import datetime
from typing import Dict, List
from tqdm import tqdm
from collections import defaultdict

from utils.file_io import load_json_file, save_json_file
from utils.api import send_to_judge_model
from utils.visualization import create_side_by_side_score_charts
import statistics
from core.scoring import (
    parse_scores, compute_raw_score, compute_detailed_distribution,
    compute_model_level_stats, compute_cross_model_stats,
    build_landmark_calibration_config, apply_landmark_calibration,
    log_score_summary, confidence_interval_95
)
from core.scoring import compute_detailed_distribution, compute_detailed_distribution  # etc
from core.separability import compute_separability_metrics
from core.stability import run_stability_test, compute_iteration_stability, compute_randomized_iteration_rank_stability_by_item
from utils.stats import clamp
from utils.state import should_exit, executor
from utils.stats import normalize

def process_sample(model_name: str, iteration_key: str, item_id: str, item_text: str,
                  prompt_template: str, run_key: str, runs: Dict, runs_file: str,
                  lock: threading.Lock, judge_model: str, save_raw_judge_output: bool):
    """Judge a single sample and persist the outcome into ``runs`` / ``runs_file``.

    The item is skipped when the module-level ``should_exit`` flag is set, or
    when a usable result is already stored (at least 10 parsed scores and an
    ``aggregated_score_raw`` greater than 0). Otherwise the judge prompt is
    built from ``prompt_template``, sent to ``judge_model``, and the parsed
    scores are stored under
    ``runs[run_key]["results"][model_name][iteration_key][item_id]``.

    Args:
        model_name: Key of the model whose output is being judged.
        iteration_key: Key of the iteration the sample belongs to.
        item_id: Key of the sample within the iteration.
        item_text: The text inserted into the prompt in place of
            ``[TEST MODEL RESPONSE]``.
        prompt_template: Judge prompt containing the placeholder markers.
        run_key: Key of the current run inside ``runs``.
        runs: Shared, mutable structure holding every run; updated in place.
        runs_file: Path handed to ``save_json_file`` after each update.
        lock: Guards every access to ``runs`` and the save to ``runs_file``.
        judge_model: Identifier forwarded to ``send_to_judge_model``.
        save_raw_judge_output: When true, the judge response is stored
            alongside the parsed scores under ``"judge_response"``.

    Raises:
        KeyError: If ``run_key`` is not present in ``runs``.
    """
    global should_exit
    if should_exit:
        return

    text_len = len(item_text)

    # Resolve the storage dict from the shared structure while holding the
    # lock: other workers serialize ``runs`` under the same lock, and inserting
    # new model/iteration keys concurrently would mutate dicts mid-dump.
    # Looking it up through runs[run_key] (instead of ``.get(..., {})``) also
    # guarantees the dict we write to is the one that gets saved.
    with lock:
        iteration_dict = (runs[run_key]
                          .setdefault("results", {})
                          .setdefault(model_name, {})
                          .setdefault(iteration_key, {}))
        existing_item = iteration_dict.get(item_id, {})
        already_scored = bool(
            existing_item
            and "aggregated_score_raw" in existing_item
            and existing_item.get("parsed_scores")
            and len(existing_item["parsed_scores"]) >= 10
            and existing_item["aggregated_score_raw"] > 0.0
        )

    if already_scored:
        return

    try:
        final_prompt = prompt_template.replace("[TEST MODEL RESPONSE]", item_text)
        final_prompt = final_prompt.replace("[TEST MODEL RESPONSE END]", "")

        # The judge call runs outside the lock so workers can overlap.
        messages = [{"role": "user", "content": final_prompt}]
        judge_response = send_to_judge_model(messages, judge_model=judge_model)

        extracted_scores = parse_scores(judge_response)
        raw_score = compute_raw_score(extracted_scores)

        storage_dict = {
            "parsed_scores": extracted_scores,
            "timestamp": datetime.now().isoformat(),
            "text_length": text_len
        }
        # A missing aggregated score marks the item as needing a retry later.
        if raw_score is not None:
            storage_dict["aggregated_score_raw"] = raw_score
        if save_raw_judge_output:
            storage_dict["judge_response"] = judge_response

        with lock:
            iteration_dict[item_id] = storage_dict
            save_json_file(runs, runs_file)

        if raw_score is not None:
            logging.debug(f"Processed {model_name}/{iteration_key}/{item_id}, raw score: {raw_score:.2f}")
        else:
            logging.warning(f"Failed to parse enough scores for {model_name}/{iteration_key}/{item_id}")

    except Exception as e:
        # Best effort: record the failure on the item and in the run-level
        # error list so the item is picked up again on the next invocation.
        logging.error(f"Error processing item {model_name}/{iteration_key}/{item_id}: {str(e)}")
        with lock:
            iteration_dict[item_id] = {
                "error": str(e),
                "timestamp": datetime.now().isoformat()
            }
            runs[run_key].setdefault("errors", []).append({
                "model": model_name,
                "iteration": iteration_key,
                "item_id": item_id,
                "error": str(e)
            })
            save_json_file(runs, runs_file)

def _iter_scored_items(iteration_data: dict):
    """Yield ``(iteration_key, item_id, item_info)`` for every item with a raw score.

    Skips the ``"__model_stats__"`` entry, iterations that are not dicts, and
    items that are not dicts or lack ``"aggregated_score_raw"``.
    """
    for it_key, it_val in iteration_data.items():
        if it_key == "__model_stats__" or not isinstance(it_val, dict):
            continue
        for item_id, item_info in it_val.items():
            if isinstance(item_info, dict) and "aggregated_score_raw" in item_info:
                yield it_key, item_id, item_info


def _compute_final_judgemark(run_data: dict, label: str, random_tau: float) -> float:
    """Combine the normalized components for ``label`` into a final Judgemark score.

    ``label`` is ``"raw"`` or ``"calibrated"``. Reads
    ``run_data[f"{label}_cross_model_stats"]``, ``run_data[f"{label}_model_stats"]``
    and ``run_data["separability_metrics"][label]``; writes
    ``run_data[f"{label}_score_range"]``,
    ``run_data[f"final_judgemark_score_elements_{label}"]`` and adds the extra
    normalized values to the ``"normalized_components"`` dict in place.

    Returns the final score; the caller decides which key it is stored under.
    """
    norm = run_data[f"{label}_cross_model_stats"]["normalized_components"]  # "std_dev", "kw_stat", etc.
    separability = run_data["separability_metrics"][label]

    emd_norm = normalize(separability["emd"]["average"], 0, 4)
    # NOTE(review): the trailing False is forwarded to normalize() unchanged;
    # presumably it marks "lower is better" -- confirm against utils.stats.
    norm["ci99_overlap_magnitude_sum_norm"] = normalize(
        separability["ci99_overlap_magnitude_sum"], 0, 26, False
    )

    # Range of per-model mean scores
    model_means = [stats["mean"] for stats in run_data[f"{label}_model_stats"].values()]
    score_range = max(model_means) - min(model_means)
    run_data[f"{label}_score_range"] = score_range
    range_key = f"{label}_score_range_norm"
    norm[range_key] = normalize(score_range, 0, 8)

    # Kendall's tau from the randomization-based stability measure
    norm["kendall_tau_bootstrapped"] = normalize(random_tau, 0.4, 1.0)

    modulated_ci95 = separability["modulated_ci95"]

    # Aggregated separability metric (unweighted mean of six components)
    aggregated_separability = (
        norm["std_dev"]  # std deviation *between* models (separability)
        + norm["kw_stat"]  # kruskal-wallis (separability)
        + norm["ci99_overlap_magnitude_sum_norm"]  # confidence interval overlap between adjacently ranked models (separability)
        + norm[range_key]  # range of assigned scores (separability)
        + modulated_ci95  # average ci95 per model scored (score stability + separability)
        + emd_norm  # earth-movers distance (separability)
    ) / 6.0

    final_score = (
        norm["kendall_tau_bootstrapped"]  # correlation between iterations (ranking stability)
        + norm["kendall_tau"]  # correlation with lmsys arena score (corr to human pref)
        + 4 * aggregated_separability  # aggregate of separability metrics
    ) / 6.0

    run_data[f"final_judgemark_score_elements_{label}"] = {
        "norm_stability_between_iterations": norm["kendall_tau_bootstrapped"],
        "norm_correlation_with_lmsys_arena": norm["kendall_tau"],
        "norm_std_dev_between_models": norm["std_dev"],
        "norm_kruskall_wallis": norm["kw_stat"],
        "norm_ci99_adjacent_overlap": norm["ci99_overlap_magnitude_sum_norm"],
        "norm_score_range": norm[range_key],
        "norm_intra_model_ci95": modulated_ci95,
        # The scalar EMD component (previously the calibrated branch stored
        # the whole normalized-components dict here by mistake).
        "norm_earth_movers_distance": emd_norm
    }
    return final_score


def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data: dict):
    """
    Compute metrics for both raw and calibrated scores, including iteration
    stability, normalized components, and detailed distributions.

    Everything is written into ``runs[run_key]`` in place; the function itself
    returns ``None``. Besides the distribution / model-level / cross-model
    entries, it stores ``"final_judgemark_score_raw"`` (raw statistics) and
    ``"final_judgemark_score"`` (calibrated statistics), plus a per-item
    ``"aggregated_score_calibrated"`` next to each ``"aggregated_score_raw"``.

    Args:
        runs: Structure holding every run; ``runs[run_key]`` is updated.
        run_key: Key of the run to finalize.
        samples_data: Sample texts keyed by model -> "samples" -> iteration ->
            item id; used only to record text lengths per model.
    """
    run_data = runs[run_key]
    results = run_data.get("results", {})

    # 1. Collect raw scores, compute calibration, store calibrated values
    raw_scores_by_model_all = defaultdict(list)
    raw_scores_by_model_by_iter = defaultdict(lambda: defaultdict(list))
    calibrated_scores_by_model_all = defaultdict(list)
    calibrated_scores_by_model_by_iter = defaultdict(lambda: defaultdict(list))
    lengths_by_model = {}

    # -- Collect raw scores
    for model_name, iteration_data in results.items():
        if not isinstance(iteration_data, dict):
            continue

        lengths = []
        for it_key, item_id, item_info in _iter_scored_items(iteration_data):
            raw_score = item_info["aggregated_score_raw"]

            # Collect raw score globally and by iteration
            raw_scores_by_model_all[model_name].append(raw_score)
            raw_scores_by_model_by_iter[model_name][it_key].append(raw_score)

            # Track text length for analyzing
            text = (samples_data.get(model_name, {})
                    .get("samples", {})
                    .get(it_key, {})
                    .get(item_id, ""))
            lengths.append(len(text))

        if raw_scores_by_model_all[model_name]:
            lengths_by_model[model_name] = lengths

    # 2. Distribution + calibration
    all_raw_scores = [s for scores in raw_scores_by_model_all.values() for s in scores]
    run_data["raw_score_distribution"] = compute_detailed_distribution(all_raw_scores)

    calibration_config = build_landmark_calibration_config(all_raw_scores, [0, 3, 5, 7, 10])
    run_data["calibration_config"] = calibration_config

    # Apply calibration
    for model_name, iteration_data in results.items():
        if not isinstance(iteration_data, dict):
            continue

        # Calibrate the model's flattened raw scores (same order as the walk above)
        calibrated = [
            apply_landmark_calibration(s, calibration_config)
            for s in raw_scores_by_model_all[model_name]
        ]

        # Re-walk the items in the same order to attach each calibrated value
        for (_, _, item_info), cal_score in zip(_iter_scored_items(iteration_data), calibrated):
            item_info["aggregated_score_calibrated"] = cal_score

        # Mirror the per-iteration breakdown: each iteration's scores occupy a
        # contiguous slice of the flattened list.
        offset = 0
        for it_key, iter_scores in raw_scores_by_model_by_iter[model_name].items():
            count_for_iter = len(iter_scores)
            calibrated_scores_by_model_by_iter[model_name][it_key].extend(
                calibrated[offset:offset + count_for_iter]
            )
            offset += count_for_iter

        # Populate the single flattened list of calibrated scores
        calibrated_scores_by_model_all[model_name].extend(calibrated)

    # 3. Calibrated distributions
    all_calibrated_scores = [
        s for scores in calibrated_scores_by_model_all.values() for s in scores
    ]
    run_data["calibrated_score_distribution"] = compute_detailed_distribution(all_calibrated_scores)

    # 4. Model-level stats
    run_data["raw_model_stats"] = compute_model_level_stats(raw_scores_by_model_all, lengths_by_model)
    run_data["calibrated_model_stats"] = compute_model_level_stats(calibrated_scores_by_model_all, lengths_by_model)

    # 5. Cross-model stats
    run_data["raw_cross_model_stats"] = compute_cross_model_stats(
        scores_by_model_all=raw_scores_by_model_all,
        scores_by_model_by_iter=raw_scores_by_model_by_iter
    )
    run_data["calibrated_cross_model_stats"] = compute_cross_model_stats(
        scores_by_model_all=calibrated_scores_by_model_all,
        scores_by_model_by_iter=calibrated_scores_by_model_by_iter
    )

    # 6. Separability metrics (read back below from run_data["separability_metrics"])
    compute_separability_metrics(run_data, raw_scores_by_model_all, label="raw")
    compute_separability_metrics(run_data, calibrated_scores_by_model_all, label="calibrated")

    # 7. Compute iteration stability for raw & calibrated
    compute_iteration_stability(run_data, label="raw")
    compute_iteration_stability(run_data, label="calibrated")
    random_tau_raw = compute_randomized_iteration_rank_stability_by_item(run_data, label="raw", n_shuffles=1000)
    random_tau_cal = compute_randomized_iteration_rank_stability_by_item(run_data, label="calibrated", n_shuffles=1000)
    logging.info("Score stability (RAW)")
    logging.info(f"Randomized average Kendall's tau (raw): {random_tau_raw:.3f}")
    logging.info("Score stability (CALIBRATED)")
    logging.info(f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} "
                 f"({run_data['calibrated_cross_model_stats']['kendall_tau']})")

    # 8. Compute the final Judgemark scores (one using raw stats, one using calibrated)
    final_score_raw = _compute_final_judgemark(run_data, "raw", random_tau_raw)
    run_data["final_judgemark_score_raw"] = final_score_raw

    final_score_calibrated = _compute_final_judgemark(run_data, "calibrated", random_tau_cal)
    # The calibrated score is the headline value and is stored without a suffix.
    run_data["final_judgemark_score"] = final_score_calibrated

    # 9. Create visualizations + logs
    create_side_by_side_score_charts(run_data, run_data["judge_model"], samples_data)

    log_score_summary(
        "RAW SCORES",
        run_data["raw_cross_model_stats"],
        run_data["raw_model_stats"]
    )
    log_score_summary(
        "CALIBRATED SCORES",
        run_data["calibrated_cross_model_stats"],
        run_data["calibrated_model_stats"]
    )

    logging.info(f"Final Judgemark (raw)   = {final_score_raw:.3f}")
    logging.info(f"Final Judgemark (cal)  = {final_score_calibrated:.3f}")


def sanitize_model_name(name: str) -> str:
    """Return ``name`` made safe for embedding in a run key.

    Every run of characters other than ASCII letters, digits, ``_`` and ``-``
    is collapsed into a single underscore.
    """
    disallowed_run = re.compile(r'[^a-zA-Z0-9_-]+')
    return disallowed_run.sub('_', name)

def run_judgemark_v2(
    judge_model: str,
    samples_file: str,
    prompts_file: str,
    runs_file: str,
    num_threads: int,
    run_id: str = None,
    save_raw_judge_output: bool = False
) -> str:
    """Run (or resume) a Judgemark-v2 benchmark for one judge model.

    Loads the runs, samples and prompts files, determines which samples still
    lack a usable judgement, judges them on a thread pool, then -- unless the
    run was interrupted -- computes the final statistics and saves everything
    back to ``runs_file``.

    Args:
        judge_model: Identifier of the judge model; also part of the run key.
        samples_file: JSON file with the samples, keyed by model.
        prompts_file: JSON file mapping item ids to judge prompt templates.
        runs_file: JSON file where all runs are stored and updated.
        num_threads: Worker count for the thread pool.
        run_id: Optional identifier; a random UUID is used when omitted.
            Re-using an id resumes the matching run.
        save_raw_judge_output: Store raw judge responses with each result.

    Returns:
        The run key, ``"<run_id>__<sanitized judge model>"``.
    """
    # NOTE(review): both names are bound in this module via
    # ``from utils.state import should_exit, executor``, so rebinding them
    # here updates this module's globals only -- confirm that is the intent.
    global executor, should_exit

    logging.info(f"Starting Judgemark-v2 using judge model: {judge_model}")
    runs = load_json_file(runs_file)

    # Form the run key using run_id + "__" + sanitized judge model
    sanitized_jm = sanitize_model_name(judge_model)
    base_id = run_id if run_id else str(uuid.uuid4())
    run_key = f"{base_id}__{sanitized_jm}"

    # Load data files
    samples_data = load_json_file(samples_file)
    judge_prompts = load_json_file(prompts_file)

    # Initialize or get existing run data
    if run_key not in runs:
        runs[run_key] = {
            "judge_model": judge_model,
            "start_time": datetime.now().isoformat(),
            "status": "running",
            "samples_file": samples_file,
            "prompts_file": prompts_file,
            "results": {}
        }
        save_json_file(runs, runs_file)

    run_data = runs[run_key]
    # Make sure the results container exists even for a stored run that lacks
    # it, so workers always write into the structure that gets saved.
    results = run_data.setdefault("results", {})

    # Scan every sample; anything without a complete, error-free judgement is
    # (re)processed. For a fresh run nothing is stored yet, so all items match.
    items_to_process = []
    for model_name, model_info in samples_data.items():
        samples_dict = model_info.get("samples", {})
        for iteration_key, iteration_items in samples_dict.items():
            for item_id, item_text in iteration_items.items():
                existing_result = (results.get(model_name, {})
                                        .get(iteration_key, {})
                                        .get(item_id, {}))

                needs_retry = (
                    not existing_result or
                    not existing_result.get("parsed_scores") or
                    len(existing_result.get("parsed_scores", {})) < 10 or
                    existing_result.get("aggregated_score_raw", 0.0) == 0.0 or
                    "error" in existing_result
                )

                if needs_retry:
                    items_to_process.append({
                        "model_name": model_name,
                        "iteration_key": iteration_key,
                        "item_id": item_id,
                        "item_text": item_text,
                        "prompt_template": judge_prompts.get(item_id, "")
                    })

    if items_to_process:
        logging.info(f"Found {len(items_to_process)} items to process in existing run {run_key}")
    else:
        logging.info(f"No items to process in existing run {run_key}")

    # Lock shared by all workers for access to ``runs`` and the runs file
    lock = threading.Lock()

    all_futures = []
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as exec_:
            executor = exec_

            try:
                if items_to_process:
                    for item in items_to_process:
                        if should_exit:
                            break

                        all_futures.append(
                            exec_.submit(
                                process_sample,
                                item["model_name"],
                                item["iteration_key"],
                                item["item_id"],
                                item["item_text"],
                                item["prompt_template"],
                                run_key,
                                runs,
                                runs_file,
                                lock,
                                judge_model,
                                save_raw_judge_output
                            )
                        )

                    # Display progress bar for tasks
                    for f in tqdm(concurrent.futures.as_completed(all_futures),
                                  total=len(all_futures), desc="Judging", leave=True):
                        if should_exit:
                            break
                        try:
                            f.result()
                        except Exception as exc:
                            logging.error(f"Exception in worker thread: {exc}")
            except KeyboardInterrupt:
                # Leaving the ``with`` block joins the pool. Raise the exit
                # flag and cancel unstarted work first, otherwise the join
                # would wait for every queued sample to be judged.
                should_exit = True
                for pending in all_futures:
                    pending.cancel()
                raise

    except KeyboardInterrupt:
        logging.warning("KeyboardInterrupt caught in main thread.")
        should_exit = True
        time.sleep(0.1)
    finally:
        # Mark run as interrupted or completed
        status = "interrupted" if should_exit else "completed"
        runs[run_key]["status"] = status
        runs[run_key]["end_time"] = datetime.now().isoformat()

        if not should_exit:
            # Stability test is currently disabled
            if False:
                run_stability_test(
                    run_data, judge_model,
                    judge_prompts, samples_data,
                    runs, runs_file,
                    lock, num_threads
                )
            # Compute final stats
            finalize_scores_and_compute_judgemark(runs, run_key, samples_data)

        # Save final
        save_json_file(runs, runs_file)

        if executor:
            logging.info("Shutting down executor")
            executor.shutdown(wait=False)
            executor = None

    logging.info(f"Judgemark-v2 run {run_key} ended with status: {status}")
    return run_key