mirror of
https://github.com/wassname/Judgemark-v2lp.git
synced 2026-06-27 16:10:14 +08:00
514 lines
22 KiB
Python
514 lines
22 KiB
Python
import os
|
|
import re
|
|
import uuid
|
|
import time
|
|
import signal
|
|
import logging
|
|
import threading
|
|
import concurrent.futures
|
|
from datetime import datetime
|
|
from typing import Dict, List
|
|
from tqdm import tqdm
|
|
from collections import defaultdict
|
|
|
|
from utils.file_io import load_json_file, save_json_file
|
|
from utils.api import send_to_judge_model
|
|
from utils.visualization import create_side_by_side_score_charts
|
|
import statistics
|
|
from core.scoring import (
|
|
parse_scores, compute_raw_score, compute_detailed_distribution,
|
|
compute_model_level_stats, compute_cross_model_stats,
|
|
build_landmark_calibration_config, apply_landmark_calibration,
|
|
log_score_summary, confidence_interval_95
|
|
)
|
|
from core.scoring import compute_detailed_distribution, compute_detailed_distribution # etc
|
|
from core.separability import compute_separability_metrics
|
|
from core.stability import run_stability_test, compute_iteration_stability, compute_randomized_iteration_rank_stability_by_item
|
|
from utils.stats import clamp
|
|
from utils.state import should_exit, executor
|
|
from utils.stats import normalize
|
|
|
|
def process_sample(model_name: str, iteration_key: str, item_id: str, item_text: str,
                   prompt_template: str, run_key: str, runs: Dict, runs_file: str,
                   lock: threading.Lock, judge_model: str, save_raw_judge_output: bool):
    """Judge a single sample and persist the outcome into ``runs``.

    The sample is skipped when the run already holds a usable result for it
    (at least 10 parsed scores and a positive ``aggregated_score_raw``), so
    calling this again for an item only re-judges failed or empty results.

    Args:
        model_name: Name of the model whose output is being judged.
        iteration_key: Iteration the sample belongs to.
        item_id: Identifier of the sample within the iteration.
        item_text: The model output to insert into the judge prompt.
        prompt_template: Judge prompt containing the ``[TEST MODEL RESPONSE]``
            placeholder (and optionally ``[TEST MODEL RESPONSE END]``).
        run_key: Key of the current run inside ``runs``.
        runs: Shared, mutable dict of all runs; updated in place.
        runs_file: Path passed to ``save_json_file`` after every update.
        lock: Lock guarding every read/write of ``runs`` and of the runs file.
        judge_model: Judge model identifier forwarded to ``send_to_judge_model``.
        save_raw_judge_output: When true, the raw judge response is stored too.

    Returns:
        None. Results (or an error record) are written into ``runs`` and saved.
    """
    # NOTE(review): ``should_exit`` is brought in with ``from utils.state import
    # should_exit``, so this module holds its own copy of the flag; confirm that
    # whatever sets the flag on shutdown updates this module's name as well.
    global should_exit
    if should_exit:
        return

    text_len = len(item_text)

    # Resolve (and create where missing) the nested result containers while
    # holding the lock: other workers serialize ``runs`` under the same lock,
    # and inserting keys into a dict that is being serialized is not safe.
    # ``setdefault`` on ``runs`` itself keeps the containers attached to the
    # shared structure instead of building throwaway dicts.
    with lock:
        run_data = runs.setdefault(run_key, {})
        results = run_data.setdefault("results", {})
        model_dict = results.setdefault(model_name, {})
        iteration_dict = model_dict.setdefault(iteration_key, {})

        existing_item = iteration_dict.get(item_id, {})
        already_scored = bool(
            existing_item
            and "aggregated_score_raw" in existing_item
            and existing_item.get("parsed_scores")
            and len(existing_item["parsed_scores"]) >= 10
            and existing_item["aggregated_score_raw"] > 0.0
        )

    if already_scored:
        return

    try:
        final_prompt = prompt_template.replace("[TEST MODEL RESPONSE]", item_text)
        final_prompt = final_prompt.replace("[TEST MODEL RESPONSE END]", "")

        messages = [{"role": "user", "content": final_prompt}]
        judge_response = send_to_judge_model(messages, judge_model=judge_model)

        extracted_scores = parse_scores(judge_response)
        raw_score = compute_raw_score(extracted_scores)

        storage_dict = {
            "parsed_scores": extracted_scores,
            "timestamp": datetime.now().isoformat(),
            "text_length": text_len
        }
        # The aggregate is only stored when it could be computed; its absence
        # is what marks the item as needing another attempt later.
        if raw_score is not None:
            storage_dict["aggregated_score_raw"] = raw_score
        if save_raw_judge_output:
            storage_dict["judge_response"] = judge_response

        with lock:
            iteration_dict[item_id] = storage_dict
            runs[run_key]["results"][model_name][iteration_key] = iteration_dict
            save_json_file(runs, runs_file)

        if raw_score is not None:
            logging.debug(f"Processed {model_name}/{iteration_key}/{item_id}, raw score: {raw_score:.2f}")
        else:
            logging.warning(f"Failed to parse enough scores for {model_name}/{iteration_key}/{item_id}")

    except Exception as e:
        # Best effort: record the failure on the item and in the run-level
        # error list so a later invocation can retry it.
        logging.error(f"Error processing item {model_name}/{iteration_key}/{item_id}: {str(e)}")
        with lock:
            iteration_dict[item_id] = {
                "error": str(e),
                "timestamp": datetime.now().isoformat()
            }
            runs[run_key].setdefault("errors", []).append({
                "model": model_name,
                "iteration": iteration_key,
                "item_id": item_id,
                "error": str(e)
            })
            save_json_file(runs, runs_file)
|
|
|
|
def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data: dict):
    """
    Compute metrics for both raw and calibrated scores and store them on the run.

    All outputs are written in place into ``runs[run_key]``; nothing is
    returned. This covers score distributions, the calibration config,
    per-model and cross-model stats, separability and iteration-stability
    metrics, and the final Judgemark scores. The raw aggregate is stored under
    ``final_judgemark_score_raw`` and the calibrated one under
    ``final_judgemark_score``, each with a ``final_judgemark_score_elements_*``
    breakdown of its normalized components.

    Args:
        runs: Dict of all runs; the entry at ``run_key`` is updated in place.
        run_key: Key of the run to finalize.
        samples_data: Per-model samples, used to look up the text length of
            every scored item and passed on to the chart helper.
    """
    run_data = runs[run_key]
    results = run_data.get("results", {})

    # 1. Collect raw scores, compute calibration, store calibrated values
    raw_scores_by_model_all = defaultdict(list)
    raw_scores_by_model_by_iter = defaultdict(lambda: defaultdict(list))
    calibrated_scores_by_model_all = defaultdict(list)
    calibrated_scores_by_model_by_iter = defaultdict(lambda: defaultdict(list))
    lengths_by_model = {}

    # -- Collect raw scores
    for model_name, iteration_data in results.items():
        if not isinstance(iteration_data, dict):
            continue

        lengths = []
        for it_key, it_val in iteration_data.items():
            if it_key == "__model_stats__":
                continue
            if not isinstance(it_val, dict):
                continue

            for item_id, item_info in it_val.items():
                if (isinstance(item_info, dict) and
                        "aggregated_score_raw" in item_info):
                    raw_score = item_info["aggregated_score_raw"]

                    # Collect raw score globally
                    raw_scores_by_model_all[model_name].append(raw_score)
                    # Collect raw score by iteration
                    raw_scores_by_model_by_iter[model_name][it_key].append(raw_score)

                    # Track text length for analyzing
                    text = (samples_data.get(model_name, {})
                            .get("samples", {})
                            .get(it_key, {})
                            .get(item_id, ""))
                    lengths.append(len(text))

        if len(raw_scores_by_model_all[model_name]) > 0:
            lengths_by_model[model_name] = lengths

    # 2. Distribution + calibration
    all_raw_scores = [s for scores in raw_scores_by_model_all.values() for s in scores]
    run_data["raw_score_distribution"] = compute_detailed_distribution(all_raw_scores)

    calibration_config = build_landmark_calibration_config(all_raw_scores, [0, 3, 5, 7, 10])
    run_data["calibration_config"] = calibration_config

    # Apply calibration
    for model_name, iteration_data in results.items():
        if not isinstance(iteration_data, dict):
            continue

        # Flatten model's raw scores, calibrate them
        raw_list = raw_scores_by_model_all[model_name]
        calibrated = [apply_landmark_calibration(s, calibration_config) for s in raw_list]

        # Re-walk iteration_data to assign each calibration back. This relies
        # on visiting items in exactly the same order (and with the same
        # filters) as the collection pass above, so ``idx`` lines up.
        idx = 0
        for it_key, it_val in iteration_data.items():
            if it_key == "__model_stats__":
                continue
            if not isinstance(it_val, dict):
                continue
            for item_id, item_info in it_val.items():
                if (isinstance(item_info, dict) and
                        "aggregated_score_raw" in item_info):
                    item_info["aggregated_score_calibrated"] = calibrated[idx]
                    idx += 1

        # Update calibrated_scores_by_model_by_iter in the same breakdown
        idx2 = 0
        for it_key in raw_scores_by_model_by_iter[model_name]:
            count_for_iter = len(raw_scores_by_model_by_iter[model_name][it_key])
            these_cals = calibrated[idx2: idx2 + count_for_iter]
            calibrated_scores_by_model_by_iter[model_name][it_key].extend(these_cals)
            idx2 += count_for_iter

        # Populate the single flattened list of calibrated scores
        calibrated_scores_by_model_all[model_name].extend(calibrated)

    # 3. Calibrated distributions
    all_calibrated_scores = [
        s for scores in calibrated_scores_by_model_all.values() for s in scores
    ]
    run_data["calibrated_score_distribution"] = compute_detailed_distribution(all_calibrated_scores)

    # 4. Model-level stats
    run_data["raw_model_stats"] = compute_model_level_stats(raw_scores_by_model_all, lengths_by_model)
    run_data["calibrated_model_stats"] = compute_model_level_stats(calibrated_scores_by_model_all, lengths_by_model)

    # 5. Cross-model stats
    run_data["raw_cross_model_stats"] = compute_cross_model_stats(
        scores_by_model_all=raw_scores_by_model_all,
        scores_by_model_by_iter=raw_scores_by_model_by_iter
    )
    run_data["calibrated_cross_model_stats"] = compute_cross_model_stats(
        scores_by_model_all=calibrated_scores_by_model_all,
        scores_by_model_by_iter=calibrated_scores_by_model_by_iter
    )

    # 6. Separability metrics (read back below from run_data["separability_metrics"])
    compute_separability_metrics(run_data, raw_scores_by_model_all, label="raw")
    compute_separability_metrics(run_data, calibrated_scores_by_model_all, label="calibrated")

    # 7. Compute iteration stability for raw & calibrated
    compute_iteration_stability(run_data, label="raw")
    compute_iteration_stability(run_data, label="calibrated")
    random_tau_raw = compute_randomized_iteration_rank_stability_by_item(run_data, label="raw", n_shuffles=1000)
    random_tau_cal = compute_randomized_iteration_rank_stability_by_item(run_data, label="calibrated", n_shuffles=1000)
    logging.info("Score stability (RAW)")
    logging.info(f"Randomized average Kendall's tau (raw): {random_tau_raw:.3f}")
    logging.info("Score stability (CALIBRATED)")
    logging.info(f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} "
                 f"({run_data['calibrated_cross_model_stats']['kendall_tau']})")

    # 8. Compute the final Judgemark scores (one using raw stats, one using calibrated)

    # -- (A) RAW Judgemark
    # Pull out raw stats + separability metrics
    raw_stats = run_data["raw_cross_model_stats"]
    raw_norm = raw_stats["normalized_components"]  # "std_dev", "kw_stat", etc.

    # Add your own normalization steps as needed
    raw_emd = run_data["separability_metrics"]["raw"]["emd"]["average"]
    raw_emd_norm = normalize(raw_emd, 0, 4)
    raw_overlap_mag = run_data["separability_metrics"]["raw"]["ci99_overlap_magnitude_sum"]
    raw_overlap_mag_norm = normalize(raw_overlap_mag, 0, 26, False)
    raw_norm["ci99_overlap_magnitude_sum_norm"] = raw_overlap_mag_norm

    # Range of raw model means
    raw_score_range = (
        max(run_data["raw_model_stats"][model]["mean"] for model in run_data["raw_model_stats"])
        - min(run_data["raw_model_stats"][model]["mean"] for model in run_data["raw_model_stats"])
    )
    run_data["raw_score_range"] = raw_score_range
    raw_score_range_norm = normalize(raw_score_range, 0, 8)
    raw_norm["raw_score_range_norm"] = raw_score_range_norm

    # Add Kendall's tau from the randomization-based stability measure
    raw_norm["kendall_tau_bootstrapped"] = normalize(random_tau_raw, 0.4, 1.0)

    # compute an aggregated separability metric
    raw_separability = (
        raw_norm["std_dev"]  # std deviation *between* models (separability)
        + raw_norm["kw_stat"]  # kruskal-wallis (separability)
        + raw_norm["ci99_overlap_magnitude_sum_norm"]  # confidence interval overlap between adjacently ranked models (separability)
        + raw_norm["raw_score_range_norm"]  # range of assigned scores (separability)
        + run_data["separability_metrics"]["raw"]["modulated_ci95"]  # average ci95 per model scored (score stability + separability)
        + raw_emd_norm  # earth-movers distance (separability)
    ) / 6.0

    # Combine into final raw Judgemark
    final_score_raw = (
        raw_norm["kendall_tau_bootstrapped"]  # correlation between iterations (ranking stability)
        + raw_norm["kendall_tau"]  # correlation with lmsys arena score (corr to human pref)
        + 4 * raw_separability  # aggregate of separability metrics
    ) / 6.0
    run_data["final_judgemark_score_elements_raw"] = {
        "norm_stability_between_iterations": raw_norm["kendall_tau_bootstrapped"],
        "norm_correlation_with_lmsys_arena": raw_norm["kendall_tau"],
        "norm_std_dev_between_models": raw_norm["std_dev"],
        "norm_kruskall_wallis": raw_norm["kw_stat"],
        "norm_ci99_adjacent_overlap": raw_norm["ci99_overlap_magnitude_sum_norm"],
        "norm_score_range": raw_norm["raw_score_range_norm"],
        "norm_intra_model_ci95": run_data["separability_metrics"]["raw"]["modulated_ci95"],
        "norm_earth_movers_distance": raw_emd_norm
    }
    run_data["final_judgemark_score_raw"] = final_score_raw

    # -- (B) Calibrated Judgemark
    cal_stats = run_data["calibrated_cross_model_stats"]
    norm = cal_stats["normalized_components"]

    emd_norm = normalize(run_data["separability_metrics"]["calibrated"]["emd"]["average"], 0, 4)
    overlap_magnitude_norm = normalize(
        run_data["separability_metrics"]["calibrated"]["ci99_overlap_magnitude_sum"], 0, 26, False
    )
    norm["ci99_overlap_magnitude_sum_norm"] = overlap_magnitude_norm

    # Range of calibrated model means
    calibrated_score_range = (
        max(run_data["calibrated_model_stats"][model]["mean"]
            for model in run_data["calibrated_model_stats"])
        - min(run_data["calibrated_model_stats"][model]["mean"]
              for model in run_data["calibrated_model_stats"])
    )
    run_data["calibrated_score_range"] = calibrated_score_range
    calibrated_score_range_norm = normalize(calibrated_score_range, 0, 8)
    norm["calibrated_score_range_norm"] = calibrated_score_range_norm

    # Kendall's tau from the randomized stability measure
    norm["kendall_tau_bootstrapped"] = normalize(random_tau_cal, 0.4, 1.0)

    # compute an aggregated separability metric
    calibrated_separability = (
        norm["std_dev"]  # std deviation *between* models (separability)
        + norm["kw_stat"]  # kruskal-wallis (separability)
        + norm["ci99_overlap_magnitude_sum_norm"]  # confidence interval overlap between adjacently ranked models (separability)
        + norm["calibrated_score_range_norm"]  # range of assigned scores (separability)
        + run_data["separability_metrics"]["calibrated"]["modulated_ci95"]  # average ci95 per model scored (score stability + separability)
        + emd_norm  # earth-movers distance (separability)
    ) / 6.0

    final_score_calibrated = (
        norm["kendall_tau_bootstrapped"]  # correlation between iterations (ranking stability)
        + norm["kendall_tau"]  # correlation with lmsys arena score (corr to human pref)
        + 4 * calibrated_separability  # aggregate of separability metrics
    ) / 6.0
    run_data["final_judgemark_score_elements_calibrated"] = {
        "norm_stability_between_iterations": norm["kendall_tau_bootstrapped"],
        "norm_correlation_with_lmsys_arena": norm["kendall_tau"],
        "norm_std_dev_between_models": norm["std_dev"],
        "norm_kruskall_wallis": norm["kw_stat"],
        "norm_ci99_adjacent_overlap": norm["ci99_overlap_magnitude_sum_norm"],
        "norm_score_range": norm["calibrated_score_range_norm"],
        "norm_intra_model_ci95": run_data["separability_metrics"]["calibrated"]["modulated_ci95"],
        # Store the scalar EMD component, mirroring the raw breakdown above
        # (previously the whole ``norm`` dict was stored here by mistake).
        "norm_earth_movers_distance": emd_norm
    }
    run_data["final_judgemark_score"] = final_score_calibrated

    # 9. Create visualizations + logs
    create_side_by_side_score_charts(run_data, run_data["judge_model"], samples_data)

    log_score_summary(
        "RAW SCORES",
        run_data["raw_cross_model_stats"],
        run_data["raw_model_stats"]
    )
    log_score_summary(
        "CALIBRATED SCORES",
        run_data["calibrated_cross_model_stats"],
        run_data["calibrated_model_stats"]
    )

    logging.info(f"Final Judgemark (raw) = {final_score_raw:.3f}")
    logging.info(f"Final Judgemark (cal) = {final_score_calibrated:.3f}")
|
|
|
|
|
|
def sanitize_model_name(name: str) -> str:
    """Make a judge model name safe to embed in a run key.

    Every run of characters other than ASCII letters, digits, ``_`` and ``-``
    is collapsed into a single underscore.
    """
    disallowed_run = re.compile(r'[^a-zA-Z0-9_-]+')
    return disallowed_run.sub('_', name)
|
|
|
|
def run_judgemark_v2(
    judge_model: str,
    samples_file: str,
    prompts_file: str,
    runs_file: str,
    num_threads: int,
    run_id: str = None,
    save_raw_judge_output: bool = False
) -> str:
    """Run (or resume) a Judgemark-v2 benchmark for one judge model.

    Loads the runs, samples and prompts files, queues every sample that has no
    usable result yet, judges the queued samples on a thread pool, and, unless
    the run was interrupted, computes the final statistics. The runs file is
    saved at the end in either case.

    Args:
        judge_model: Identifier of the judge model.
        samples_file: Path of the JSON file with per-model samples.
        prompts_file: Path of the JSON file mapping item ids to judge prompts.
        runs_file: Path of the JSON file holding all runs; read and rewritten.
        num_threads: Maximum number of worker threads.
        run_id: Optional run identifier; a random UUID is used when omitted.
            Passing the id of an earlier run resumes that run.
        save_raw_judge_output: When true, raw judge responses are stored too.

    Returns:
        The run key, ``"<run id>__<sanitized judge model>"``.
    """
    # NOTE(review): ``executor`` and ``should_exit`` are imported by name from
    # utils.state, so the assignments below rebind this module's copies only;
    # confirm that is what the shutdown/signal handling expects.
    global executor, should_exit

    logging.info(f"Starting Judgemark-v2 using judge model: {judge_model}")
    runs = load_json_file(runs_file)

    # Form the run key using run_id + "__" + sanitized judge model
    sanitized_jm = sanitize_model_name(judge_model)
    base_id = run_id if run_id else str(uuid.uuid4())
    run_key = f"{base_id}__{sanitized_jm}"

    # Load data files
    samples_data = load_json_file(samples_file)
    judge_prompts = load_json_file(prompts_file)

    # Initialize or get existing run data
    if run_key not in runs:
        runs[run_key] = {
            "judge_model": judge_model,
            "start_time": datetime.now().isoformat(),
            "status": "running",
            "samples_file": samples_file,
            "prompts_file": prompts_file,
            "results": {}
        }
        save_json_file(runs, runs_file)

    run_data = runs[run_key]

    # A single scan covers both fresh and resumed runs: a fresh run simply has
    # no stored results, so every item is queued. ``setdefault`` also repairs
    # a persisted run entry that lacks "results", so workers can store into it.
    results = run_data.setdefault("results", {})

    items_to_process = []
    for model_name, model_info in samples_data.items():
        samples_dict = model_info.get("samples", {})
        for iteration_key, iteration_items in samples_dict.items():
            for item_id, item_text in iteration_items.items():
                # Check if this item needs processing
                existing_result = (results.get(model_name, {})
                                   .get(iteration_key, {})
                                   .get(item_id, {}))

                needs_retry = (
                    not existing_result or
                    not existing_result.get("parsed_scores") or
                    len(existing_result.get("parsed_scores", {})) < 10 or
                    existing_result.get("aggregated_score_raw", 0.0) == 0.0 or
                    "error" in existing_result
                )

                if needs_retry:
                    items_to_process.append({
                        "model_name": model_name,
                        "iteration_key": iteration_key,
                        "item_id": item_id,
                        "item_text": item_text,
                        "prompt_template": judge_prompts.get(item_id, "")
                    })

    if items_to_process:
        logging.info(f"Found {len(items_to_process)} items to process in existing run {run_key}")
    else:
        logging.info(f"No items to process in existing run {run_key}")

    # Ensure concurrency lock
    lock = threading.Lock()

    # Process any items that need retrying
    all_futures = []
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as exec_:
            executor = exec_

            if items_to_process:
                # Process all items (either retries or new run)
                for item in items_to_process:
                    if should_exit:
                        break

                    all_futures.append(
                        executor.submit(
                            process_sample,
                            item["model_name"],
                            item["iteration_key"],
                            item["item_id"],
                            item["item_text"],
                            item["prompt_template"],
                            run_key,
                            runs,
                            runs_file,
                            lock,
                            judge_model,
                            save_raw_judge_output
                        )
                    )

                # Display progress bar for tasks
                for f in tqdm(concurrent.futures.as_completed(all_futures),
                              total=len(all_futures), desc="Judging", leave=True):
                    if should_exit:
                        break
                    try:
                        f.result()
                    except Exception as exc:
                        logging.error(f"Exception in worker thread: {exc}")

    except KeyboardInterrupt:
        logging.warning("KeyboardInterrupt caught in main thread.")
        should_exit = True
        time.sleep(0.1)
    finally:
        # Mark run as interrupted or completed
        status = "interrupted" if should_exit else "completed"
        runs[run_key]["status"] = status
        runs[run_key]["end_time"] = datetime.now().isoformat()

        if not should_exit:
            # Run stability test (intentionally disabled via the constant guard)
            if False:
                run_stability_test(
                    run_data, judge_model,
                    judge_prompts, samples_data,
                    runs, runs_file,
                    lock, num_threads
                )
            # Compute final stats
            finalize_scores_and_compute_judgemark(runs, run_key, samples_data)

        # Save final
        save_json_file(runs, runs_file)

    if executor:
        logging.info("Shutting down executor")
        executor.shutdown(wait=False)
        executor = None

    logging.info(f"Judgemark-v2 run {run_key} ended with status: {status}")
    return run_key