wip

2026-06-27 16:10:14 +08:00 · 2025-07-26 08:54:38 +08:00
parent 6b13ac014b
commit f77bd9f2fb
5 changed files with 333 additions and 193 deletions
@@ -111,13 +111,14 @@ def process_sample(model_name: str, iteration_key: str, item_id: str, item_text:
            })
            save_json_file(runs, runs_file)

-def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data: dict, score_key="aggregated_score_raw"):
+def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data: dict, score_key="aggregated_score_raw", do_plot: int = 2, verbosity: int = 1):
    """
    Compute metrics for both raw and calibrated scores, including stability tests,
    normalized components, and detailed distributions.
    
    Now also returns a final_judgemark_score for BOTH raw and calibrated statistics.
    """
+    s = ""
    run_data = runs[run_key]
    results = run_data.get("results", {})

@@ -221,20 +222,21 @@ def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data
    )

    # 6. Separability metrics
-    compute_separability_metrics(run_data, raw_scores_by_model_all, label="raw")
-    compute_separability_metrics(run_data, calibrated_scores_by_model_all, label="calibrated")
+    s1, _ = compute_separability_metrics(run_data, raw_scores_by_model_all, label="raw")
+    s2, _ = compute_separability_metrics(run_data, calibrated_scores_by_model_all, label="calibrated")
+    s += s1
+    s += s2

-    
    # 8. Compute iteration stability for raw & calibrated
    compute_iteration_stability(run_data, label="raw")  
    compute_iteration_stability(run_data, label="calibrated")
    random_tau_raw = compute_randomized_iteration_rank_stability_by_item(run_data, label="raw", n_shuffles=1000)
    random_tau_cal = compute_randomized_iteration_rank_stability_by_item(run_data, label="calibrated", n_shuffles=1000)
-    logger.info("Score stability (RAW)")
-    logger.info(f"Randomized average Kendall's tau (raw): {random_tau_raw:.3f}")
-    logger.info("Score stability (CALIBRATED)") 
-    logger.info(f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} "
-                 f"({run_data['calibrated_cross_model_stats']['kendall_tau']})")
+    s += "Score stability (RAW)\n"
+    s += f"Randomized average Kendall's tau (raw): {random_tau_raw:.3f}\n"
+    s += "Score stability (CALIBRATED)\n"
+    s += f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} "
+    s += f"({run_data['calibrated_cross_model_stats']['kendall_tau']})\n"

    # 9. Compute the final Judgemark scores (one using raw stats, one using calibrated)

@@ -351,14 +353,14 @@ def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data
    run_data["final_judgemark_score"] = final_score_calibrated

    # 10. Create visualizations + logs
-    create_side_by_side_score_charts(run_data, run_data["judge_model"], samples_data, method=score_key[:3])
+    create_side_by_side_score_charts(run_data, run_data["judge_model"], samples_data, method=score_key[:3], do_plot=do_plot)
    
-    log_score_summary(
+    s += log_score_summary(
        "RAW SCORES", 
        run_data["raw_cross_model_stats"], 
        run_data["raw_model_stats"]
    )
-    log_score_summary(
+    s += log_score_summary(
        "CALIBRATED SCORES", 
        run_data["calibrated_cross_model_stats"],
        run_data["calibrated_model_stats"]
@@ -302,14 +302,15 @@ def apply_landmark_calibration(x, config):

 def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict):
    """Log a readable summary of score statistics."""
-    logger.info(f"\n------- {score_type} Summary -------")
-    logger.info(f"ANOVA F-value: {cross_stats['anova_f']:.4f}, p={cross_stats['anova_p']:.4f}")
-    logger.info(f"Kruskal-Wallis: {cross_stats['kw_stat']:.4f}, p={cross_stats['kw_p']:.4f}")
-    logger.info(f"Pearson r={cross_stats['pearson_r']:.4f}")
-    logger.info(f"Kendall τ={cross_stats['kendall_tau']:.4f}")
-    logger.info(f"Std.Dev across models: {cross_stats['std_dev_across_models']:.4f}")
-    
-    logger.info("\nModel Scores:")
+    s = ""
+    s += f"\n------- {score_type} Summary -------\n"
+    s += f"ANOVA F-value: {cross_stats['anova_f']:.4f}, p={cross_stats['anova_p']:.4f}\n"
+    s += f"Kruskal-Wallis: {cross_stats['kw_stat']:.4f}, p={cross_stats['kw_p']:.4f}\n"
+    s += f"Pearson r={cross_stats['pearson_r']:.4f}\n"
+    s += f"Kendall τ={cross_stats['kendall_tau']:.4f}\n"
+    s += f"Std.Dev across models: {cross_stats['std_dev_across_models']:.4f}\n"
+
+    s += "\nModel Scores:\n"
    sorted_models = sorted(
        model_stats.items(),
        key=lambda kv: kv[1]["mean"],
@@ -317,8 +318,10 @@ def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict):
    )
    for model, stats in sorted_models:
        line = f"{model:.<40} {stats['mean']:.3f} ±{stats['ci95']:.3f}"
-        logger.info(line)
-    logger.info("------------------------------------")
+        s += line + "\n"
+    s += "------------------------------------\n"
+    logger.info(s)
+    return s


 def compute_weighted_score(logp):
@@ -296,13 +296,15 @@ def compute_separability_metrics(
    metrics_label["modulated_ci95"] = modulated_ci95    

    # Logging summary
-    logger.info(f"\n--- {label.upper()} SEPARABILITY METRICS ---")
-    logger.info(f"Adjacent 99% CI Overlap fraction: {adj_frac_overlap:.3f}")
-    logger.info(f"Sum of adjacent 99% CI Overlap magnitude (scale={scale_factor}): "
-                 f"{sum_overlap_magnitude:.3f}")
-    logger.info(f"CI99 Overlap pct: "
-                 f"{ci99_overlap_percentage_adjacent_avg:.3f}")
-    
-    logger.info(f"Avg. |Cohen's d| for adjacent pairs: {avg_cohens_d:.3f}")
-    logger.info(f"Average EMD across all pairs: {emd_data['average']:.3f}")
-    logger.info(f"Avg. CI95 half-width: {avg_ci95:.3f} (modulated: {modulated_ci95:.3f})")
+    s = ""
+    s += f"\n--- {label.upper()} SEPARABILITY METRICS ---\n"
+    s += f"Adjacent 99% CI Overlap fraction: {adj_frac_overlap:.3f}\n"
+    s += f"Sum of adjacent 99% CI Overlap magnitude (scale={scale_factor}): "
+    s += f"{sum_overlap_magnitude:.3f}\n"
+    s += f"CI99 Overlap pct: "
+    s += f"{ci99_overlap_percentage_adjacent_avg:.3f}\n"
+    s += f"Avg. |Cohen's d| for adjacent pairs: {avg_cohens_d:.3f}\n"
+    s += f"Average EMD across all pairs: {emd_data['average']:.3f}\n"
+    s += f"Avg. CI95 half-width: {avg_ci95:.3f} (modulated: {modulated_ci95:.3f})\n"
+    logger.info(s)
+    return s, metrics_label
@@ -8,7 +8,7 @@ from scipy.stats import spearmanr, theilslopes
 from judgemark_v2lp.config.constants import NEGATIVE_MARKERS, MODEL_NAME_REPLACEMENTS


-def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_data: Dict, method: str = "raw"):
+def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_data: Dict, method: str = "raw", do_plot: int = 1):
    """
    Produces two figures:
      • Figure #1 with three subplots side-by-side:
@@ -86,6 +86,8 @@ def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_d
    heatmap_data = np.array(heatmap_rows, dtype=float)
    
    # 1.B) Plot the main figure with 3 subplots
+    if do_plot < 1:
+        return
    fig1, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 7))  # Increased height

    # Increase the base font size for all text elements
@@ -139,7 +141,7 @@ def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_d
    # Overall title
    
    sanitized_judge = re.sub(r"[^\w\-]", "-", judge_model.replace("/", "__"))
-    fig1.suptitle(f"Judgemark: Raw/Calibrated/Heatmap - Judge: {judge_model}", fontsize=20)
+    fig1.suptitle(f"Judgemark: Raw/Calibrated/Heatmap - Judge: {judge_model}. {method}", fontsize=20)
    
    # Adjust layout with more space
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
@@ -169,8 +171,10 @@ def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_d
        model_list_for_scatter = model_list_for_scatter[:16]

    # Build figure and subplots: 4x4
+    if do_plot<1:
+        return
    fig2, axes2 = plt.subplots(4, 4, figsize=(20, 20))  # each cell is a scatter
-    fig2.suptitle(f"Judgemark: Per-Model Length vs. Score - Judge: {judge_model}", fontsize=18)
+    fig2.suptitle(f"Judgemark: Per-Model Length vs. Score - Judge: {judge_model}. {method}", fontsize=18)
    
    # We might have fewer than 16 models. We'll track them by row & col.
    for idx, mname in enumerate(model_list_for_scatter):