This commit is contained in:
wassname
2025-07-26 08:54:38 +08:00
parent 6b13ac014b
commit f77bd9f2fb
5 changed files with 333 additions and 193 deletions
+14 -12
View File
@@ -111,13 +111,14 @@ def process_sample(model_name: str, iteration_key: str, item_id: str, item_text:
})
save_json_file(runs, runs_file)
def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data: dict, score_key="aggregated_score_raw"):
def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data: dict, score_key="aggregated_score_raw", do_plot: int = 2, verbosity: int = 1):
"""
Compute metrics for both raw and calibrated scores, including stability tests,
normalized components, and detailed distributions.
Now also returns a final_judgemark_score for BOTH raw and calibrated statistics.
"""
s = ""
run_data = runs[run_key]
results = run_data.get("results", {})
@@ -221,20 +222,21 @@ def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data
)
# 6. Separability metrics
compute_separability_metrics(run_data, raw_scores_by_model_all, label="raw")
compute_separability_metrics(run_data, calibrated_scores_by_model_all, label="calibrated")
s1, _ = compute_separability_metrics(run_data, raw_scores_by_model_all, label="raw")
s2, _ = compute_separability_metrics(run_data, calibrated_scores_by_model_all, label="calibrated")
s += s1
s += s2
# 8. Compute iteration stability for raw & calibrated
compute_iteration_stability(run_data, label="raw")
compute_iteration_stability(run_data, label="calibrated")
random_tau_raw = compute_randomized_iteration_rank_stability_by_item(run_data, label="raw", n_shuffles=1000)
random_tau_cal = compute_randomized_iteration_rank_stability_by_item(run_data, label="calibrated", n_shuffles=1000)
logger.info("Score stability (RAW)")
logger.info(f"Randomized average Kendall's tau (raw): {random_tau_raw:.3f}")
logger.info("Score stability (CALIBRATED)")
logger.info(f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} "
f"({run_data['calibrated_cross_model_stats']['kendall_tau']})")
s += "Score stability (RAW)\n"
s += f"Randomized average Kendall's tau (raw): {random_tau_raw:.3f}\n"
s += "Score stability (CALIBRATED)\n"
s += f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} "
s += f"({run_data['calibrated_cross_model_stats']['kendall_tau']})\n"
# 9. Compute the final Judgemark scores (one using raw stats, one using calibrated)
@@ -351,14 +353,14 @@ def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data
run_data["final_judgemark_score"] = final_score_calibrated
# 10. Create visualizations + logs
create_side_by_side_score_charts(run_data, run_data["judge_model"], samples_data, method=score_key[:3])
create_side_by_side_score_charts(run_data, run_data["judge_model"], samples_data, method=score_key[:3], do_plot=do_plot)
log_score_summary(
s += log_score_summary(
"RAW SCORES",
run_data["raw_cross_model_stats"],
run_data["raw_model_stats"]
)
log_score_summary(
s += log_score_summary(
"CALIBRATED SCORES",
run_data["calibrated_cross_model_stats"],
run_data["calibrated_model_stats"]
+13 -10
View File
@@ -302,14 +302,15 @@ def apply_landmark_calibration(x, config):
def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict):
"""Log a readable summary of score statistics."""
logger.info(f"\n------- {score_type} Summary -------")
logger.info(f"ANOVA F-value: {cross_stats['anova_f']:.4f}, p={cross_stats['anova_p']:.4f}")
logger.info(f"Kruskal-Wallis: {cross_stats['kw_stat']:.4f}, p={cross_stats['kw_p']:.4f}")
logger.info(f"Pearson r={cross_stats['pearson_r']:.4f}")
logger.info(f"Kendall τ={cross_stats['kendall_tau']:.4f}")
logger.info(f"Std.Dev across models: {cross_stats['std_dev_across_models']:.4f}")
logger.info("\nModel Scores:")
# Accumulate the summary in one string so it can be logged once and returned.
# Each entry ends with its own newline: the per-entry logger.info calls this
# replaces put every statistic on a separate line, and plain concatenation
# would otherwise fuse them into a single line.
s = ""
s += f"\n------- {score_type} Summary -------\n"
s += f"ANOVA F-value: {cross_stats['anova_f']:.4f}, p={cross_stats['anova_p']:.4f}\n"
s += f"Kruskal-Wallis: {cross_stats['kw_stat']:.4f}, p={cross_stats['kw_p']:.4f}\n"
s += f"Pearson r={cross_stats['pearson_r']:.4f}\n"
s += f"Kendall τ={cross_stats['kendall_tau']:.4f}\n"
s += f"Std.Dev across models: {cross_stats['std_dev_across_models']:.4f}\n"
s += "\nModel Scores:"
sorted_models = sorted(
model_stats.items(),
key=lambda kv: kv[1]["mean"],
@@ -317,8 +318,10 @@ def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict):
)
for model, stats in sorted_models:
line = f"{model:.<40} {stats['mean']:.3f} ±{stats['ci95']:.3f}"
logger.info(line)
logger.info("------------------------------------")
# Start every model row on a fresh line; the row text carries no newline of
# its own, so appending it bare would run all rows together.
s += "\n" + line
s += "\n------------------------------------"
logger.info(s)
return s
def compute_weighted_score(logp):
+12 -10
View File
@@ -296,13 +296,15 @@ def compute_separability_metrics(
metrics_label["modulated_ci95"] = modulated_ci95
# Logging summary
logger.info(f"\n--- {label.upper()} SEPARABILITY METRICS ---")
logger.info(f"Adjacent 99% CI Overlap fraction: {adj_frac_overlap:.3f}")
logger.info(f"Sum of adjacent 99% CI Overlap magnitude (scale={scale_factor}): "
f"{sum_overlap_magnitude:.3f}")
logger.info(f"CI99 Overlap pct: "
f"{ci99_overlap_percentage_adjacent_avg:.3f}")
logger.info(f"Avg. |Cohen's d| for adjacent pairs: {avg_cohens_d:.3f}")
logger.info(f"Average EMD across all pairs: {emd_data['average']:.3f}")
logger.info(f"Avg. CI95 half-width: {avg_ci95:.3f} (modulated: {modulated_ci95:.3f})")
s = ""
s += f"\n--- {label.upper()} SEPARABILITY METRICS ---\n"
s += f"Adjacent 99% CI Overlap fraction: {adj_frac_overlap:.3f}\n"
s += f"Sum of adjacent 99% CI Overlap magnitude (scale={scale_factor}): "
s += f"{sum_overlap_magnitude:.3f}\n"
s += f"CI99 Overlap pct: "
s += f"{ci99_overlap_percentage_adjacent_avg:.3f}\n"
s += f"Avg. |Cohen's d| for adjacent pairs: {avg_cohens_d:.3f}\n"
s += f"Average EMD across all pairs: {emd_data['average']:.3f}\n"
s += f"Avg. CI95 half-width: {avg_ci95:.3f} (modulated: {modulated_ci95:.3f})\n"
logger.info(s)
return s, metrics_label
+7 -3
View File
@@ -8,7 +8,7 @@ from scipy.stats import spearmanr, theilslopes
from judgemark_v2lp.config.constants import NEGATIVE_MARKERS, MODEL_NAME_REPLACEMENTS
def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_data: Dict, method: str = "raw"):
def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_data: Dict, method: str = "raw", do_plot: int = 1):
"""
Produces two figures:
• Figure #1 with three subplots side-by-side:
@@ -86,6 +86,8 @@ def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_d
heatmap_data = np.array(heatmap_rows, dtype=float)
# 1.B) Plot the main figure with 3 subplots
if do_plot < 1:
return
fig1, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 7)) # Increased height
# Increase the base font size for all text elements
@@ -139,7 +141,7 @@ def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_d
# Overall title
sanitized_judge = re.sub(r"[^\w\-]", "-", judge_model.replace("/", "__"))
fig1.suptitle(f"Judgemark: Raw/Calibrated/Heatmap - Judge: {judge_model}", fontsize=20)
fig1.suptitle(f"Judgemark: Raw/Calibrated/Heatmap - Judge: {judge_model}. {method}", fontsize=20)
# Adjust layout with more space
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
@@ -169,8 +171,10 @@ def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_d
model_list_for_scatter = model_list_for_scatter[:16]
# Build figure and subplots: 4x4
if do_plot<1:
return
fig2, axes2 = plt.subplots(4, 4, figsize=(20, 20)) # each cell is a scatter
fig2.suptitle(f"Judgemark: Per-Model Length vs. Score - Judge: {judge_model}", fontsize=18)
fig2.suptitle(f"Judgemark: Per-Model Length vs. Score - Judge: {judge_model}. {method}", fontsize=18)
# We might have fewer than 16 models. We'll track them by row & col.
for idx, mname in enumerate(model_list_for_scatter):
+287 -158
View File
File diff suppressed because one or more lines are too long