mirror of
https://github.com/wassname/Judgemark-v2lp.git
synced 2026-06-27 16:10:14 +08:00
wip
This commit is contained in:
+14
-12
@@ -111,13 +111,14 @@ def process_sample(model_name: str, iteration_key: str, item_id: str, item_text:
|
||||
})
|
||||
save_json_file(runs, runs_file)
|
||||
|
||||
def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data: dict, score_key="aggregated_score_raw"):
|
||||
def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data: dict, score_key="aggregated_score_raw", do_plot: int = 2, verbosity: int = 1):
|
||||
"""
|
||||
Compute metrics for both raw and calibrated scores, including stability tests,
|
||||
normalized components, and detailed distributions.
|
||||
|
||||
Now also returns a final_judgemark_score for BOTH raw and calibrated statistics.
|
||||
"""
|
||||
s = ""
|
||||
run_data = runs[run_key]
|
||||
results = run_data.get("results", {})
|
||||
|
||||
@@ -221,20 +222,21 @@ def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data
|
||||
)
|
||||
|
||||
# 6. Separability metrics
|
||||
compute_separability_metrics(run_data, raw_scores_by_model_all, label="raw")
|
||||
compute_separability_metrics(run_data, calibrated_scores_by_model_all, label="calibrated")
|
||||
s1, _ = compute_separability_metrics(run_data, raw_scores_by_model_all, label="raw")
|
||||
s2, _ = compute_separability_metrics(run_data, calibrated_scores_by_model_all, label="calibrated")
|
||||
s += s1
|
||||
s += s2
|
||||
|
||||
|
||||
# 8. Compute iteration stability for raw & calibrated
|
||||
compute_iteration_stability(run_data, label="raw")
|
||||
compute_iteration_stability(run_data, label="calibrated")
|
||||
random_tau_raw = compute_randomized_iteration_rank_stability_by_item(run_data, label="raw", n_shuffles=1000)
|
||||
random_tau_cal = compute_randomized_iteration_rank_stability_by_item(run_data, label="calibrated", n_shuffles=1000)
|
||||
logger.info("Score stability (RAW)")
|
||||
logger.info(f"Randomized average Kendall's tau (raw): {random_tau_raw:.3f}")
|
||||
logger.info("Score stability (CALIBRATED)")
|
||||
logger.info(f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} "
|
||||
f"({run_data['calibrated_cross_model_stats']['kendall_tau']})")
|
||||
s += "Score stability (RAW)\n"
|
||||
s += f"Randomized average Kendall's tau (raw): {random_tau_raw:.3f}\n"
|
||||
s += "Score stability (CALIBRATED)\n"
|
||||
s += f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} "
|
||||
s += f"({run_data['calibrated_cross_model_stats']['kendall_tau']})\n"
|
||||
|
||||
# 9. Compute the final Judgemark scores (one using raw stats, one using calibrated)
|
||||
|
||||
@@ -351,14 +353,14 @@ def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data
|
||||
run_data["final_judgemark_score"] = final_score_calibrated
|
||||
|
||||
# 10. Create visualizations + logs
|
||||
create_side_by_side_score_charts(run_data, run_data["judge_model"], samples_data, method=score_key[:3])
|
||||
create_side_by_side_score_charts(run_data, run_data["judge_model"], samples_data, method=score_key[:3], do_plot=do_plot)
|
||||
|
||||
log_score_summary(
|
||||
s += log_score_summary(
|
||||
"RAW SCORES",
|
||||
run_data["raw_cross_model_stats"],
|
||||
run_data["raw_model_stats"]
|
||||
)
|
||||
log_score_summary(
|
||||
s += log_score_summary(
|
||||
"CALIBRATED SCORES",
|
||||
run_data["calibrated_cross_model_stats"],
|
||||
run_data["calibrated_model_stats"]
|
||||
|
||||
+13
-10
@@ -302,14 +302,15 @@ def apply_landmark_calibration(x, config):
|
||||
|
||||
def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict):
|
||||
"""Log a readable summary of score statistics."""
|
||||
logger.info(f"\n------- {score_type} Summary -------")
|
||||
logger.info(f"ANOVA F-value: {cross_stats['anova_f']:.4f}, p={cross_stats['anova_p']:.4f}")
|
||||
logger.info(f"Kruskal-Wallis: {cross_stats['kw_stat']:.4f}, p={cross_stats['kw_p']:.4f}")
|
||||
logger.info(f"Pearson r={cross_stats['pearson_r']:.4f}")
|
||||
logger.info(f"Kendall τ={cross_stats['kendall_tau']:.4f}")
|
||||
logger.info(f"Std.Dev across models: {cross_stats['std_dev_across_models']:.4f}")
|
||||
|
||||
logger.info("\nModel Scores:")
|
||||
s = ""
|
||||
s += f"\n------- {score_type} Summary -------"
|
||||
s += f"ANOVA F-value: {cross_stats['anova_f']:.4f}, p={cross_stats['anova_p']:.4f}"
|
||||
s += f"Kruskal-Wallis: {cross_stats['kw_stat']:.4f}, p={cross_stats['kw_p']:.4f}"
|
||||
s += f"Pearson r={cross_stats['pearson_r']:.4f}"
|
||||
s += f"Kendall τ={cross_stats['kendall_tau']:.4f}"
|
||||
s += f"Std.Dev across models: {cross_stats['std_dev_across_models']:.4f}"
|
||||
|
||||
s += "\nModel Scores:"
|
||||
sorted_models = sorted(
|
||||
model_stats.items(),
|
||||
key=lambda kv: kv[1]["mean"],
|
||||
@@ -317,8 +318,10 @@ def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict):
|
||||
)
|
||||
for model, stats in sorted_models:
|
||||
line = f"{model:.<40} {stats['mean']:.3f} ±{stats['ci95']:.3f}"
|
||||
logger.info(line)
|
||||
logger.info("------------------------------------")
|
||||
s += line
|
||||
s += "\n------------------------------------"
|
||||
logger.info(s)
|
||||
return s
|
||||
|
||||
|
||||
def compute_weighted_score(logp):
|
||||
|
||||
@@ -296,13 +296,15 @@ def compute_separability_metrics(
|
||||
metrics_label["modulated_ci95"] = modulated_ci95
|
||||
|
||||
# Logging summary
|
||||
logger.info(f"\n--- {label.upper()} SEPARABILITY METRICS ---")
|
||||
logger.info(f"Adjacent 99% CI Overlap fraction: {adj_frac_overlap:.3f}")
|
||||
logger.info(f"Sum of adjacent 99% CI Overlap magnitude (scale={scale_factor}): "
|
||||
f"{sum_overlap_magnitude:.3f}")
|
||||
logger.info(f"CI99 Overlap pct: "
|
||||
f"{ci99_overlap_percentage_adjacent_avg:.3f}")
|
||||
|
||||
logger.info(f"Avg. |Cohen's d| for adjacent pairs: {avg_cohens_d:.3f}")
|
||||
logger.info(f"Average EMD across all pairs: {emd_data['average']:.3f}")
|
||||
logger.info(f"Avg. CI95 half-width: {avg_ci95:.3f} (modulated: {modulated_ci95:.3f})")
|
||||
s = ""
|
||||
s += f"\n--- {label.upper()} SEPARABILITY METRICS ---\n"
|
||||
s += f"Adjacent 99% CI Overlap fraction: {adj_frac_overlap:.3f}\n"
|
||||
s += f"Sum of adjacent 99% CI Overlap magnitude (scale={scale_factor}): "
|
||||
s += f"{sum_overlap_magnitude:.3f}\n"
|
||||
s += f"CI99 Overlap pct: "
|
||||
s += f"{ci99_overlap_percentage_adjacent_avg:.3f}\n"
|
||||
s += f"Avg. |Cohen's d| for adjacent pairs: {avg_cohens_d:.3f}\n"
|
||||
s += f"Average EMD across all pairs: {emd_data['average']:.3f}\n"
|
||||
s += f"Avg. CI95 half-width: {avg_ci95:.3f} (modulated: {modulated_ci95:.3f})\n"
|
||||
logger.info(s)
|
||||
return s, metrics_label
|
||||
|
||||
@@ -8,7 +8,7 @@ from scipy.stats import spearmanr, theilslopes
|
||||
from judgemark_v2lp.config.constants import NEGATIVE_MARKERS, MODEL_NAME_REPLACEMENTS
|
||||
|
||||
|
||||
def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_data: Dict, method: str = "raw"):
|
||||
def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_data: Dict, method: str = "raw", do_plot: int = 1):
|
||||
"""
|
||||
Produces two figures:
|
||||
• Figure #1 with three subplots side-by-side:
|
||||
@@ -86,6 +86,8 @@ def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_d
|
||||
heatmap_data = np.array(heatmap_rows, dtype=float)
|
||||
|
||||
# 1.B) Plot the main figure with 3 subplots
|
||||
if do_plot < 1:
|
||||
return
|
||||
fig1, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 7)) # Increased height
|
||||
|
||||
# Increase the base font size for all text elements
|
||||
@@ -139,7 +141,7 @@ def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_d
|
||||
# Overall title
|
||||
|
||||
sanitized_judge = re.sub(r"[^\w\-]", "-", judge_model.replace("/", "__"))
|
||||
fig1.suptitle(f"Judgemark: Raw/Calibrated/Heatmap - Judge: {judge_model}", fontsize=20)
|
||||
fig1.suptitle(f"Judgemark: Raw/Calibrated/Heatmap - Judge: {judge_model}. {method}", fontsize=20)
|
||||
|
||||
# Adjust layout with more space
|
||||
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
|
||||
@@ -169,8 +171,10 @@ def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_d
|
||||
model_list_for_scatter = model_list_for_scatter[:16]
|
||||
|
||||
# Build figure and subplots: 4x4
|
||||
if do_plot<1:
|
||||
return
|
||||
fig2, axes2 = plt.subplots(4, 4, figsize=(20, 20)) # each cell is a scatter
|
||||
fig2.suptitle(f"Judgemark: Per-Model Length vs. Score - Judge: {judge_model}", fontsize=18)
|
||||
fig2.suptitle(f"Judgemark: Per-Model Length vs. Score - Judge: {judge_model}. {method}", fontsize=18)
|
||||
|
||||
# We might have fewer than 16 models. We'll track them by row & col.
|
||||
for idx, mname in enumerate(model_list_for_scatter):
|
||||
|
||||
+287
-158
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user