mirror of
https://github.com/wassname/Judgemark-v2lp.git
synced 2026-06-27 16:10:14 +08:00
deepseek-r1
This commit is contained in:
@@ -9,17 +9,29 @@ This project compares different methods of extracting scores from language model
|
||||
- **Raw**: Traditional text-based scoring (baseline Judgemark method)
|
||||
- **Weighted**: Log probability weighting using normalized choice probabilities (similar to [G-Eval](https://arxiv.org/abs/2303.16634))
|
||||
- **Ranked**: Ranking-based evaluation using Kendall's tau correlation between log probability rankings and score rankings
|
||||
|
||||
****
|
||||
## Results
|
||||
|
||||
| name | judgemark_score_calib | stability_calib | separability_calib | human_correlation_calib |
|
||||
| ------------- | --------------------- | --------------- | ------------------ | ----------------------- |
|
||||
| ranked_scaled | **0.788** | **1.0** | **0.785** | 0.592 |
|
||||
| published | 0.761 | 0.894 | 0.691 | **0.908** |
|
||||
| ranked | 0.74 | 0.895 | 0.665 | 0.882 |
|
||||
| raw | 0.731 | 0.895 | 0.653 | 0.882 |
|
||||
| weighted | 0.716 | 0.886 | 0.633 | 0.876 |
|
||||
| ranked_norm | 0.575 | 0.644 | 0.506 | 0.781 |
|
||||
| weighted_norm | 0.545 | 0.547 | 0.49 | 0.761 |
|
||||
|
||||
| Method | Score | Score (Normalized) |
|
||||
|---------------|----------|------------|
|
||||
| ranked_scaled | 0.62 | 0.80 |
|
||||
| ranked_norm | 0.65 | 0.74 |
|
||||
| weighted | 0.63 | 0.65 |
|
||||
| raw (baseline)| 0.63 | 0.65 |
|
||||
| weighted_norm | 0.62 | 0.64 |
|
||||
*Results for DeepSeek R1*
|
||||
|
||||
|
||||
| Method | Score | Score (Normalized) |
|
||||
| -------------- | ----- | ------------------ |
|
||||
| ranked_scaled | 0.62 | 0.80 |
|
||||
| ranked_norm | 0.65 | 0.74 |
|
||||
| weighted | 0.63 | 0.65 |
|
||||
| raw (baseline) | 0.63 | 0.65 |
|
||||
| weighted_norm | 0.62 | 0.64 |
|
||||
|
||||
*Results for DeepSeek Chat V3 0324*
|
||||
|
||||
@@ -92,7 +104,16 @@ uv run python judgemark_v2.py \
|
||||
--save-raw-judge-output
|
||||
|
||||
uv run python judgemark_v2.py \
|
||||
--judge-model "deepseek/deepseek-chat-v3-0324" \
|
||||
--judge-model "qwen/qwen3-235b-a22b" \
|
||||
--samples-file data/judgemark_v2.1_samples.json \
|
||||
--prompts-file data/judge_prompts.json \
|
||||
--runs-file outputs/my_judgemark_runs2.json \
|
||||
--num-runs 1 \
|
||||
--save-raw-judge-output \
|
||||
--threads 8
|
||||
|
||||
uv run python judgemark_v2.py \
|
||||
--judge-model "deepseek/deepseek-r1" \
|
||||
--samples-file data/judgemark_v2.1_samples.json \
|
||||
--prompts-file data/judge_prompts.json \
|
||||
--runs-file outputs/my_judgemark_runs2.json \
|
||||
|
||||
+19
-10
@@ -224,8 +224,8 @@ def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data
|
||||
# 6. Separability metrics
|
||||
s1, _ = compute_separability_metrics(run_data, raw_scores_by_model_all, label="raw")
|
||||
s2, _ = compute_separability_metrics(run_data, calibrated_scores_by_model_all, label="calibrated")
|
||||
s += s1
|
||||
s += s2
|
||||
s += s1 + "\n"
|
||||
s += s2 + "\n"
|
||||
|
||||
# 8. Compute iteration stability for raw & calibrated
|
||||
compute_iteration_stability(run_data, label="raw")
|
||||
@@ -235,7 +235,7 @@ def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data
|
||||
s += "Score stability (RAW)\n"
|
||||
s += f"Randomized average Kendall's tau (raw): {random_tau_raw:.3f}\n"
|
||||
s += "Score stability (CALIBRATED)\n"
|
||||
s += f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} "
|
||||
s += f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} \n"
|
||||
s += f"({run_data['calibrated_cross_model_stats']['kendall_tau']})\n"
|
||||
|
||||
# 9. Compute the final Judgemark scores (one using raw stats, one using calibrated)
|
||||
@@ -359,18 +359,27 @@ def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data
|
||||
"RAW SCORES",
|
||||
run_data["raw_cross_model_stats"],
|
||||
run_data["raw_model_stats"]
|
||||
)
|
||||
) + "\n"
|
||||
s += log_score_summary(
|
||||
"CALIBRATED SCORES",
|
||||
run_data["calibrated_cross_model_stats"],
|
||||
run_data["calibrated_model_stats"]
|
||||
)
|
||||
|
||||
logger.info(f"Final Judgemark (raw) = {final_score_raw:.3f}")
|
||||
logger.info(f"Final Judgemark (cal) = {final_score_calibrated:.3f}")
|
||||
) + "\n"
|
||||
s += f"Final Judgemark (raw) = {final_score_raw:.3f}\n"
|
||||
s += f"Final Judgemark (cal) = {final_score_calibrated:.3f}\n"
|
||||
logger.info(s)
|
||||
return {
|
||||
"final_judgemark_score_raw": final_score_raw,
|
||||
"final_judgemark_score_calibrated": final_score_calibrated,
|
||||
"judgemark_score": final_score_raw,
|
||||
"judgemark_score_calib": final_score_calibrated,
|
||||
|
||||
"stability": raw_norm["kendall_tau_bootstrapped"],
|
||||
"stability_calib": norm["kendall_tau_bootstrapped"],
|
||||
|
||||
"separability": raw_separability,
|
||||
"separability_calib": calibrated_separability,
|
||||
|
||||
"human_correlation": raw_norm["kendall_tau"],
|
||||
"human_correlation_calib": norm["kendall_tau"],
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -12,8 +12,9 @@ STABILITY_REPS = 100
|
||||
# note the reference scores are from https://old.reddit.com/r/LocalLLaMA/comments/1cd2jco/judgemark_how_well_a_llm_judge_can_evaluate/
|
||||
# correlation with arena elo https://huggingface.co/spaces/lmarena-ai/chatbot-arena-leaderboard
|
||||
REFERENCE_MODEL_SCORES = {
|
||||
# "DeepSeek-R1": 1357,
|
||||
"DeepSeek-R1": 1430, # !! need to update this
|
||||
"gpt-4o-2024-11-20": 1402,
|
||||
"DeepSeek-R1": 1357,
|
||||
"gemini-1.5-pro-002": 1333,
|
||||
"gemini-1.5-pro-001": 1276,
|
||||
"claude-3-5-sonnet-20240620": 1243,
|
||||
|
||||
+10
-10
@@ -302,24 +302,24 @@ def apply_landmark_calibration(x, config):
|
||||
|
||||
def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict):
|
||||
"""Log a readable summary of score statistics."""
|
||||
s = ""
|
||||
s += f"\n------- {score_type} Summary -------"
|
||||
s += f"ANOVA F-value: {cross_stats['anova_f']:.4f}, p={cross_stats['anova_p']:.4f}"
|
||||
s += f"Kruskal-Wallis: {cross_stats['kw_stat']:.4f}, p={cross_stats['kw_p']:.4f}"
|
||||
s += f"Pearson r={cross_stats['pearson_r']:.4f}"
|
||||
s += f"Kendall τ={cross_stats['kendall_tau']:.4f}"
|
||||
s += f"Std.Dev across models: {cross_stats['std_dev_across_models']:.4f}"
|
||||
s = "\n\n"
|
||||
s += f"\n------- {score_type} Summary -------\n"
|
||||
s += f"ANOVA F-value: {cross_stats['anova_f']:.4f}, p={cross_stats['anova_p']:.4f}\n"
|
||||
s += f"Kruskal-Wallis: {cross_stats['kw_stat']:.4f}, p={cross_stats['kw_p']:.4f}\n"
|
||||
s += f"Pearson r={cross_stats['pearson_r']:.4f}\n"
|
||||
s += f"Kendall τ={cross_stats['kendall_tau']:.4f}\n"
|
||||
s += f"Std.Dev across models: {cross_stats['std_dev_across_models']:.4f}\n"
|
||||
|
||||
s += "\nModel Scores:"
|
||||
s += "\nModel Scores:\n"
|
||||
sorted_models = sorted(
|
||||
model_stats.items(),
|
||||
key=lambda kv: kv[1]["mean"],
|
||||
reverse=True
|
||||
)
|
||||
for model, stats in sorted_models:
|
||||
line = f"{model:.<40} {stats['mean']:.3f} ±{stats['ci95']:.3f}"
|
||||
line = f"{model:.<40} {stats['mean']:.3f} ±{stats['ci95']:.3f}\n"
|
||||
s += line
|
||||
s += "\n------------------------------------"
|
||||
s += "\n------------------------------------\n"
|
||||
logger.info(s)
|
||||
return s
|
||||
|
||||
|
||||
@@ -296,12 +296,12 @@ def compute_separability_metrics(
|
||||
metrics_label["modulated_ci95"] = modulated_ci95
|
||||
|
||||
# Logging summary
|
||||
s = ""
|
||||
s = "\n\n"
|
||||
s += f"\n--- {label.upper()} SEPARABILITY METRICS ---\n"
|
||||
s += f"Adjacent 99% CI Overlap fraction: {adj_frac_overlap:.3f}\n"
|
||||
s += f"Sum of adjacent 99% CI Overlap magnitude (scale={scale_factor}): "
|
||||
s += f"Sum of adjacent 99% CI Overlap magnitude (scale={scale_factor}): \n"
|
||||
s += f"{sum_overlap_magnitude:.3f}\n"
|
||||
s += f"CI99 Overlap pct: "
|
||||
s += f"CI99 Overlap pct: \n"
|
||||
s += f"{ci99_overlap_percentage_adjacent_avg:.3f}\n"
|
||||
s += f"Avg. |Cohen's d| for adjacent pairs: {avg_cohens_d:.3f}\n"
|
||||
s += f"Average EMD across all pairs: {emd_data['average']:.3f}\n"
|
||||
|
||||
@@ -38,8 +38,9 @@ def send_to_judge_model(messages: List[Dict], judge_model: str, max_retries: int
|
||||
|
||||
## openrouter specific
|
||||
"provider": {
|
||||
"require_parameters": True,
|
||||
"require_parameters": True,
|
||||
"order": [
|
||||
"Cerebras",
|
||||
"OpenAI",
|
||||
"Lambda",
|
||||
"DeepSeek",
|
||||
|
||||
+1239
-223
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user