deepseek-r1

2026-06-27 16:10:14 +08:00 · 2025-07-28 18:34:04 +08:00
parent 1c0cce874d
commit e0a45b36df
7 changed files with 1305 additions and 257 deletions
@@ -9,17 +9,29 @@ This project compares different methods of extracting scores from language model
 - **Raw**: Traditional text-based scoring (baseline Judgemark method)
 - **Weighted**: Log probability weighting using normalized choice probabilities (similar to [G-Eval](https://arxiv.org/abs/2303.16634))
 - **Ranked**: Ranking-based evaluation using Kendall's tau correlation between log probability rankings and score rankings
-
+****
 ## Results

+| name          | judgemark_score_calib | stability_calib | separability_calib | human_correlation_calib |
+| ------------- | --------------------- | --------------- | ------------------ | ----------------------- |
+| ranked_scaled | **0.788**             | **1.0**         | **0.785**          | 0.592                   |
+| published     | 0.761                 | 0.894           | 0.691              | **0.908**               |
+| ranked        | 0.74                  | 0.895           | 0.665              | 0.882                   |
+| raw           | 0.731                 | 0.895           | 0.653              | 0.882                   |
+| weighted      | 0.716                 | 0.886           | 0.633              | 0.876                   |
+| ranked_norm   | 0.575                 | 0.644           | 0.506              | 0.781                   |
+| weighted_norm | 0.545                 | 0.547           | 0.49               | 0.761                   |

-| Method        | Score    | Score (Normalized) |
-|---------------|----------|------------|
-| ranked_scaled |     0.62 |       0.80 |
-| ranked_norm   |     0.65 |       0.74 |
-| weighted      |     0.63 |       0.65 |
-| raw (baseline)|     0.63 |       0.65 |
-| weighted_norm |     0.62 |       0.64 |
+*Results for DeepSeek R1*
+
+
+| Method         | Score | Score (Normalized) |
+| -------------- | ----- | ------------------ |
+| ranked_scaled  | 0.62  | 0.80               |
+| ranked_norm    | 0.65  | 0.74               |
+| weighted       | 0.63  | 0.65               |
+| raw (baseline) | 0.63  | 0.65               |
+| weighted_norm  | 0.62  | 0.64               |

 *Results for DeepSeek Chat V3 0324*

@@ -92,7 +104,16 @@ uv run python judgemark_v2.py \
  --save-raw-judge-output

 uv run python judgemark_v2.py \
-  --judge-model "deepseek/deepseek-chat-v3-0324" \
+  --judge-model "qwen/qwen3-235b-a22b" \
+  --samples-file data/judgemark_v2.1_samples.json \
+  --prompts-file data/judge_prompts.json \
+  --runs-file outputs/my_judgemark_runs2.json \
+  --num-runs 1 \
+  --save-raw-judge-output \
+  --threads 8
+
+uv run python judgemark_v2.py \
+  --judge-model "deepseek/deepseek-r1" \
  --samples-file data/judgemark_v2.1_samples.json \
  --prompts-file data/judge_prompts.json \
  --runs-file outputs/my_judgemark_runs2.json \
@@ -224,8 +224,8 @@ def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data
    # 6. Separability metrics
    s1, _ = compute_separability_metrics(run_data, raw_scores_by_model_all, label="raw")
    s2, _ = compute_separability_metrics(run_data, calibrated_scores_by_model_all, label="calibrated")
-    s += s1
-    s += s2
+    s += s1 + "\n"
+    s += s2 + "\n"

    # 8. Compute iteration stability for raw & calibrated
    compute_iteration_stability(run_data, label="raw")  
@@ -235,7 +235,7 @@ def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data
    s += "Score stability (RAW)\n"
    s += f"Randomized average Kendall's tau (raw): {random_tau_raw:.3f}\n"
    s += "Score stability (CALIBRATED)\n"
-    s += f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} "
+    s += f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} \n"
    s += f"({run_data['calibrated_cross_model_stats']['kendall_tau']})\n"

    # 9. Compute the final Judgemark scores (one using raw stats, one using calibrated)
@@ -359,18 +359,27 @@ def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data
        "RAW SCORES", 
        run_data["raw_cross_model_stats"], 
        run_data["raw_model_stats"]
-    )
+    ) + "\n"
    s += log_score_summary(
        "CALIBRATED SCORES", 
        run_data["calibrated_cross_model_stats"],
        run_data["calibrated_model_stats"]
-    )
-
-    logger.info(f"Final Judgemark (raw)   = {final_score_raw:.3f}")
-    logger.info(f"Final Judgemark (cal)  = {final_score_calibrated:.3f}")
+    ) + "\n"
+    s += f"Final Judgemark (raw)   = {final_score_raw:.3f}\n"
+    s += f"Final Judgemark (cal)  = {final_score_calibrated:.3f}\n"
+    logger.info(s)
    return {
-        "final_judgemark_score_raw": final_score_raw,
-        "final_judgemark_score_calibrated": final_score_calibrated,
+        "judgemark_score": final_score_raw,
+        "judgemark_score_calib": final_score_calibrated,
+
+        "stability": raw_norm["kendall_tau_bootstrapped"],
+        "stability_calib": norm["kendall_tau_bootstrapped"],
+
+        "separability": raw_separability,
+        "separability_calib": calibrated_separability,
+
+        "human_correlation": raw_norm["kendall_tau"],
+        "human_correlation_calib": norm["kendall_tau"],
    }


@@ -12,8 +12,9 @@ STABILITY_REPS = 100
 # note: the reference scores are from https://old.reddit.com/r/LocalLLaMA/comments/1cd2jco/judgemark_how_well_a_llm_judge_can_evaluate/
 #     correlation with arena elo https://huggingface.co/spaces/lmarena-ai/chatbot-arena-leaderboard
 REFERENCE_MODEL_SCORES = {
+    # "DeepSeek-R1": 1357, 
+    "DeepSeek-R1": 1430, # !! need to update this
    "gpt-4o-2024-11-20": 1402,
-    "DeepSeek-R1": 1357, 
    "gemini-1.5-pro-002": 1333,
    "gemini-1.5-pro-001": 1276,
    "claude-3-5-sonnet-20240620": 1243,
@@ -302,24 +302,24 @@ def apply_landmark_calibration(x, config):

 def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict):
    """Log a readable summary of score statistics."""
-    s = ""
-    s += f"\n------- {score_type} Summary -------"
-    s += f"ANOVA F-value: {cross_stats['anova_f']:.4f}, p={cross_stats['anova_p']:.4f}"
-    s += f"Kruskal-Wallis: {cross_stats['kw_stat']:.4f}, p={cross_stats['kw_p']:.4f}"
-    s += f"Pearson r={cross_stats['pearson_r']:.4f}"
-    s += f"Kendall τ={cross_stats['kendall_tau']:.4f}"
-    s += f"Std.Dev across models: {cross_stats['std_dev_across_models']:.4f}"
+    s = "\n\n"
+    s += f"\n------- {score_type} Summary -------\n"
+    s += f"ANOVA F-value: {cross_stats['anova_f']:.4f}, p={cross_stats['anova_p']:.4f}\n"
+    s += f"Kruskal-Wallis: {cross_stats['kw_stat']:.4f}, p={cross_stats['kw_p']:.4f}\n"
+    s += f"Pearson r={cross_stats['pearson_r']:.4f}\n"
+    s += f"Kendall τ={cross_stats['kendall_tau']:.4f}\n"
+    s += f"Std.Dev across models: {cross_stats['std_dev_across_models']:.4f}\n"

-    s += "\nModel Scores:"
+    s += "\nModel Scores:\n"
    sorted_models = sorted(
        model_stats.items(),
        key=lambda kv: kv[1]["mean"],
        reverse=True
    )
    for model, stats in sorted_models:
-        line = f"{model:.<40} {stats['mean']:.3f} ±{stats['ci95']:.3f}"
+        line = f"{model:.<40} {stats['mean']:.3f} ±{stats['ci95']:.3f}\n"
        s += line
-    s += "\n------------------------------------"
+    s += "\n------------------------------------\n"
    logger.info(s)
    return s

@@ -296,12 +296,12 @@ def compute_separability_metrics(
    metrics_label["modulated_ci95"] = modulated_ci95    

    # Logging summary
-    s = ""
+    s = "\n\n"
    s += f"\n--- {label.upper()} SEPARABILITY METRICS ---\n"
    s += f"Adjacent 99% CI Overlap fraction: {adj_frac_overlap:.3f}\n"
-    s += f"Sum of adjacent 99% CI Overlap magnitude (scale={scale_factor}): "
+    s += f"Sum of adjacent 99% CI Overlap magnitude (scale={scale_factor}): \n"
    s += f"{sum_overlap_magnitude:.3f}\n"
-    s += f"CI99 Overlap pct: "
+    s += f"CI99 Overlap pct: \n"
    s += f"{ci99_overlap_percentage_adjacent_avg:.3f}\n"
    s += f"Avg. |Cohen's d| for adjacent pairs: {avg_cohens_d:.3f}\n"
    s += f"Average EMD across all pairs: {emd_data['average']:.3f}\n"
@@ -38,8 +38,9 @@ def send_to_judge_model(messages: List[Dict], judge_model: str, max_retries: int

                ## openrouter specific
                "provider": {
-                    "require_parameters": True, 
+                "require_parameters": True, 
                   "order": [
+                       "Cerebras",
                       "OpenAI",
                       "Lambda",
                       "DeepSeek",