deepseek-r1

This commit is contained in:
wassname
2025-07-28 18:34:04 +08:00
parent 1c0cce874d
commit e0a45b36df
7 changed files with 1305 additions and 257 deletions
+30 -9
View File
@@ -9,17 +9,29 @@ This project compares different methods of extracting scores from language model
- **Raw**: Traditional text-based scoring (baseline Judgemark method)
- **Weighted**: Log probability weighting using normalized choice probabilities (similar to [G-Eval](https://arxiv.org/abs/2303.16634))
- **Ranked**: Ranking-based evaluation using Kendall's tau correlation between log probability rankings and score rankings
****
## Results
| name | judgemark_score_calib | stability_calib | separability_calib | human_correlation_calib |
| ------------- | --------------------- | --------------- | ------------------ | ----------------------- |
| ranked_scaled | **0.788** | **1.0** | **0.785** | 0.592 |
| published | 0.761 | 0.894 | 0.691 | **0.908** |
| ranked | 0.74 | 0.895 | 0.665 | 0.882 |
| raw | 0.731 | 0.895 | 0.653 | 0.882 |
| weighted | 0.716 | 0.886 | 0.633 | 0.876 |
| ranked_norm | 0.575 | 0.644 | 0.506 | 0.781 |
| weighted_norm | 0.545 | 0.547 | 0.49 | 0.761 |
| Method | Score | Score (Normalized) |
|---------------|----------|------------|
| ranked_scaled | 0.62 | 0.80 |
| ranked_norm | 0.65 | 0.74 |
| weighted | 0.63 | 0.65 |
| raw (baseline)| 0.63 | 0.65 |
| weighted_norm | 0.62 | 0.64 |
*Results for DeepSeek R1*
| Method | Score | Score (Normalized) |
| -------------- | ----- | ------------------ |
| ranked_scaled | 0.62 | 0.80 |
| ranked_norm | 0.65 | 0.74 |
| weighted | 0.63 | 0.65 |
| raw (baseline) | 0.63 | 0.65 |
| weighted_norm | 0.62 | 0.64 |
*Results for DeepSeek Chat V3 0324*
@@ -92,7 +104,16 @@ uv run python judgemark_v2.py \
--save-raw-judge-output
uv run python judgemark_v2.py \
--judge-model "deepseek/deepseek-chat-v3-0324" \
--judge-model "qwen/qwen3-235b-a22b" \
--samples-file data/judgemark_v2.1_samples.json \
--prompts-file data/judge_prompts.json \
--runs-file outputs/my_judgemark_runs2.json \
--num-runs 1 \
--save-raw-judge-output \
--threads 8
uv run python judgemark_v2.py \
--judge-model "deepseek/deepseek-r1" \
--samples-file data/judgemark_v2.1_samples.json \
--prompts-file data/judge_prompts.json \
--runs-file outputs/my_judgemark_runs2.json \
+19 -10
View File
@@ -224,8 +224,8 @@ def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data
# 6. Separability metrics
s1, _ = compute_separability_metrics(run_data, raw_scores_by_model_all, label="raw")
s2, _ = compute_separability_metrics(run_data, calibrated_scores_by_model_all, label="calibrated")
s += s1
s += s2
s += s1 + "\n"
s += s2 + "\n"
# 8. Compute iteration stability for raw & calibrated
compute_iteration_stability(run_data, label="raw")
@@ -235,7 +235,7 @@ def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data
s += "Score stability (RAW)\n"
s += f"Randomized average Kendall's tau (raw): {random_tau_raw:.3f}\n"
s += "Score stability (CALIBRATED)\n"
s += f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} "
s += f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} \n"
s += f"({run_data['calibrated_cross_model_stats']['kendall_tau']})\n"
# 9. Compute the final Judgemark scores (one using raw stats, one using calibrated)
@@ -359,18 +359,27 @@ def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data
"RAW SCORES",
run_data["raw_cross_model_stats"],
run_data["raw_model_stats"]
)
) + "\n"
s += log_score_summary(
"CALIBRATED SCORES",
run_data["calibrated_cross_model_stats"],
run_data["calibrated_model_stats"]
)
logger.info(f"Final Judgemark (raw) = {final_score_raw:.3f}")
logger.info(f"Final Judgemark (cal) = {final_score_calibrated:.3f}")
) + "\n"
s += f"Final Judgemark (raw) = {final_score_raw:.3f}\n"
s += f"Final Judgemark (cal) = {final_score_calibrated:.3f}\n"
logger.info(s)
return {
"final_judgemark_score_raw": final_score_raw,
"final_judgemark_score_calibrated": final_score_calibrated,
"judgemark_score": final_score_raw,
"judgemark_score_calib": final_score_calibrated,
"stability": raw_norm["kendall_tau_bootstrapped"],
"stability_calib": norm["kendall_tau_bootstrapped"],
"separability": raw_separability,
"separability_calib": calibrated_separability,
"human_correlation": raw_norm["kendall_tau"],
"human_correlation_calib": norm["kendall_tau"],
}
+2 -1
View File
@@ -12,8 +12,9 @@ STABILITY_REPS = 100
# note: the reference scores are from https://old.reddit.com/r/LocalLLaMA/comments/1cd2jco/judgemark_how_well_a_llm_judge_can_evaluate/
# correlation with arena elo https://huggingface.co/spaces/lmarena-ai/chatbot-arena-leaderboard
REFERENCE_MODEL_SCORES = {
# "DeepSeek-R1": 1357,
"DeepSeek-R1": 1430, # !! need to update this
"gpt-4o-2024-11-20": 1402,
"DeepSeek-R1": 1357,
"gemini-1.5-pro-002": 1333,
"gemini-1.5-pro-001": 1276,
"claude-3-5-sonnet-20240620": 1243,
+10 -10
View File
@@ -302,24 +302,24 @@ def apply_landmark_calibration(x, config):
def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict):
"""Log a readable summary of score statistics."""
s = ""
s += f"\n------- {score_type} Summary -------"
s += f"ANOVA F-value: {cross_stats['anova_f']:.4f}, p={cross_stats['anova_p']:.4f}"
s += f"Kruskal-Wallis: {cross_stats['kw_stat']:.4f}, p={cross_stats['kw_p']:.4f}"
s += f"Pearson r={cross_stats['pearson_r']:.4f}"
s += f"Kendall τ={cross_stats['kendall_tau']:.4f}"
s += f"Std.Dev across models: {cross_stats['std_dev_across_models']:.4f}"
s = "\n\n"
s += f"\n------- {score_type} Summary -------\n"
s += f"ANOVA F-value: {cross_stats['anova_f']:.4f}, p={cross_stats['anova_p']:.4f}\n"
s += f"Kruskal-Wallis: {cross_stats['kw_stat']:.4f}, p={cross_stats['kw_p']:.4f}\n"
s += f"Pearson r={cross_stats['pearson_r']:.4f}\n"
s += f"Kendall τ={cross_stats['kendall_tau']:.4f}\n"
s += f"Std.Dev across models: {cross_stats['std_dev_across_models']:.4f}\n"
s += "\nModel Scores:"
s += "\nModel Scores:\n"
sorted_models = sorted(
model_stats.items(),
key=lambda kv: kv[1]["mean"],
reverse=True
)
for model, stats in sorted_models:
line = f"{model:.<40} {stats['mean']:.3f} ±{stats['ci95']:.3f}"
line = f"{model:.<40} {stats['mean']:.3f} ±{stats['ci95']:.3f}\n"
s += line
s += "\n------------------------------------"
s += "\n------------------------------------\n"
logger.info(s)
return s
+3 -3
View File
@@ -296,12 +296,12 @@ def compute_separability_metrics(
metrics_label["modulated_ci95"] = modulated_ci95
# Logging summary
s = ""
s = "\n\n"
s += f"\n--- {label.upper()} SEPARABILITY METRICS ---\n"
s += f"Adjacent 99% CI Overlap fraction: {adj_frac_overlap:.3f}\n"
s += f"Sum of adjacent 99% CI Overlap magnitude (scale={scale_factor}): "
s += f"Sum of adjacent 99% CI Overlap magnitude (scale={scale_factor}): \n"
s += f"{sum_overlap_magnitude:.3f}\n"
s += f"CI99 Overlap pct: "
s += f"CI99 Overlap pct: \n"
s += f"{ci99_overlap_percentage_adjacent_avg:.3f}\n"
s += f"Avg. |Cohen's d| for adjacent pairs: {avg_cohens_d:.3f}\n"
s += f"Average EMD across all pairs: {emd_data['average']:.3f}\n"
+2 -1
View File
@@ -38,8 +38,9 @@ def send_to_judge_model(messages: List[Dict], judge_model: str, max_retries: int
## openrouter specific
"provider": {
"require_parameters": True,
"require_parameters": True,
"order": [
"Cerebras",
"OpenAI",
"Lambda",
"DeepSeek",
+1239 -223
View File
File diff suppressed because one or more lines are too long