mirror of
https://github.com/wassname/Judgemark-v2lp.git
synced 2026-06-27 16:10:14 +08:00
tau with alternative less
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
# Judgemark V2 - Log Probability Evaluation
|
||||
|
||||
A fork of Judgemark that evaluates whether log probability-based scoring methods outperform traditional text-based judgment approaches fo## Interpreting th## Contributing
|
||||
A fork of Judgemark that evaluates whether log probability-based scoring methods outperform traditional text-based judgment approaches for interpreting the results of language models.
|
||||
|
||||
## Contributing
|
||||
|
||||
## Overview
|
||||
|
||||
@@ -9,32 +11,33 @@ This project compares different methods of extracting scores from language model
|
||||
- **Raw**: Traditional text-based scoring (baseline Judgemark method)
|
||||
- **Weighted**: Log probability weighting using normalized choice probabilities (similar to [G-Eval](https://arxiv.org/abs/2303.16634))
|
||||
- **Ranked**: Ranking-based evaluation using Kendall's tau correlation between log probability rankings and score rankings
|
||||
****
|
||||
|
||||
## Results
|
||||
|
||||
|
||||
|
||||
| Method | Score | Score (Normalized) |
|
||||
|----------------|----------------|--------------------|
|
||||
| ranked | 0.77 | 0.817 |
|
||||
| ranked_scaled | 0.77 | 0.817 |
|
||||
| raw (baseline) | 0.678 | 0.731 |
|
||||
| weighted | 0.668 | 0.716 |
|
||||
| ranked_norm | 0.647 | 0.658 |
|
||||
| weighted_norm | 0.562 | 0.551 |
|
||||
|
||||
|
||||
| name | judgemark_score_calib | stability_calib | separability_calib | human_correlation_calib |
|
||||
| ------------- | --------------------- | --------------- | ------------------ | ----------------------- |
|
||||
| ranked_scaled | **0.788** | **1.0** | **0.785** | 0.592 |
|
||||
| published | 0.761 | 0.894 | 0.691 | **0.908** |
|
||||
| ranked | 0.74 | 0.895 | 0.665 | 0.882 |
|
||||
| raw | 0.731 | 0.895 | 0.653 | 0.882 |
|
||||
| weighted | 0.716 | 0.886 | 0.633 | 0.876 |
|
||||
| ranked_norm | 0.575 | 0.644 | 0.506 | 0.781 |
|
||||
| weighted_norm | 0.545 | 0.547 | 0.49 | 0.761 |
|
||||
|---------------| ----------------------|-----------------|--------------------|-------------------------|
|
||||
| ranked_scaled | **0.817** | **1.0** | **0.804** | 0.69 |
|
||||
| ranked | 0.817 | 1.0 | 0.804 | 0.69 |
|
||||
| raw (baseline)| 0.731 | 0.895 | 0.652 | **0.882** |
|
||||
| weighted | 0.716 | 0.879 | 0.635 | 0.876 |
|
||||
| ranked_norm | 0.658 | 0.8 | 0.576 | 0.843 |
|
||||
| weighted_norm | 0.551 | 0.538 | 0.498 | 0.775 |
|
||||
|
||||
*results for DeepSeek R1**
|
||||
*Results for DeepSeek r1*
|
||||
|
||||
|
||||
| Method | Score | Score (Normalized) |
|
||||
| -------------- | ----- | ------------------ |
|
||||
| ranked_scaled | 0.62 | 0.80 |
|
||||
| ranked_norm | 0.65 | 0.74 |
|
||||
| weighted | 0.63 | 0.65 |
|
||||
| raw (baseline) | 0.63 | 0.65 |
|
||||
| weighted_norm | 0.62 | 0.64 |
|
||||
|
||||
*Results for DeepSeek Chat V3 0324*
|
||||
|
||||
The ranking approach performs best, particularly when scaled. This approach treats LLM log probabilities as rankings rather than true probabilities, which aligns better with how sampling methods like greedy and top-k actually work.
|
||||
|
||||
- ranked_scaled: this method is Kendall's tau, scaled to [0, 10] after normalising by the mean log probs over all samples
|
||||
@@ -65,7 +68,7 @@ Instead of treating log probabilities as probabilities, the ranking method:
|
||||
|
||||
2. Ranks choices by their log probabilities
|
||||
3. Uses Kendall's tau to measure correlation between log probability rankings and expected score rankings
|
||||
4. Achieves high efficiency by extracting complete score distributions from a single token
|
||||
4. Achieves high stability by extracting complete score distributions from a single token
|
||||
|
||||
|
||||
## Setup & Installation
|
||||
@@ -96,29 +99,17 @@ Run the benchmark with the main script:
|
||||
# test
|
||||
uv run python judgemark_v2.py \
|
||||
--judge-model "meta-llama/llama-3.2-3b-instruct" \
|
||||
--samples-file data/judgemark_v2.1_samples.json \
|
||||
--prompts-file data/judge_prompts.json \
|
||||
--runs-file outputs/my_judgemark_runs.json \
|
||||
--threads 1 \
|
||||
--num-runs 1 \
|
||||
--save-raw-judge-output
|
||||
|
||||
# gather data
|
||||
uv run python judgemark_v2.py \
|
||||
--judge-model "qwen/qwen3-235b-a22b" \
|
||||
--samples-file data/judgemark_v2.1_samples.json \
|
||||
--prompts-file data/judge_prompts.json \
|
||||
--runs-file outputs/my_judgemark_runs2.json \
|
||||
--num-runs 1 \
|
||||
--save-raw-judge-output \
|
||||
--threads 8
|
||||
|
||||
uv run python judgemark_v2.py \
|
||||
--judge-model "deepseek/deepseek-r1" \
|
||||
--samples-file data/judgemark_v2.1_samples.json \
|
||||
--prompts-file data/judge_prompts.json \
|
||||
--runs-file outputs/my_judgemark_runs2.json \
|
||||
--num-runs 1 \
|
||||
--save-raw-judge-output
|
||||
--judge-model "deepseek/deepseek-r1"
|
||||
|
||||
# crunch results
|
||||
nbs/02_recomp.ipynb
|
||||
```
|
||||
|
||||
See original repo https://github.com/EQ-bench/Judgemark-v2 for citation, license, and more details.
|
||||
|
||||
+39
-21
@@ -3,7 +3,8 @@ import signal
|
||||
import argparse
|
||||
from loguru import logger
|
||||
import time
|
||||
|
||||
from judgemark_v2lp.benchmark import sanitize_model_name
|
||||
from pathlib import Path
|
||||
|
||||
from judgemark_v2lp.utils.logging_setup import setup_logging, get_verbosity
|
||||
from judgemark_v2lp.utils.file_io import load_json_file
|
||||
@@ -44,12 +45,13 @@ def parse_args():
|
||||
)
|
||||
parser.add_argument(
|
||||
'--runs-file',
|
||||
default="judgemark_v2_runs.json",
|
||||
help='Path to store the Judgemark run results'
|
||||
default=None,
|
||||
help='Path to store the Judgemark run results',
|
||||
type=Path,
|
||||
)
|
||||
parser.add_argument(
|
||||
'--run-id',
|
||||
help='Resume (or create) a run using this base ID, to be combined with the judge model name'
|
||||
help='Resume (or create) a run using this base ID, to be combined with the judge model name',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--threads',
|
||||
@@ -60,7 +62,8 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
'--verbosity',
|
||||
choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
|
||||
help='Set the logging verbosity level'
|
||||
help='Set the logging verbosity level',
|
||||
default='INFO' # Default to INFO if not specified
|
||||
)
|
||||
parser.add_argument(
|
||||
'--num-runs',
|
||||
@@ -70,23 +73,35 @@ def parse_args():
|
||||
)
|
||||
parser.add_argument(
|
||||
'--save-raw-judge-output',
|
||||
action='store_true',
|
||||
default=False,
|
||||
help='If set, store the raw judge model output in the results JSON (default: false)'
|
||||
dest='save_raw_judge_output',
|
||||
action='store_false',
|
||||
default=True,
|
||||
help='If provided, do NOT store the raw judge model output in the results JSON (default: store raw judge output)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--score-weighted',
|
||||
action='store_true',
|
||||
default=False,
|
||||
help='If set, use weighted scoring for the judge model (default: false)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--score-ranked',
|
||||
action='store_true',
|
||||
default=False,
|
||||
help='If set, use ranked logprob scoring for the judge model (default: false)'
|
||||
)
|
||||
return parser.parse_args()
|
||||
# parser.add_argument(
|
||||
# '--score-weighted',
|
||||
# action='store_true',
|
||||
# default=False,
|
||||
# help='If set, use weighted scoring for the judge model (default: false)'
|
||||
# )
|
||||
# parser.add_argument(
|
||||
# '--score-ranked',
|
||||
# action='store_true',
|
||||
# default=False,
|
||||
# help='If set, use ranked logprob scoring for the judge model (default: false)'
|
||||
# )
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.runs_file is None:
|
||||
sanitized_jm = sanitize_model_name(args.judge_model)
|
||||
ts = time.strftime("%Y%m%d_%H%M%S")
|
||||
args.runs_file = Path(f"outputs/{args.run_id or ''}_{sanitized_jm}_{ts}.json")
|
||||
|
||||
return args
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Register signal handlers
|
||||
@@ -103,6 +118,9 @@ if __name__ == "__main__":
|
||||
verbosity = get_verbosity(args.verbosity)
|
||||
setup_logging(verbosity)
|
||||
logger.debug("Logging initialized")
|
||||
run_name = args.runs_file.stem
|
||||
logger.add(f"outputs/{run_name}.log", level=verbosity, rotation="10 MB", enqueue=True)
|
||||
logger.info(f"Logging to outputs/{run_name}.log")
|
||||
|
||||
# Check that we have an API key for the judge model
|
||||
if not API_KEY:
|
||||
|
||||
@@ -140,6 +140,9 @@ def compute_detailed_distribution(scores):
|
||||
def compute_model_level_stats(scores_by_model, lengths_by_model):
|
||||
model_stats = {}
|
||||
for model_name, scores in scores_by_model.items():
|
||||
if model_name not in lengths_by_model:
|
||||
logger.warning(f"Model {model_name} has no lengths data, skipping.")
|
||||
continue
|
||||
lengths = lengths_by_model[model_name]
|
||||
stats = {
|
||||
"count": len(scores),
|
||||
@@ -207,7 +210,10 @@ def compute_cross_model_stats(scores_by_model_all, scores_by_model_by_iter):
|
||||
|
||||
iteration_pearsons.append(p_r)
|
||||
iteration_kendalls.append(k_tau)
|
||||
|
||||
|
||||
# HACK
|
||||
print('k_tau', means, refs, k_tau)
|
||||
|
||||
# Final correlation = average across iteration-level correlations
|
||||
if iteration_pearsons:
|
||||
pearson_r = statistics.mean(iteration_pearsons)
|
||||
@@ -329,11 +335,39 @@ def compute_weighted_score(logp):
|
||||
choices = np.arange(11) # Choices are 0-10
|
||||
for metric, logp_arr in logp.items():
|
||||
probs = np.exp(logp_arr)
|
||||
# power = 3
|
||||
# powered_probs = probs ** power
|
||||
weights = probs / (probs.sum() + 1e-12)
|
||||
outs[metric] = (weights * choices).sum().item()
|
||||
|
||||
return outs
|
||||
|
||||
def compute_logpweighted_score(logp):
|
||||
outs = {}
|
||||
choices = np.arange(11) # Choices are 0-10
|
||||
for metric, logp_arr in logp.items():
|
||||
# probs = np.exp(logp_arr)
|
||||
# weights = probs / (probs.sum() + 1e-12)
|
||||
outs[metric] = (logp_arr * choices).sum().item()
|
||||
|
||||
return outs
|
||||
|
||||
# def compute_weighted_score(logp, scale=2.0): # Tune scale (>1 for sharpening, e.g., 1.5-5)
|
||||
# outs = {}
|
||||
# choices = np.arange(11) # Choices are 0-10
|
||||
# for metric, logp_arr in logp.items():
|
||||
# # Scale logprobs directly (sharpens for scale >1)
|
||||
# scaled_logp = logp_arr * scale
|
||||
|
||||
# # Apply softmax in log space to get weights (no intermediate prob normalization)
|
||||
# max_logp = np.max(scaled_logp) # For numerical stability
|
||||
# exp_terms = np.exp(scaled_logp - max_logp)
|
||||
# weights = exp_terms / (exp_terms.sum() + 1e-12)
|
||||
|
||||
# # Weighted average
|
||||
# outs[metric] = (weights * choices).sum().item()
|
||||
# return outs
|
||||
|
||||
def compute_ranked_score(logp):
|
||||
outs = {}
|
||||
choices = np.arange(11) # Choices are 0-10
|
||||
@@ -341,13 +375,17 @@ def compute_ranked_score(logp):
|
||||
# res = kendalltau(choices, logp_arr, variant='b')
|
||||
|
||||
# lets just use the common numbers 1,3,5,7,9, as some models like to skip some
|
||||
res = kendalltau(choices, logp_arr, variant='b')
|
||||
res = kendalltau(choices, logp_arr, variant='b', alternative='less', method='asymptotic')
|
||||
# print(res.correlation, res.pvalue, res2.correlation, res2.pvalue)
|
||||
# correlation weighted by pvalue
|
||||
|
||||
|
||||
decision = (res.correlation+1)*5 # scale to 0-10
|
||||
# decision = (2*decision*res.pvalue).clip(0, 10)
|
||||
# decision = (res.correlation+1)*5 # scale to 0-10
|
||||
decision = ((res.correlation + 1)**5 * 5) # power-law emphasise # better
|
||||
k = 5
|
||||
# decision = 5 * (np.exp(res.correlation) - np.exp(-1)) / (np.exp(1) - np.exp(-1)) # meh
|
||||
# decision = 10 / (1 + np.exp(-k * res.correlation)) # better
|
||||
decision = (2*decision*res.pvalue).clip(0, 10)
|
||||
outs[metric] = decision.item()
|
||||
|
||||
return outs
|
||||
|
||||
@@ -14,8 +14,8 @@ HEADERS = {
|
||||
"Authorization": f"Bearer {API_KEY}",
|
||||
"Content-Type": "application/json"
|
||||
}
|
||||
MAX_RETRIES = int(os.getenv("MAX_RETRIES", "3"))
|
||||
RETRY_DELAY = int(os.getenv("RETRY_DELAY", "5"))
|
||||
MAX_RETRIES = int(os.getenv("MAX_RETRIES", "5"))
|
||||
RETRY_DELAY = int(os.getenv("RETRY_DELAY", "9"))
|
||||
|
||||
def send_to_judge_model(messages: List[Dict], judge_model: str, max_retries: int = MAX_RETRIES) -> str:
|
||||
"""
|
||||
|
||||
@@ -6,7 +6,7 @@ from loguru import logger
|
||||
def setup_logging(verbosity: str):
|
||||
"""Set up logging based on verbosity level."""
|
||||
logger.remove()
|
||||
logger.add(sys.stderr, level=verbosity)
|
||||
logger.add(sys.stderr, level=verbosity, enqueue=True)
|
||||
|
||||
def get_verbosity(args_verbosity: str) -> str:
|
||||
"""Determine the verbosity level from command-line or environment."""
|
||||
|
||||
+117
-2241
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,19 @@
|
||||
shape: (6, 9)
|
||||
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
|
||||
│ name ┆ judgemark ┆ judgemark ┆ stability ┆ … ┆ separabil ┆ separabil ┆ human_cor ┆ human_co │
|
||||
│ --- ┆ _score ┆ _score_ca ┆ --- ┆ ┆ ity ┆ ity_calib ┆ relation ┆ rrelatio │
|
||||
│ str ┆ --- ┆ lib ┆ f64 ┆ ┆ --- ┆ --- ┆ --- ┆ n_calib │
|
||||
│ ┆ f64 ┆ --- ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ --- │
|
||||
│ ┆ ┆ f64 ┆ ┆ ┆ ┆ ┆ ┆ f64 │
|
||||
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
|
||||
│ weighted ┆ 0.668 ┆ 0.716 ┆ 0.895 ┆ … ┆ 0.558 ┆ 0.635 ┆ 0.881 ┆ 0.876 │
|
||||
│ raw ┆ 0.678 ┆ 0.731 ┆ 0.895 ┆ … ┆ 0.572 ┆ 0.652 ┆ 0.882 ┆ 0.882 │
|
||||
│ ranked ┆ 0.608 ┆ 0.81 ┆ 0.898 ┆ … ┆ 0.531 ┆ 0.81 ┆ 0.624 ┆ 0.624 │
|
||||
│ ranked_no ┆ 0.575 ┆ 0.588 ┆ 0.897 ┆ … ┆ 0.435 ┆ 0.513 ┆ 0.814 ┆ 0.801 │
|
||||
│ rm ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
|
||||
│ ranked_sc ┆ 0.608 ┆ 0.81 ┆ 0.897 ┆ … ┆ 0.531 ┆ 0.81 ┆ 0.624 ┆ 0.624 │
|
||||
│ aled ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
|
||||
│ weighted_ ┆ 0.562 ┆ 0.551 ┆ 0.897 ┆ … ┆ 0.424 ┆ 0.498 ┆ 0.777 ┆ 0.775 │
|
||||
│ norm ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │
|
||||
└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘
|
||||
|
||||
+122
@@ -50,3 +50,125 @@ Final Judgemark (cal) = 0.645
|
||||
| raw | 0.63 | 0.65 |
|
||||
| weighted_norm | 0.62 | 0.64 |
|
||||
| ranked | 0.33 | 0.28 |
|
||||
|
||||
|
||||
UPTO:
|
||||
- Why is ranked low on human calib? It's using Kendall's tau against reference scores, so I thought it would be good. It's only using relative order.
|
||||
- Well, I guess ranking is smooth and stable but less pronounced... if we made it more pronounced (`**10`?, or squared, or softmax), would it be better on all fronts?
|
||||
|
||||
## Analysis of Ranked Method Performance Issue
|
||||
|
||||
### **Key Implementation Detail - Correcting Misconceptions**
|
||||
|
||||
**Critical Clarification**: All methods obtain the full 11-choice probability distribution in a **single forward pass** - there are no extra queries or argmax operations. The difference is only in how they collapse the 11-dimensional logprob vector:
|
||||
|
||||
1. **Single prompt** is sent to the judge model
|
||||
2. Model sees 11 choice tokens ["0", "1", ..., "10"] (often in JSON format)
|
||||
3. **All 11 logprobs** are requested in the same forward pass: `logp_arr = [L₀, L₁, ... L₁₀]`
|
||||
4. Different methods collapse this vector differently:
|
||||
|
||||
- **ranked** path: `corr = kendalltau(choices=[0…10], values=logp_arr).correlation`, then `final = (corr+1)*5`. Uses **ordering** of the 11 log-probs.
|
||||
- **weighted** path: `softmax(logp_arr)`-weighted mean: `score = Σ( i * exp(Lᵢ)/Σexp(L) )`
|
||||
- **raw** path: pick the token whose text content is parsed as score (greedy decoding)
|
||||
|
||||
### **Why Ranked Method Fails on Human Correlation**
|
||||
|
||||
Looking at the full results:
|
||||
|
||||
**DeepSeek-R1 (Strong Judge):**
|
||||
- `ranked`: stability_calib = **1.0**, separability_calib = **0.81**, but human_correlation_calib = **0.624**
|
||||
- `raw`: stability_calib = 0.896, separability_calib = 0.652, human_correlation_calib = **0.882**
|
||||
|
||||
**Llama-3.2-3B (Weak Judge):**
|
||||
- `ranked`: stability_calib = **1.0**, separability_calib = **0.809**, but human_correlation_calib = **0.0**
|
||||
- `raw`: stability_calib = 0.0, separability_calib = 0.081, human_correlation_calib = **0.212**
|
||||
|
||||
**Root Cause Analysis:**
|
||||
|
||||
1. **Scale Compression**: The `(correlation+1)*5` transformation maps Kendall's τ from [-1,1] to [0,10], but creates a compressed range around 5.0 for most cases.
|
||||
|
||||
2. **Loss of Discriminative Power**: Most log probability rankings have moderate positive correlations with ideal [0,1,2,...,10], so scores cluster around 6-8, losing ability to distinguish between good/bad models.
|
||||
|
||||
3. **Efficiency vs. Accuracy Trade-off**: While ranked method extracts full 11-choice distribution at zero extra cost, its ordering-based collapse may be too conservative for human preference alignment.
|
||||
|
||||
4. **Paradox**: Ranked method is internally consistent (high stability, separability) but externally invalid (poor human correlation).
|
||||
|
||||
### **Proposed Solutions**
|
||||
|
||||
The intuition about making it "more pronounced" is correct:
|
||||
|
||||
1. **Quadratic Scaling**: `decision = ((res.correlation+1)*5) ** 2 / 10`
|
||||
2. **Power Transform**: `decision = (res.correlation ** 3 + 1) * 5`
|
||||
3. **Direct Tau Scaling**: `decision = res.correlation * 10 + 5` (allows negative scores)
|
||||
4. **Sigmoid Enhancement**: Apply sigmoid to stretch middle range
|
||||
|
||||
The fundamental issue: linear scaling `(τ+1)*5` doesn't preserve enough correlation signal to maintain human alignment while creating stable rankings.
|
||||
|
||||
|
||||
| name | judgemark_score | judgemark_score_calib | stability | stability_calib | separability | separability_calib | human_correlation | human_correlation_calib |
|
||||
|---------------|-----------------|-----------------------|-----------|-----------------|--------------|--------------------|-------------------|-------------------------|
|
||||
| weighted | 0.668 | 0.716 | 0.898 | 0.879 | 0.558 | 0.635 | 0.881 | 0.876 |
|
||||
| raw | 0.678 | 0.731 | 0.896 | 0.896 | 0.572 | 0.652 | 0.882 | 0.882 |
|
||||
| ranked | 0.607 | 0.81 | 0.896 | 1.0 | 0.531 | 0.81 | 0.624 | 0.624 |
|
||||
| ranked_norm | 0.575 | 0.588 | 0.897 | 0.677 | 0.435 | 0.513 | 0.814 | 0.801 |
|
||||
| ranked_scaled | 0.607 | 0.81 | 0.896 | 1.0 | 0.531 | 0.81 | 0.624 | 0.624 |
|
||||
| weighted_norm | 0.562 | 0.551 | 0.897 | 0.54 | 0.424 | 0.498 | 0.777 | 0.775 |
|
||||
|
||||
Run ID: 8b031c01-ac20-4f6d-8838-2d8481c61a55__deepseek_deepseek-r1
|
||||
|
||||
|
||||
| name | judgemark_score | judgemark_score_calib | stability | stability_calib | separability | separability_calib | **human_correlation** | human_correlation_calib |
|
||||
|---------------|-----------------|-----------------------|-----------|-----------------|--------------|--------------------|-------------------|-------------------------|
|
||||
| weighted | 0.089 | 0.102 | 0.0 | 0.0 | 0.077 | 0.087 | 0.225 | 0.261 |
|
||||
| raw | 0.088 | 0.089 | 0.0 | 0.0 | 0.074 | 0.081 | 0.229 | 0.212 |
|
||||
| ranked | 0.262 | 0.706 | 0.0 | 1.0 | 0.393 | 0.809 | 0.0 | 0.0 |
|
||||
| ranked_norm | 0.065 | 0.068 | 0.0 | 0.0 | 0.053 | 0.059 | 0.176 | 0.17 |
|
||||
| ranked_scaled | 0.262 | 0.706 | 0.0 | 1.0 | 0.393 | 0.809 | 0.0 | 0.0 |
|
||||
| weighted_norm | 0.028 | 0.046 | 0.0 | 0.0 | 0.042 | 0.061 | 0.0 | 0.033 |
|
||||
|
||||
Run ID: ab20b598-845b-4da7-9f4c-56ec05405e28__meta-llama_llama-3_2-3b-instruct
|
||||
|
||||
# 2025-08-20 05:52:06
|
||||
|
||||
So I'm still confused why rank is worse. And why norm and scaled perform better in some cases and worse in others.
|
||||
|
||||
|
||||
Look at the raw scores going into tau:
|
||||
Processing run 8b031c01-ac20-4f6d-8838-2d8481c61a55__deepseek_deepseek-r1...
|
||||
|
||||
RAW
|
||||
weighted
|
||||
k_tau (7.056521739130434, 5.692083333333334, 6.568333333333333, 6.972083333333333, 6.02625, 5.292916666666667, 4.985, 4.421666666666667, 3.9104166666666664, 5.91625, 5.632083333333333, 7.324166666666667, 5.82875, 7.610833333333333, 8.12625, 4.810416666666667, 5.170833333333333) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.8823529411764705
|
||||
calibrated
|
||||
k_tau (6.913741010558691, 4.745233729099284, 6.136213642220673, 6.867702086872284, 5.3350524958555905, 3.9637780431712675, 3.4473454755860238, 2.6185410443378423, 1.9714928350431529, 5.109234456943669, 4.540345540824161, 7.3509579707211765, 4.885496283271738, 7.700297215002195, 8.38722761924037, 3.1579257830331477, 3.8035150809926783) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.8823529411764705
|
||||
|
||||
raw
|
||||
k_tau (7.2075, 5.891666666666667, 6.645833333333333, 7.17, 5.872083333333333, 5.295416666666667, 4.9225, 4.42375, 3.9130434782608696, 5.767916666666666, 5.505833333333333, 7.350416666666667, 5.902083333333334, 7.7004166666666665, 8.099166666666667, 4.839166666666666, 5.04625) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.9117647058823529
|
||||
calibrated
|
||||
k_tau (7.116312056737589, 4.941891289390237, 6.232299054373523, 7.107176996608079, 4.875130409086237, 3.880235484714111, 3.2264773280337447, 2.6151016881173845, 1.9454358641574738, 4.691218682067615, 4.2111636112913695, 7.342314215232809, 4.98624091614085, 7.830943570767808, 8.34094973789701, 3.125561954719158, 3.4652997467373283) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.9117647058823529
|
||||
|
||||
|
||||
ranked
|
||||
k_tau (5.61, 3.9, 2.72, 7.3, 5.02, 2.92, 1.66, 3.27, 1.68, 4.63, 2.82, 8.26, 4.92, 7.24, 6.48, 4.32, 3.85) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.588235294117647
|
||||
calibrated
|
||||
k_tau (7.0, 4.3999999999999995, 2.5238095238095246, 8.913207547169812, 6.085271317829456, 3.0, 0.0, 3.5, 0.04761904761904766, 5.480620155038759, 2.761904761904762, 10.0, 5.930232558139535, 8.845283018867924, 7.9849056603773585, 5.0, 4.328571428571428) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.588235294117647
|
||||
|
||||
|
||||
ranked norm
|
||||
k_tau (7.248333333333333, 5.461666666666667, 6.952083333333333, 7.414166666666667, 5.75625, 4.51375, 3.9595833333333332, 3.9333333333333336, 4.483478260869565, 5.414166666666667, 5.159166666666667, 6.91875, 5.63875, 6.787083333333333, 6.379583333333334, 4.099166666666666, 4.35) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.7058823529411764
|
||||
calibrated
|
||||
k_tau (7.477241596803178, 4.977537889200114, 7.178479071511619, 7.701605757701165, 5.445191021992888, 3.600066247674374, 2.6228312425887785, 2.661732775182972, 3.348692105366149, 4.871257993125308, 4.464827657652036, 7.146088400456735, 5.244763232721369, 6.975696491248189, 6.218798140275223, 2.869811090813712, 3.158093516403424) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.6911764705882353
|
||||
|
||||
|
||||
ranked scaled
|
||||
k_tau (5.61, 3.9, 2.72, 7.3, 5.02, 2.92, 1.66, 3.27, 1.68, 4.63, 2.82, 8.26, 4.92, 7.24, 6.48, 4.32, 3.85) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.588235294117647
|
||||
calibrated
|
||||
k_tau (7.0, 4.3999999999999995, 2.5238095238095246, 8.913207547169812, 6.085271317829456, 3.0, 0.0, 3.5, 0.04761904761904766, 5.480620155038759, 2.761904761904762, 10.0, 5.930232558139535, 8.845283018867924, 7.9849056603773585, 5.0, 4.328571428571428) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.588235294117647
|
||||
|
||||
|
||||
weighted norm
|
||||
k_tau (7.73375, 5.875, 7.0183333333333335, 7.793333333333333, 5.9591666666666665, 5.34625, 5.1425, 5.205, 5.20695652173913, 5.9425, 5.815416666666667, 7.7941666666666665, 6.183333333333334, 7.769166666666667, 8.17125, 5.407916666666667, 5.335833333333333) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.8382352941176471
|
||||
calibrated
|
||||
k_tau (7.230462091065811, 4.168466637432468, 6.189414960216487, 7.470368270765096, 4.300410881532898, 3.50188667965537, 3.1950628663490654, 3.3781083774570995, 3.288182342794895, 4.220391879303791, 4.123169536875159, 7.485909549004787, 4.815368181010978, 7.399227345259091, 7.961157687087653, 3.616137782509102, 3.43724307883553) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.8529411764705882
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user