Collect logprobs and compute logprob-weighted (w) and rank-based (r) scores

This commit is contained in:
wassname
2025-07-23 18:18:48 +08:00
parent 2bfa3574b7
commit 50290c9d48
2 changed files with 91 additions and 12 deletions
+15 -4
View File
@@ -18,7 +18,7 @@ from judgemark_v2lp.scoring import (
parse_scores, compute_raw_score, compute_detailed_distribution,
compute_model_level_stats, compute_cross_model_stats,
build_landmark_calibration_config, apply_landmark_calibration,
log_score_summary, confidence_interval_95
log_score_summary, confidence_interval_95, compute_ranked_score, compute_weighted_score
)
from judgemark_v2lp.scoring import compute_detailed_distribution, compute_detailed_distribution # etc
from judgemark_v2lp.separability import compute_separability_metrics
@@ -58,12 +58,19 @@ def process_sample(model_name: str, iteration_key: str, item_id: str, item_text:
judge_response = res_json['choices'][0]['message']['content']
logprobs = res_json['choices'][0]['logprobs']['content']
extracted_scores = parse_scores(judge_response, logprobs)
extracted_scores, logp = parse_scores(judge_response, logprobs)
extracted_wscores = compute_weighted_score(logp)
extracted_rscores = compute_ranked_score(logp)
raw_score = compute_raw_score(extracted_scores)
raw_score_w = compute_raw_score(extracted_wscores)
raw_score_r = compute_raw_score(extracted_rscores)
with lock:
storage_dict = {
"parsed_scores": extracted_scores,
"parsed_weighted_scores": extracted_wscores,
"parsed_ranked_scores": extracted_rscores,
"timestamp": datetime.now().isoformat(),
"text_length": text_len
# res_json['usage']['cost']
@@ -71,9 +78,13 @@ def process_sample(model_name: str, iteration_key: str, item_id: str, item_text:
}
if raw_score is not None:
storage_dict["aggregated_score_raw"] = raw_score
storage_dict["aggregated_score_weighted"] = raw_score_w
storage_dict["aggregated_score_ranked"] = raw_score_r
if save_raw_judge_output:
storage_dict["judge_response"] = judge_response
storage_dict["logprobs"] = logprobs
iteration_dict[item_id] = storage_dict
runs[run_key]["results"][model_name][iteration_key] = iteration_dict
save_json_file(runs, runs_file)
+76 -8
View File
@@ -2,12 +2,41 @@ import math
import statistics
import numpy as np
import scipy.stats
from scipy.stats import kendalltau
from loguru import logger
from typing import Dict, List
import re
from judgemark_v2lp.config.constants import REFERENCE_MODEL_SCORES
from judgemark_v2lp.utils.stats import normalize
def detokenize(tokens: List[str]) -> str:
    """
    Rebuild approximate plain text from a list of tokenizer pieces.

    This is a very rough undo for common Hugging-Face / SentencePiece / BPE
    markers:
    - GPT-2 byte-level BPE: a leading 'Ġ' means "preceded by a space"
    - SentencePiece: a leading '▁' (U+2581) means "preceded by a space"
    - GPT-2 newline marker: every 'Ċ' in a token becomes a real newline
    - BERT WordPiece: a leading '##' glues the piece onto the previous one
    Any other token is appended verbatim, so tokens that already carry their
    own whitespace (e.g. " Score") pass through unchanged.

    Args:
        tokens: token strings in generation order.

    Returns:
        The concatenated text with leading whitespace stripped.
    """
    # Written as an escape on purpose: the literal U+2581 character is easy to
    # lose in copy/paste or re-encoding, and an empty prefix would make
    # ``startswith`` match every token.
    space_markers = ("Ġ", "\u2581")

    out = ""
    for t in tokens:
        if "Ċ" in t:
            t = t.replace("Ċ", "\n")
        if t.startswith("##"):
            # BERT-style subword: glue to the previous piece.
            out += t[2:]
        elif t.startswith(space_markers):
            # Marker stands for one leading space; avoid doubling it up and
            # avoid emitting it at the very start of the text.
            if out and not out.endswith(" "):
                out += " "
            out += t[1:]
        else:
            # Normal token: append it untouched.
            out += t
    return out.lstrip()
def parse_scores(judge_model_response: str, logprobs: list) -> Dict[str,float]:
"""
Extracts zero or more named numeric scores from a text using a simple Regex pattern:
@@ -20,15 +49,32 @@ def parse_scores(judge_model_response: str, logprobs: list) -> Dict[str,float]:
"Realism Score: 7.5"
"Melodramatic: 2"
"""
scores = {}
scores: Dict[str, float] = {}
logps ={}
pattern = r"(.*?):\s*(?:Score\s+)?(-?\d+(?:\.\d+)?)"
choices = [str(i) for i in range(11)]
window_size = 20
# Look for lines or statements like "Something: 3.5" or "Something Score 3.5"
pattern = r'(.*?):\s*(?:Score\s+)?(-?\d+(?:\.\d+)?)'
matches = re.findall(pattern, judge_model_response)
for match in matches:
metric_name = match[0].strip()
numeric_val = float(match[1])
scores[metric_name] = numeric_val
return scores
for ti, row in enumerate(logprobs):
if row['token'] in choices:
# get previous window (incl) to check for regexp
prev_window = [t['token'] for t in logprobs[max(0, ti-window_size):ti+1]]
prev_text = detokenize(prev_window)
matches = re.findall(pattern, prev_text)
if matches:
match = matches[-1] # take the last match in the window
metric_name = match[0].strip()
numeric_val = float(match[1])
scores[metric_name] = numeric_val
logp_dict = {t['token']:t['logprob'] for t in row['top_logprobs']}
logp_arr = [logp_dict.get(c, -100) for c in choices]
# extra logprob of choices
logps[metric_name] = logp_arr
return scores, logps
def compute_raw_score(scores: Dict[str,float]) -> float:
"""
@@ -273,3 +319,25 @@ def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict):
line = f"{model:.<40} {stats['mean']:.3f} ±{stats['ci95']:.3f}"
logger.info(line)
logger.info("------------------------------------")
def compute_weighted_score(logp):
    """
    Turn per-metric log-probabilities over the choices 0..10 into an
    expected (probability-weighted) score.

    Args:
        logp: mapping of metric name -> sequence of 11 log-probabilities,
            one per integer choice 0..10 in order.

    Returns:
        Mapping of metric name -> Python float, the sum of each choice value
        multiplied by its normalised probability.
    """
    score_grid = np.arange(11)  # candidate scores 0..10
    weighted = {}
    for name in logp:
        mass = np.exp(logp[name])
        # The small epsilon guards the division when the total mass is ~0.
        share = mass / (mass.sum() + 1e-12)
        weighted[name] = np.sum(share * score_grid).item()
    return weighted
def compute_ranked_score(logp):
    """
    Derive a rank-based score per metric from the log-probabilities of the
    choices 0..10, using Kendall's tau-b between the choice values and their
    log-probabilities.

    Args:
        logp: mapping of metric name -> sequence of 11 log-probabilities,
            one per integer choice 0..10 in order.

    Returns:
        Mapping of metric name -> Python float computed as
        ``(tau - 0.5) * pvalue + 0.5``. The value is ``nan`` when tau is
        undefined (e.g. all log-probabilities are identical).
    """
    outs = {}
    choices = np.arange(11)  # Choices are 0-10
    for metric, logp_arr in logp.items():
        res = kendalltau(choices, logp_arr, variant='b')
        # NOTE(review): this multiplies by the p-value itself, so a highly
        # significant tau (p -> 0) collapses to 0.5 while an insignificant one
        # (p -> 1) passes through unchanged. Confirm whether ``1 - pvalue``
        # was intended; behaviour is deliberately left as-is here.
        decision = (res.correlation - 0.5) * res.pvalue + 0.5
        # float() rather than .item(): for constant input SciPy may hand back
        # plain Python ``nan`` values, which have no .item() method.
        outs[metric] = float(decision)
    return outs