From 50290c9d48082b2b0b66b4143bc45a192767c309 Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Wed, 23 Jul 2025 18:18:48 +0800 Subject: [PATCH] collect logprobs, and w and r scores --- judgemark_v2lp/benchmark.py | 19 +++++++-- judgemark_v2lp/scoring.py | 84 +++++++++++++++++++++++++++++++++---- 2 files changed, 91 insertions(+), 12 deletions(-) diff --git a/judgemark_v2lp/benchmark.py b/judgemark_v2lp/benchmark.py index eaf13bd..1e6623c 100644 --- a/judgemark_v2lp/benchmark.py +++ b/judgemark_v2lp/benchmark.py @@ -18,7 +18,7 @@ from judgemark_v2lp.scoring import ( parse_scores, compute_raw_score, compute_detailed_distribution, compute_model_level_stats, compute_cross_model_stats, build_landmark_calibration_config, apply_landmark_calibration, - log_score_summary, confidence_interval_95 + log_score_summary, confidence_interval_95, compute_ranked_score, compute_weighted_score ) from judgemark_v2lp.scoring import compute_detailed_distribution, compute_detailed_distribution # etc from judgemark_v2lp.separability import compute_separability_metrics @@ -58,12 +58,19 @@ def process_sample(model_name: str, iteration_key: str, item_id: str, item_text: judge_response = res_json['choices'][0]['message']['content'] logprobs = res_json['choices'][0]['logprobs']['content'] - extracted_scores = parse_scores(judge_response, logprobs) + extracted_scores, logp = parse_scores(judge_response, logprobs) + extracted_wscores = compute_weighted_score(logp) + extracted_rscores = compute_ranked_score(logp) + raw_score = compute_raw_score(extracted_scores) - + raw_score_w = compute_raw_score(extracted_wscores) + raw_score_r = compute_raw_score(extracted_rscores) + with lock: storage_dict = { "parsed_scores": extracted_scores, + "parsed_weighted_scores": extracted_wscores, + "parsed_ranked_scores": extracted_rscores, "timestamp": datetime.now().isoformat(), "text_length": text_len # res_json['usage']['cost'] @@ -71,9 +78,13 @@ def process_sample(model_name: str, iteration_key: str, item_id: str, item_text: } if raw_score is not None: storage_dict["aggregated_score_raw"] = raw_score + storage_dict["aggregated_score_weighted"] = raw_score_w + storage_dict["aggregated_score_ranked"] = raw_score_r + if save_raw_judge_output: storage_dict["judge_response"] = judge_response - + storage_dict["logprobs"] = logprobs + iteration_dict[item_id] = storage_dict runs[run_key]["results"][model_name][iteration_key] = iteration_dict save_json_file(runs, runs_file) diff --git a/judgemark_v2lp/scoring.py b/judgemark_v2lp/scoring.py index 71d2afa..c6f132a 100644 --- a/judgemark_v2lp/scoring.py +++ b/judgemark_v2lp/scoring.py @@ -2,12 +2,41 @@ import math import statistics import numpy as np import scipy.stats +from scipy.stats import kendalltau from loguru import logger from typing import Dict, List import re from judgemark_v2lp.config.constants import REFERENCE_MODEL_SCORES from judgemark_v2lp.utils.stats import normalize +def detokenize(tokens: List[str]) -> str: + """ + Very rough undo for common Hugging-Face / SentencePiece / BPE markers: + - GPT-2: 'Ġ' prefix → leading space + - SentencePiece: '▁' prefix → leading space + - newline marker: 'Ċ' prefix → space + - BERT WordPieces: '##' prefix → no space, just glue on + Falls back to stripping any leading run of those markers, then + inserting a single space before each token that isn't a '##' piece. + """ + out = "" + for t in tokens: + if "Ċ" in t: + t = t.replace("Ċ", "\n") + + if t.startswith("##"): + # BERT-style subword: glue to previous + out += t[2:] + elif t.startswith("Ġ") or t.startswith("▁"): + # GPT-2 or SentencePiece: prefix with space + if out and not out.endswith(" "): + out += " " + out += t[1:] + else: + # Normal token: just append it + out += t + return out.lstrip() + def parse_scores(judge_model_response: str, logprobs: list) -> Dict[str,float]: """ Extracts zero or more named numeric scores from a text using a simple Regex pattern: @@ -20,15 +49,32 @@ def parse_scores(judge_model_response: str, logprobs: list) -> Dict[str,float]: "Realism Score: 7.5" "Melodramatic: 2" """ - scores = {} + scores: Dict[str, float] = {} + logps ={} + pattern = r"(.*?):\s*(?:Score\s+)?(-?\d+(?:\.\d+)?)" + choices = [str(i) for i in range(11)] + window_size = 20 # Look for lines or statements like "Something: 3.5" or "Something Score 3.5" - pattern = r'(.*?):\s*(?:Score\s+)?(-?\d+(?:\.\d+)?)' - matches = re.findall(pattern, judge_model_response) - for match in matches: - metric_name = match[0].strip() - numeric_val = float(match[1]) - scores[metric_name] = numeric_val - return scores + for ti, row in enumerate(logprobs): + if row['token'] in choices: + # get previous window (incl) to check for regexp + prev_window = [t['token'] for t in logprobs[max(0, ti-window_size):ti+1]] + prev_text = detokenize(prev_window) + matches = re.findall(pattern, prev_text) + if matches: + match = matches[-1] # take the last match in the window + metric_name = match[0].strip() + numeric_val = float(match[1]) + + scores[metric_name] = numeric_val + + logp_dict = {t['token']:t['logprob'] for t in row['top_logprobs']} + logp_arr = [logp_dict.get(c, -100) for c in choices] + + # extra logprob of choices + logps[metric_name] = logp_arr + + return scores, logps def compute_raw_score(scores: Dict[str,float]) -> float: """ @@ -273,3 +319,25 @@ def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict): line = f"{model:.<40} {stats['mean']:.3f} ±{stats['ci95']:.3f}" logger.info(line) logger.info("------------------------------------") + + +def compute_weighted_score(logp): + outs = {} + choices = np.arange(11) # Choices are 0-10 + for metric, logp_arr in logp.items(): + probs = np.exp(logp_arr) + weights = probs / (probs.sum() + 1e-12) + outs[metric] = (weights * choices).sum().item() + + return outs + +def compute_ranked_score(logp): + outs = {} + choices = np.arange(11) # Choices are 0-10 + for metric, logp_arr in logp.items(): + res = kendalltau(choices, logp_arr, variant='b') + # correlation weighted by pvalue + decision =(res.correlation-0.5)*res.pvalue+0.5 + outs[metric] = decision.item() + + return outs