logger etc

This commit is contained in:
wassname
2025-07-23 17:22:36 +08:00
parent 5615c809e1
commit 9bf406768a
12 changed files with 530 additions and 118 deletions
+24 -2
View File
@@ -1,7 +1,28 @@
Fork of Judgemark to test whether weighted-logprob or ranked-logprob scoring works better than the current method.
Changes
- OpenRouter only
- get logprobs
- added options
- `--score-weighted`
- `--score-ranked`
```bash
python judgemark_v2.py \
--judge-model "openai/gpt-4o-mini" \
--samples-file data/judgemark_v2.1_samples.json \
--prompts-file data/judge_prompts.json \
--runs-file my_judgemark_runs.json \
--threads 1 \
--num-runs 1 \
--save-raw-judge-output
```
## Results
TODO
----
# Judgemark V2
@@ -23,14 +44,15 @@ The Judgemark leaderboard can be found here: [https://eqbench.com/judgemark-v2.h
1. **Clone the repository:**
```bash
git clone https://github.com/EQ-bench/Judgemark-v2.git
git clone https://github.com/wassname/Judgemark-v2lp.git
cd Judgemark-v2lp
```
2. **Install Python dependencies** (make sure you're on Python 3.10+):
```bash
pip install -r requirements.txt
uv sync
. .venv/bin/activate # Activate the virtual environment
```
3. **Set up environment variables** to include your judge model's API credentials. For example, if you're using OpenAI-compatible endpoints:
+22 -10
View File
@@ -1,7 +1,7 @@
import sys
import signal
import argparse
import logging
from loguru import logger
import time
@@ -16,13 +16,13 @@ def signal_handler(signum, frame):
"""Handle interrupt signals (SIGINT, SIGTERM)."""
global executor, should_exit
print(f"\n[DEBUG] Signal {signum} caught!")
logging.warning("Signal handler called")
logger.warning("Signal handler called")
should_exit = True
time.sleep(0.1) # Give workers a moment to see the flag
if executor:
logging.info("Shutting down executor from signal handler")
logger.info("Shutting down executor from signal handler")
executor.shutdown(wait=False)
logging.info("Executor shutdown complete")
logger.info("Executor shutdown complete")
sys.exit(1)
def parse_args():
@@ -54,7 +54,7 @@ def parse_args():
parser.add_argument(
'--threads',
type=int,
default=6,
default=0,
help='Number of threads to use'
)
parser.add_argument(
@@ -74,6 +74,18 @@ def parse_args():
default=False,
help='If set, store the raw judge model output in the results JSON (default: false)'
)
parser.add_argument(
'--score-weighted',
action='store_true',
default=False,
help='If set, use weighted scoring for the judge model (default: false)'
)
parser.add_argument(
'--score-ranked',
action='store_true',
default=False,
help='If set, use ranked logprob scoring for the judge model (default: false)'
)
return parser.parse_args()
if __name__ == "__main__":
@@ -90,18 +102,18 @@ if __name__ == "__main__":
# Setup logging
verbosity = get_verbosity(args.verbosity)
setup_logging(verbosity)
logging.debug("Logging initialized")
logger.debug("Logging initialized")
# Check that we have an API key for the judge model
if not API_KEY:
logging.critical("No OPENAI_API_KEY found in environment variables.")
logger.critical("No OPENAI_API_KEY found in environment variables.")
raise ValueError("OPENAI_API_KEY not found in environment variables.")
run_ids = []
for i in range(1, args.num_runs + 1):
if should_exit:
break
logging.info(f"Starting Judgemark-v2 run {i} of {args.num_runs}")
logger.info(f"Starting Judgemark-v2 run {i} of {args.num_runs}")
rid = run_judgemark_v2(
judge_model=args.judge_model,
samples_file=args.samples_file,
@@ -115,11 +127,11 @@ if __name__ == "__main__":
# Finally, print summary
runs = load_json_file(args.runs_file)
logging.info("\nAll Judgemark-v2 runs completed:")
logger.info("\nAll Judgemark-v2 runs completed:")
print("\nAll Judgemark-v2 runs completed:")
for rid in run_ids:
rd = runs.get(rid, {})
final_score = rd.get("final_judgemark_score", "N/A")
logging.info(f"Run ID: {rid}, Final Judgemark Score: {final_score}")
logger.info(f"Run ID: {rid}, Final Judgemark Score: {final_score}")
print(f"Run ID: {rid}")
print(f"Final Judgemark-v2 Score: {final_score}")
+75 -56
View File
@@ -3,7 +3,7 @@ import re
import uuid
import time
import signal
import logging
from loguru import logger
import threading
import concurrent.futures
from datetime import datetime
@@ -14,15 +14,15 @@ from collections import defaultdict
from judgemark_v2lp.utils.file_io import load_json_file, save_json_file
from judgemark_v2lp.utils.api import send_to_judge_model
from judgemark_v2lp.utils.visualization import create_side_by_side_score_charts
from judgemark_v2lp.core.scoring import (
from judgemark_v2lp.scoring import (
parse_scores, compute_raw_score, compute_detailed_distribution,
compute_model_level_stats, compute_cross_model_stats,
build_landmark_calibration_config, apply_landmark_calibration,
log_score_summary, confidence_interval_95
)
from judgemark_v2lp.core.scoring import compute_detailed_distribution, compute_detailed_distribution # etc
from judgemark_v2lp.core.separability import compute_separability_metrics
from judgemark_v2lp.core.stability import run_stability_test, compute_iteration_stability, compute_randomized_iteration_rank_stability_by_item
from judgemark_v2lp.scoring import compute_detailed_distribution, compute_detailed_distribution # etc
from judgemark_v2lp.separability import compute_separability_metrics
from judgemark_v2lp.stability import run_stability_test, compute_iteration_stability, compute_randomized_iteration_rank_stability_by_item
from judgemark_v2lp.utils.stats import normalize, modulate_x_by_y
from judgemark_v2lp.utils.state import should_exit, executor
@@ -74,12 +74,12 @@ def process_sample(model_name: str, iteration_key: str, item_id: str, item_text:
save_json_file(runs, runs_file)
if raw_score is not None:
logging.debug(f"Processed {model_name}/{iteration_key}/{item_id}, raw score: {raw_score:.2f}")
logger.debug(f"Processed {model_name}/{iteration_key}/{item_id}, raw score: {raw_score:.2f}")
else:
logging.warning(f"Failed to parse enough scores for {model_name}/{iteration_key}/{item_id}")
logger.warning(f"Failed to parse enough scores for {model_name}/{iteration_key}/{item_id}")
except Exception as e:
logging.error(f"Error processing item {model_name}/{iteration_key}/{item_id}: {str(e)}")
logger.error(f"Error processing item {model_name}/{iteration_key}/{item_id}: {str(e)}")
with lock:
iteration_dict[item_id] = {
"error": str(e),
@@ -214,10 +214,10 @@ def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data
compute_iteration_stability(run_data, label="calibrated")
random_tau_raw = compute_randomized_iteration_rank_stability_by_item(run_data, label="raw", n_shuffles=1000)
random_tau_cal = compute_randomized_iteration_rank_stability_by_item(run_data, label="calibrated", n_shuffles=1000)
logging.info("Score stability (RAW)")
logging.info(f"Randomized average Kendall's tau (raw): {random_tau_raw:.3f}")
logging.info("Score stability (CALIBRATED)")
logging.info(f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} "
logger.info("Score stability (RAW)")
logger.info(f"Randomized average Kendall's tau (raw): {random_tau_raw:.3f}")
logger.info("Score stability (CALIBRATED)")
logger.info(f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} "
f"({run_data['calibrated_cross_model_stats']['kendall_tau']})")
# 9. Compute the final Judgemark scores (one using raw stats, one using calibrated)
@@ -348,8 +348,8 @@ def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data
run_data["calibrated_model_stats"]
)
logging.info(f"Final Judgemark (raw) = {final_score_raw:.3f}")
logging.info(f"Final Judgemark (cal) = {final_score_calibrated:.3f}")
logger.info(f"Final Judgemark (raw) = {final_score_raw:.3f}")
logger.info(f"Final Judgemark (cal) = {final_score_calibrated:.3f}")
def sanitize_model_name(name: str) -> str:
@@ -367,7 +367,7 @@ def run_judgemark_v2(
) -> str:
global executor, should_exit
logging.info(f"Starting Judgemark-v2 using judge model: {judge_model}")
logger.info(f"Starting Judgemark-v2 using judge model: {judge_model}")
runs = load_json_file(runs_file)
# Form the run key using run_id + "__" + sanitized judge model
@@ -426,9 +426,9 @@ def run_judgemark_v2(
})
if items_to_process:
logging.info(f"Found {len(items_to_process)} items to process in existing run {run_key}")
logger.info(f"Found {len(items_to_process)} items to process in existing run {run_key}")
else:
logging.info(f"No items to process in existing run {run_key}")
logger.info(f"No items to process in existing run {run_key}")
else:
# New run - process all items
@@ -448,47 +448,66 @@ def run_judgemark_v2(
# Ensure concurrency lock
lock = threading.Lock()
# Process any items that need retrying
all_futures = []
try:
with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as exec_:
executor = exec_
if items_to_process:
# Process all items (either retries or new run)
for item in items_to_process:
if should_exit:
break
all_futures.append(
executor.submit(
process_sample,
item["model_name"],
item["iteration_key"],
item["item_id"],
item["item_text"],
item["prompt_template"],
run_key,
runs,
runs_file,
lock,
judge_model,
save_raw_judge_output
)
)
if num_threads <= 1:
# Single-threaded mode
for item in items_to_process:
if should_exit:
break
process_sample(
item["model_name"],
item["iteration_key"],
item["item_id"],
item["item_text"],
item["prompt_template"],
run_key,
runs,
runs_file,
lock,
judge_model,
save_raw_judge_output
)
else:
# Process any items that need retrying
all_futures = []
with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as exec_:
executor = exec_
# Display progress bar for tasks
for f in tqdm(concurrent.futures.as_completed(all_futures),
total=len(all_futures), desc="Judging", leave=True):
if should_exit:
break
try:
f.result()
except Exception as exc:
logging.error(f"Exception in worker thread: {exc}")
if items_to_process:
# Process all items (either retries or new run)
for item in items_to_process:
if should_exit:
break
all_futures.append(
executor.submit(
process_sample,
item["model_name"],
item["iteration_key"],
item["item_id"],
item["item_text"],
item["prompt_template"],
run_key,
runs,
runs_file,
lock,
judge_model,
save_raw_judge_output
)
)
# Display progress bar for tasks
for f in tqdm(concurrent.futures.as_completed(all_futures),
total=len(all_futures), desc="Judging", leave=True):
if should_exit:
break
try:
f.result()
except Exception as exc:
logger.error(f"Exception in worker thread: {exc}")
except KeyboardInterrupt:
logging.warning("KeyboardInterrupt caught in main thread.")
logger.warning("KeyboardInterrupt caught in main thread.")
should_exit = True
time.sleep(0.1)
finally:
@@ -513,9 +532,9 @@ def run_judgemark_v2(
save_json_file(runs, runs_file)
if executor:
logging.info("Shutting down executor")
logger.info("Shutting down executor")
executor.shutdown(wait=False)
executor = None
logging.info(f"Judgemark-v2 run {run_key} ended with status: {status}")
logger.info(f"Judgemark-v2 run {run_key} ended with status: {status}")
return run_key
+10 -10
View File
@@ -2,7 +2,7 @@ import math
import statistics
import numpy as np
import scipy.stats
import logging
from loguru import logger
from typing import Dict, List
import re
from judgemark_v2lp.config.constants import REFERENCE_MODEL_SCORES
@@ -256,14 +256,14 @@ def apply_landmark_calibration(x, config):
def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict):
"""Log a readable summary of score statistics."""
logging.info(f"\n------- {score_type} Summary -------")
logging.info(f"ANOVA F-value: {cross_stats['anova_f']:.4f}, p={cross_stats['anova_p']:.4f}")
logging.info(f"Kruskal-Wallis: {cross_stats['kw_stat']:.4f}, p={cross_stats['kw_p']:.4f}")
logging.info(f"Pearson r={cross_stats['pearson_r']:.4f}")
logging.info(f"Kendall τ={cross_stats['kendall_tau']:.4f}")
logging.info(f"Std.Dev across models: {cross_stats['std_dev_across_models']:.4f}")
logger.info(f"\n------- {score_type} Summary -------")
logger.info(f"ANOVA F-value: {cross_stats['anova_f']:.4f}, p={cross_stats['anova_p']:.4f}")
logger.info(f"Kruskal-Wallis: {cross_stats['kw_stat']:.4f}, p={cross_stats['kw_p']:.4f}")
logger.info(f"Pearson r={cross_stats['pearson_r']:.4f}")
logger.info(f"Kendall τ={cross_stats['kendall_tau']:.4f}")
logger.info(f"Std.Dev across models: {cross_stats['std_dev_across_models']:.4f}")
logging.info("\nModel Scores:")
logger.info("\nModel Scores:")
sorted_models = sorted(
model_stats.items(),
key=lambda kv: kv[1]["mean"],
@@ -271,5 +271,5 @@ def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict):
)
for model, stats in sorted_models:
line = f"{model:.<40} {stats['mean']:.3f} ±{stats['ci95']:.3f}"
logging.info(line)
logging.info("------------------------------------")
logger.info(line)
logger.info("------------------------------------")
+8 -8
View File
@@ -1,5 +1,5 @@
import logging
from loguru import logger
import math
import statistics
import numpy as np
@@ -296,13 +296,13 @@ def compute_separability_metrics(
metrics_label["modulated_ci95"] = modulated_ci95
# Logging summary
logging.info(f"\n--- {label.upper()} SEPARABILITY METRICS ---")
logging.info(f"Adjacent 99% CI Overlap fraction: {adj_frac_overlap:.3f}")
logging.info(f"Sum of adjacent 99% CI Overlap magnitude (scale={scale_factor}): "
logger.info(f"\n--- {label.upper()} SEPARABILITY METRICS ---")
logger.info(f"Adjacent 99% CI Overlap fraction: {adj_frac_overlap:.3f}")
logger.info(f"Sum of adjacent 99% CI Overlap magnitude (scale={scale_factor}): "
f"{sum_overlap_magnitude:.3f}")
logging.info(f"CI99 Overlap pct: "
logger.info(f"CI99 Overlap pct: "
f"{ci99_overlap_percentage_adjacent_avg:.3f}")
logging.info(f"Avg. |Cohen's d| for adjacent pairs: {avg_cohens_d:.3f}")
logging.info(f"Average EMD across all pairs: {emd_data['average']:.3f}")
logging.info(f"Avg. CI95 half-width: {avg_ci95:.3f} (modulated: {modulated_ci95:.3f})")
logger.info(f"Avg. |Cohen's d| for adjacent pairs: {avg_cohens_d:.3f}")
logger.info(f"Average EMD across all pairs: {emd_data['average']:.3f}")
logger.info(f"Avg. CI95 half-width: {avg_ci95:.3f} (modulated: {modulated_ci95:.3f})")
+7 -7
View File
@@ -1,4 +1,4 @@
import logging
from loguru import logger
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
@@ -368,12 +368,12 @@ def process_stability_test_item(model_name, iteration_key, item_id, item_text, p
# Only return actual valid scores, never None
return item_score if isinstance(item_score, (int, float)) and item_score > 0.0 else 0.0
except Exception as e:
logging.error(f"Error in stability test item {model_name}/{iteration_key}/{item_id}: {str(e)}")
logger.error(f"Error in stability test item {model_name}/{iteration_key}/{item_id}: {str(e)}")
return 0.0
def run_stability_test(run_data, judge_model, judge_prompts, samples_data, runs, runs_file, lock, num_threads):
"""Run stability test, retrying any missing entries to reach STABILITY_REPS per item."""
logging.info("Running stability test for selected items...")
logger.info("Running stability test for selected items...")
if "stability_test_results" not in run_data:
run_data["stability_test_results"] = {}
@@ -402,13 +402,13 @@ def run_stability_test(run_data, judge_model, judge_prompts, samples_data, runs,
"key_name": key_name
})
logging.info(f"Need {needed_count} more stability test results for {key_name}")
logger.info(f"Need {needed_count} more stability test results for {key_name}")
# Clean up existing results, keeping only valid scores
run_data["stability_test_results"][key_name] = valid_results
if not items_to_process:
logging.info("All stability test items already have complete results")
logger.info("All stability test items already have complete results")
return
with ThreadPoolExecutor(max_workers=num_threads) as exec_:
@@ -443,6 +443,6 @@ def run_stability_test(run_data, judge_model, judge_prompts, samples_data, runs,
run_data["stability_test_results"][key_name].append(score)
save_json_file(runs, runs_file)
else:
logging.warning(f"Got invalid score for stability item {key_name}, will need retry")
logger.warning(f"Got invalid score for stability item {key_name}, will need retry")
except Exception as exc:
logging.error(f"Exception in stability test: {exc}")
logger.error(f"Exception in stability test: {exc}")
+21 -7
View File
@@ -1,6 +1,6 @@
import os
import time
import logging
from loguru import logger
import requests
from typing import List, Dict
from dotenv import load_dotenv
@@ -32,23 +32,37 @@ def send_to_judge_model(messages: List[Dict], judge_model: str, max_retries: int
"temperature": 0.5,
"top_k": 3,
"max_tokens": 8096,
#"provider": {
"logprobs": True,
"top_logprobs": 20,
## openrouter specific
"provider": {
"require_parameters": True,
# "order": [
# "DeepSeek",
# "DeepInfra",
# "Nebius"
# ],
# "allow_fallbacks": False
#}
},
"usage": {"include": True},
}
response = requests.post(BASE_URL, headers=HEADERS, json=data)
response.raise_for_status()
res_json = response.json()
if "error" in res_json:
raise requests.exceptions.HTTPError(res_json['error'])
response.raise_for_status()
return res_json['choices'][0]['message']['content']
except Exception as e:
logging.error(f"Error on attempt {attempt} for judge model {judge_model}: {e}")
try:
logger.debug(response.text)
except:
pass
logger.error(f"Error on attempt {attempt} for judge model {judge_model}: {e}")
# TODO print response header or erro
if attempt == max_retries:
logging.critical(f"Max retries reached for judge model {judge_model}")
logger.critical(f"Max retries reached for judge model {judge_model}")
raise
time.sleep(RETRY_DELAY)
return ""
return ""
+3 -3
View File
@@ -1,5 +1,5 @@
import json
import logging
from loguru import logger
def load_json_file(file_path: str) -> dict:
"""Loads a JSON file (returns empty if not found)."""
@@ -7,11 +7,11 @@ def load_json_file(file_path: str) -> dict:
with open(file_path, 'r', encoding='utf-8') as f:
return json.load(f)
except FileNotFoundError:
logging.warning(f"File {file_path} not found, returning empty dict.")
logger.warning(f"File {file_path} not found, returning empty dict.")
return {}
def save_json_file(data: dict, file_path: str):
    """Write ``data`` to ``file_path`` as indented UTF-8 JSON.

    Args:
        data: JSON-serialisable dictionary to persist.
        file_path: Destination path; opened in ``'w'`` mode, so existing
            content is replaced.
    """
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)
    # Single log call through loguru, matching the rest of the module.
    logger.debug(f"Saved JSON data to {file_path}")
+6 -15
View File
@@ -1,24 +1,15 @@
import os
import logging
import sys
# from loguru import logger
from loguru import logger
def setup_logging(verbosity: str):
    """Configure loguru to log to stderr at the requested verbosity.

    Args:
        verbosity: Level name such as ``"debug"`` or ``"INFO"``
            (case-insensitive). Empty or unrecognised names fall back to
            ``"INFO"``.
    """
    # loguru's built-in level names are upper-case. Normalise the input and
    # fall back to INFO for unknown names, as the earlier stdlib-logging
    # version of this function did.
    known_levels = {
        "TRACE", "DEBUG", "INFO", "SUCCESS", "WARNING", "ERROR", "CRITICAL",
    }
    level = (verbosity or "").upper()
    if level not in known_levels:
        level = "INFO"

    # Remove previously registered handlers so only one stderr sink is active.
    logger.remove()
    logger.add(sys.stderr, level=level)
def get_verbosity(args_verbosity: str) -> str:
    """Determine the verbosity level from command-line or environment.

    Args:
        args_verbosity: Value supplied on the command line; may be empty or
            ``None`` when the option was not given.

    Returns:
        ``args_verbosity`` when it is truthy, otherwise the value of the
        ``LOG_VERBOSITY`` environment variable, otherwise ``"INFO"``.
    """
    if args_verbosity:
        return args_verbosity
    return os.getenv("LOG_VERBOSITY", "INFO")
+329
View File
@@ -0,0 +1,329 @@
{
"8fd38fec-5b00-4199-a74f-db423a762d8f__openai_gpt-4o-mini": {
"judge_model": "openai/gpt-4o-mini",
"start_time": "2025-07-23T17:11:03.844488",
"status": "running",
"samples_file": "data/judgemark_v2.1_samples.json",
"prompts_file": "data/judge_prompts.json",
"results": {
"claude-3-5-sonnet-20240620": {
"1": {
"2": {
"parsed_scores": {
"Original; Not Derivative": 8.0,
"Meaningful Integration of Political and Social Context": 9.0,
"Nuanced and Insightful Portrayal of Gladiator's Inner Life": 9.0,
"Reads Like Part of a Larger Story": 8.0,
"Authentic and Engrossing Ancient Roman Setting": 9.0,
"Vivid and Immersive Sensory Details": 9.0,
"Imagery and Descriptive Quality": 8.0,
"Elegant Prose": 8.0,
"Emotionally Engaging": 9.0,
"Emotionally Complex": 8.0,
"Coherent": 10.0,
"Adherence to Instructions": 9.0,
"Believable Character Actions": 9.0,
"Nuanced Characters": 8.0,
"Consistent Voice/Tone of Writing": 9.0,
"Meandering": 1.0,
"Weak Dialogue": 2.0,
"Tell-Don't-Show": 1.0,
"Unsurprising or Uncreative": 2.0,
"Amateurish": 1.0,
"Purple Prose": 2.0,
"Overwrought": 1.0,
"Incongruent Ending Positivity": 1.0,
"Unearned Transformations": 1.0,
"Well-earned Lightness or Darkness": 8.0,
"Sentences Flow Naturally": 9.0,
"Overall Reader Engagement": 9.0,
"Overall Impression": 9.0
},
"timestamp": "2025-07-23T17:13:06.569955",
"text_length": 5262,
"aggregated_score_raw": 8.68,
"judge_response": "[Analysis]\n\nThe test model's response offers a vivid and immersive portrayal of a day in the life of a gladiator, skillfully integrating sensory details to create an authentic ancient Roman setting. The narrative captures the protagonist's physical and emotional struggles, reflecting on the larger social and political issues of the time, such as the disparity between the patricians and the plebeians. The inner life of the gladiator is nuanced and insightful, revealing a deep sense of resentment and longing for freedom, which adds complexity to the character.\n\nThe writing reads cohesively as part of a larger story, with a clear trajectory that hints at future developments in the protagonist's journey. The sensory details are particularly strong, with effective imagery that engages the reader's senses and brings the setting to life. The prose is generally elegant, with a good command of language that avoids excessive ornamentation, although there are moments where the description could be trimmed for tighter pacing.\n\nEmotionally, the piece resonates well, as it explores themes of identity, struggle, and the desire for agency, making it relatable and compelling. The character's actions and reactions feel believable and consistent with his circumstances, and the voice throughout remains steady and appropriate for the setting.\n\nWhile the dialogue is minimal, it serves its purpose, providing insights into character relationships and the gladiatorial world without feeling forced. The narrative does not meander, maintaining focus on the protagonist's experiences and thoughts. 
There are no instances of unearned positivity or transformations, and the ending feels congruent with the overall tone of the piece.\n\nOverall, the response is engaging and well-crafted, successfully meeting the prompt's requirements while delivering a strong emotional and thematic impact.\n\n[Scores]\n\nOriginal; Not Derivative: 8 \nMeaningful Integration of Political and Social Context: 9 \nNuanced and Insightful Portrayal of Gladiator's Inner Life: 9 \nReads Like Part of a Larger Story: 8 \nAuthentic and Engrossing Ancient Roman Setting: 9 \nVivid and Immersive Sensory Details: 9 \nImagery and Descriptive Quality: 8 \nElegant Prose: 8 \nEmotionally Engaging: 9 \nEmotionally Complex: 8 \nCoherent: 10 \nAdherence to Instructions: 9 \nBelievable Character Actions: 9 \nNuanced Characters: 8 \nConsistent Voice/Tone of Writing: 9 \nMeandering: 1 \nWeak Dialogue: 2 \nTell-Don't-Show: 1 \nUnsurprising or Uncreative: 2 \nAmateurish: 1 \nPurple Prose: 2 \nOverwrought: 1 \nIncongruent Ending Positivity: 1 \nUnearned Transformations: 1 \nWell-earned Lightness or Darkness: 8 \nSentences Flow Naturally: 9 \nOverall Reader Engagement: 9 \nOverall Impression: 9 "
}
}
}
}
},
"c4337076-bb5e-4213-98ac-558fb7a65406__openai_gpt-4o-mini": {
"judge_model": "openai/gpt-4o-mini",
"start_time": "2025-07-23T17:15:02.537833",
"status": "running",
"samples_file": "data/judgemark_v2.1_samples.json",
"prompts_file": "data/judge_prompts.json",
"results": {
"claude-3-5-sonnet-20240620": {
"1": {
"2": {
"error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions",
"timestamp": "2025-07-23T17:15:13.424622"
},
"6": {
"error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions",
"timestamp": "2025-07-23T17:15:23.701266"
},
"9": {
"error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions",
"timestamp": "2025-07-23T17:15:33.975049"
},
"10": {
"error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions",
"timestamp": "2025-07-23T17:15:44.290077"
},
"19": {
"error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions",
"timestamp": "2025-07-23T17:15:54.619007"
},
"20": {
"error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions",
"timestamp": "2025-07-23T17:16:04.990844"
}
}
}
},
"errors": [
{
"model": "claude-3-5-sonnet-20240620",
"iteration": "1",
"item_id": "2",
"error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions"
},
{
"model": "claude-3-5-sonnet-20240620",
"iteration": "1",
"item_id": "6",
"error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions"
},
{
"model": "claude-3-5-sonnet-20240620",
"iteration": "1",
"item_id": "9",
"error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions"
},
{
"model": "claude-3-5-sonnet-20240620",
"iteration": "1",
"item_id": "10",
"error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions"
},
{
"model": "claude-3-5-sonnet-20240620",
"iteration": "1",
"item_id": "19",
"error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions"
},
{
"model": "claude-3-5-sonnet-20240620",
"iteration": "1",
"item_id": "20",
"error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions"
}
]
},
"ae5dd791-be66-4ec7-a73e-22f9cb18273e__openai_gpt-4o-mini": {
"judge_model": "openai/gpt-4o-mini",
"start_time": "2025-07-23T17:17:04.826144",
"status": "running",
"samples_file": "data/judgemark_v2.1_samples.json",
"prompts_file": "data/judge_prompts.json",
"results": {
"claude-3-5-sonnet-20240620": {
"1": {
"2": {
"error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions",
"timestamp": "2025-07-23T17:17:15.449268"
},
"6": {
"error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions",
"timestamp": "2025-07-23T17:17:25.869672"
},
"9": {
"error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions",
"timestamp": "2025-07-23T17:17:36.203090"
},
"10": {
"error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions",
"timestamp": "2025-07-23T17:17:46.855039"
}
}
}
},
"errors": [
{
"model": "claude-3-5-sonnet-20240620",
"iteration": "1",
"item_id": "2",
"error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions"
},
{
"model": "claude-3-5-sonnet-20240620",
"iteration": "1",
"item_id": "6",
"error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions"
},
{
"model": "claude-3-5-sonnet-20240620",
"iteration": "1",
"item_id": "9",
"error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions"
},
{
"model": "claude-3-5-sonnet-20240620",
"iteration": "1",
"item_id": "10",
"error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions"
}
]
},
"62c4fff4-0382-47db-b419-d5acfb30ce1b__openai_gpt-4o-mini": {
"judge_model": "openai/gpt-4o-mini",
"start_time": "2025-07-23T17:18:48.064895",
"status": "running",
"samples_file": "data/judgemark_v2.1_samples.json",
"prompts_file": "data/judge_prompts.json",
"results": {
"claude-3-5-sonnet-20240620": {
"1": {
"2": {
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
"timestamp": "2025-07-23T17:18:58.737280"
},
"6": {
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
"timestamp": "2025-07-23T17:19:09.036124"
},
"9": {
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
"timestamp": "2025-07-23T17:19:19.566282"
},
"10": {
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
"timestamp": "2025-07-23T17:19:29.887922"
},
"19": {
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
"timestamp": "2025-07-23T17:19:40.198755"
},
"20": {
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
"timestamp": "2025-07-23T17:19:50.559484"
},
"22": {
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
"timestamp": "2025-07-23T17:20:00.907318"
}
}
}
},
"errors": [
{
"model": "claude-3-5-sonnet-20240620",
"iteration": "1",
"item_id": "2",
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
},
{
"model": "claude-3-5-sonnet-20240620",
"iteration": "1",
"item_id": "6",
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
},
{
"model": "claude-3-5-sonnet-20240620",
"iteration": "1",
"item_id": "9",
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
},
{
"model": "claude-3-5-sonnet-20240620",
"iteration": "1",
"item_id": "10",
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
},
{
"model": "claude-3-5-sonnet-20240620",
"iteration": "1",
"item_id": "19",
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
},
{
"model": "claude-3-5-sonnet-20240620",
"iteration": "1",
"item_id": "20",
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
},
{
"model": "claude-3-5-sonnet-20240620",
"iteration": "1",
"item_id": "22",
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
}
]
},
"31ffba42-2510-454b-8b48-4965f32b7b01__openai_gpt-4o-mini": {
"judge_model": "openai/gpt-4o-mini",
"start_time": "2025-07-23T17:20:50.781947",
"status": "running",
"samples_file": "data/judgemark_v2.1_samples.json",
"prompts_file": "data/judge_prompts.json",
"results": {}
},
"5bb9aad6-8fc4-4b2d-aeea-c836534a110a__openai_gpt-4o-mini": {
"judge_model": "openai/gpt-4o-mini",
"start_time": "2025-07-23T17:20:56.195127",
"status": "running",
"samples_file": "data/judgemark_v2.1_samples.json",
"prompts_file": "data/judge_prompts.json",
"results": {
"claude-3-5-sonnet-20240620": {
"1": {
"2": {
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
"timestamp": "2025-07-23T17:21:06.605472"
},
"6": {
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
"timestamp": "2025-07-23T17:21:17.026079"
},
"9": {
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
"timestamp": "2025-07-23T17:21:27.317695"
},
"10": {
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
"timestamp": "2025-07-23T17:21:37.664878"
}
}
}
},
"errors": [
{
"model": "claude-3-5-sonnet-20240620",
"iteration": "1",
"item_id": "2",
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
},
{
"model": "claude-3-5-sonnet-20240620",
"iteration": "1",
"item_id": "6",
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
},
{
"model": "claude-3-5-sonnet-20240620",
"iteration": "1",
"item_id": "9",
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
},
{
"model": "claude-3-5-sonnet-20240620",
"iteration": "1",
"item_id": "10",
"error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
}
]
}
}
+1
View File
@@ -5,6 +5,7 @@ description = "**Judgemark V2** is a benchmark that evaluates how well a languag
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"loguru>=0.7.3",
"matplotlib>=3.7",
"python-dotenv>=1.1.1",
"scipy>=1.10",
Generated
+24
View File
@@ -527,6 +527,7 @@ name = "judgemark-v2lp"
version = "0.1.0"
source = { editable = "." }
dependencies = [
{ name = "loguru" },
{ name = "matplotlib" },
{ name = "python-dotenv" },
{ name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
@@ -542,6 +543,7 @@ dev = [
[package.metadata]
requires-dist = [
{ name = "loguru", specifier = ">=0.7.3" },
{ name = "matplotlib", specifier = ">=3.7" },
{ name = "python-dotenv", specifier = ">=1.1.1" },
{ name = "scipy", specifier = ">=1.10" },
@@ -680,6 +682,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/3a/1d/50ad811d1c5dae091e4cf046beba925bcae0a610e79ae4c538f996f63ed5/kiwisolver-1.4.8-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:65ea09a5a3faadd59c2ce96dc7bf0f364986a315949dc6374f04396b0d60e09b", size = 71762, upload-time = "2024-12-24T18:30:48.903Z" },
]
[[package]]
name = "loguru"
version = "0.7.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "colorama", marker = "sys_platform == 'win32'" },
{ name = "win32-setctime", marker = "sys_platform == 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3a/05/a1dae3dffd1116099471c643b8924f5aa6524411dc6c63fdae648c4f1aca/loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6", size = 63559, upload-time = "2024-12-06T11:20:56.608Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c", size = 61595, upload-time = "2024-12-06T11:20:54.538Z" },
]
[[package]]
name = "matplotlib"
version = "3.10.3"
@@ -1600,3 +1615,12 @@ sdist = { url = "https://files.pythonhosted.org/packages/41/53/2e0253c5efd69c965
wheels = [
{ url = "https://files.pythonhosted.org/packages/ca/51/5447876806d1088a0f8f71e16542bf350918128d0a69437df26047c8e46f/widgetsnbextension-4.0.14-py3-none-any.whl", hash = "sha256:4875a9eaf72fbf5079dc372a51a9f268fc38d46f767cbf85c43a36da5cb9b575", size = 2196503, upload-time = "2025-04-10T13:01:23.086Z" },
]
[[package]]
name = "win32-setctime"
version = "1.2.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/b3/8f/705086c9d734d3b663af0e9bb3d4de6578d08f46b1b101c2442fd9aecaa2/win32_setctime-1.2.0.tar.gz", hash = "sha256:ae1fdf948f5640aae05c511ade119313fb6a30d7eabe25fef9764dca5873c4c0", size = 4867, upload-time = "2024-12-07T15:28:28.314Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083, upload-time = "2024-12-07T15:28:26.465Z" },
]