From 9bf406768a5a84689bdf756996331dde70e0b9cf Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Wed, 23 Jul 2025 17:22:36 +0800 Subject: [PATCH] logger etc --- README.md | 26 +- judgemark-v2.py | 32 ++- judgemark_v2lp/benchmark.py | 131 +++++----- judgemark_v2lp/scoring.py | 20 +- judgemark_v2lp/separability.py | 16 +- judgemark_v2lp/stability.py | 14 +- judgemark_v2lp/utils/api.py | 28 ++- judgemark_v2lp/utils/file_io.py | 6 +- judgemark_v2lp/utils/logging_setup.py | 21 +- my_judgemark_runs.json | 329 ++++++++++++++++++++++++++ pyproject.toml | 1 + uv.lock | 24 ++ 12 files changed, 530 insertions(+), 118 deletions(-) create mode 100644 my_judgemark_runs.json diff --git a/README.md b/README.md index 91a1c6f..8eadfa2 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,28 @@ Fork of judgemark to see if using weighted logprob, or ranklogprob work better than the current method +Changes +- OpenRouter only +- get logprobs +- added options + - `--score-weighted` + - `--score-ranked` +```bash +python judgemark-v2.py \ + --judge-model "openai/gpt-4o-mini" \ + --samples-file data/judgemark_v2.1_samples.json \ + --prompts-file data/judge_prompts.json \ + --runs-file my_judgemark_runs.json \ + --threads 1 \ + --num-runs 1 \ + --save-raw-judge-output +``` + +## Results + +TODO + ---- # Judgemark V2 @@ -23,14 +44,15 @@ The Judgemark leaderboard can be found here: [https://eqbench.com/judgemark-v2.h 1. **Clone the repository:** ```bash - git clone https://github.com/EQ-bench/Judgemark-v2.git + git clone https://github.com/wassname/Judgemark-v2lp.git cd Judgemark-v2 ``` 2. **Install Python dependencies** (make sure you’re on Python 3.9+): ```bash - pip install -r requirements.txt + uv sync + . .venv/bin/activate # Activate the virtual environment ``` 3. **Set up environment variables** to include your judge model’s API credentials. 
For example, if you’re using OpenAI-compatible endpoints: diff --git a/judgemark-v2.py b/judgemark-v2.py index 1f938e0..ddec090 100644 --- a/judgemark-v2.py +++ b/judgemark-v2.py @@ -1,7 +1,7 @@ import sys import signal import argparse -import logging +from loguru import logger import time @@ -16,13 +16,13 @@ def signal_handler(signum, frame): """Handle interrupt signals (SIGINT, SIGTERM).""" global executor, should_exit print(f"\n[DEBUG] Signal {signum} caught!") - logging.warning("Signal handler called") + logger.warning("Signal handler called") should_exit = True time.sleep(0.1) # Give workers a moment to see the flag if executor: - logging.info("Shutting down executor from signal handler") + logger.info("Shutting down executor from signal handler") executor.shutdown(wait=False) - logging.info("Executor shutdown complete") + logger.info("Executor shutdown complete") sys.exit(1) def parse_args(): @@ -54,7 +54,7 @@ def parse_args(): parser.add_argument( '--threads', type=int, - default=6, + default=0, help='Number of threads to use' ) parser.add_argument( @@ -74,6 +74,18 @@ def parse_args(): default=False, help='If set, store the raw judge model output in the results JSON (default: false)' ) + parser.add_argument( + '--score-weighted', + action='store_true', + default=False, + help='If set, use weighted scoring for the judge model (default: false)' + ) + parser.add_argument( + '--score-ranked', + action='store_true', + default=False, + help='If set, use ranked logprob scoring for the judge model (default: false)' + ) return parser.parse_args() if __name__ == "__main__": @@ -90,18 +102,18 @@ if __name__ == "__main__": # Setup logging verbosity = get_verbosity(args.verbosity) setup_logging(verbosity) - logging.debug("Logging initialized") + logger.debug("Logging initialized") # Check that we have an API key for the judge model if not API_KEY: - logging.critical("No OPENAI_API_KEY found in environment variables.") + logger.critical("No OPENAI_API_KEY found in 
environment variables.") raise ValueError("OPENAI_API_KEY not found in environment variables.") run_ids = [] for i in range(1, args.num_runs + 1): if should_exit: break - logging.info(f"Starting Judgemark-v2 run {i} of {args.num_runs}") + logger.info(f"Starting Judgemark-v2 run {i} of {args.num_runs}") rid = run_judgemark_v2( judge_model=args.judge_model, samples_file=args.samples_file, @@ -115,11 +127,11 @@ if __name__ == "__main__": # Finally, print summary runs = load_json_file(args.runs_file) - logging.info("\nAll Judgemark-v2 runs completed:") + logger.info("\nAll Judgemark-v2 runs completed:") print("\nAll Judgemark-v2 runs completed:") for rid in run_ids: rd = runs.get(rid, {}) final_score = rd.get("final_judgemark_score", "N/A") - logging.info(f"Run ID: {rid}, Final Judgemark Score: {final_score}") + logger.info(f"Run ID: {rid}, Final Judgemark Score: {final_score}") print(f"Run ID: {rid}") print(f"Final Judgemark-v2 Score: {final_score}") diff --git a/judgemark_v2lp/benchmark.py b/judgemark_v2lp/benchmark.py index 72c31dc..791d32b 100644 --- a/judgemark_v2lp/benchmark.py +++ b/judgemark_v2lp/benchmark.py @@ -3,7 +3,7 @@ import re import uuid import time import signal -import logging +from loguru import logger import threading import concurrent.futures from datetime import datetime @@ -14,15 +14,15 @@ from collections import defaultdict from judgemark_v2lp.utils.file_io import load_json_file, save_json_file from judgemark_v2lp.utils.api import send_to_judge_model from judgemark_v2lp.utils.visualization import create_side_by_side_score_charts -from judgemark_v2lp.core.scoring import ( +from judgemark_v2lp.scoring import ( parse_scores, compute_raw_score, compute_detailed_distribution, compute_model_level_stats, compute_cross_model_stats, build_landmark_calibration_config, apply_landmark_calibration, log_score_summary, confidence_interval_95 ) -from judgemark_v2lp.core.scoring import compute_detailed_distribution, compute_detailed_distribution # etc -from 
judgemark_v2lp.core.separability import compute_separability_metrics -from judgemark_v2lp.core.stability import run_stability_test, compute_iteration_stability, compute_randomized_iteration_rank_stability_by_item +from judgemark_v2lp.scoring import compute_detailed_distribution, compute_detailed_distribution # etc +from judgemark_v2lp.separability import compute_separability_metrics +from judgemark_v2lp.stability import run_stability_test, compute_iteration_stability, compute_randomized_iteration_rank_stability_by_item from judgemark_v2lp.utils.stats import normalize, modulate_x_by_y from judgemark_v2lp.utils.state import should_exit, executor @@ -74,12 +74,12 @@ def process_sample(model_name: str, iteration_key: str, item_id: str, item_text: save_json_file(runs, runs_file) if raw_score is not None: - logging.debug(f"Processed {model_name}/{iteration_key}/{item_id}, raw score: {raw_score:.2f}") + logger.debug(f"Processed {model_name}/{iteration_key}/{item_id}, raw score: {raw_score:.2f}") else: - logging.warning(f"Failed to parse enough scores for {model_name}/{iteration_key}/{item_id}") + logger.warning(f"Failed to parse enough scores for {model_name}/{iteration_key}/{item_id}") except Exception as e: - logging.error(f"Error processing item {model_name}/{iteration_key}/{item_id}: {str(e)}") + logger.error(f"Error processing item {model_name}/{iteration_key}/{item_id}: {str(e)}") with lock: iteration_dict[item_id] = { "error": str(e), @@ -214,10 +214,10 @@ def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data compute_iteration_stability(run_data, label="calibrated") random_tau_raw = compute_randomized_iteration_rank_stability_by_item(run_data, label="raw", n_shuffles=1000) random_tau_cal = compute_randomized_iteration_rank_stability_by_item(run_data, label="calibrated", n_shuffles=1000) - logging.info("Score stability (RAW)") - logging.info(f"Randomized average Kendall's tau (raw): {random_tau_raw:.3f}") - logging.info("Score stability 
(CALIBRATED)") - logging.info(f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} " + logger.info("Score stability (RAW)") + logger.info(f"Randomized average Kendall's tau (raw): {random_tau_raw:.3f}") + logger.info("Score stability (CALIBRATED)") + logger.info(f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} " f"({run_data['calibrated_cross_model_stats']['kendall_tau']})") # 9. Compute the final Judgemark scores (one using raw stats, one using calibrated) @@ -348,8 +348,8 @@ def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data run_data["calibrated_model_stats"] ) - logging.info(f"Final Judgemark (raw) = {final_score_raw:.3f}") - logging.info(f"Final Judgemark (cal) = {final_score_calibrated:.3f}") + logger.info(f"Final Judgemark (raw) = {final_score_raw:.3f}") + logger.info(f"Final Judgemark (cal) = {final_score_calibrated:.3f}") def sanitize_model_name(name: str) -> str: @@ -367,7 +367,7 @@ def run_judgemark_v2( ) -> str: global executor, should_exit - logging.info(f"Starting Judgemark-v2 using judge model: {judge_model}") + logger.info(f"Starting Judgemark-v2 using judge model: {judge_model}") runs = load_json_file(runs_file) # Form the run key using run_id + "__" + sanitized judge model @@ -426,9 +426,9 @@ def run_judgemark_v2( }) if items_to_process: - logging.info(f"Found {len(items_to_process)} items to process in existing run {run_key}") + logger.info(f"Found {len(items_to_process)} items to process in existing run {run_key}") else: - logging.info(f"No items to process in existing run {run_key}") + logger.info(f"No items to process in existing run {run_key}") else: # New run - process all items @@ -448,47 +448,66 @@ def run_judgemark_v2( # Ensure concurrency lock lock = threading.Lock() - # Process any items that need retrying - all_futures = [] try: - with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as exec_: - executor = exec_ - - if items_to_process: - # Process all 
items (either retries or new run) - for item in items_to_process: - if should_exit: - break - - all_futures.append( - executor.submit( - process_sample, - item["model_name"], - item["iteration_key"], - item["item_id"], - item["item_text"], - item["prompt_template"], - run_key, - runs, - runs_file, - lock, - judge_model, - save_raw_judge_output - ) - ) + if num_threads <= 1: + # Single-threaded mode + for item in items_to_process: + if should_exit: + break + process_sample( + item["model_name"], + item["iteration_key"], + item["item_id"], + item["item_text"], + item["prompt_template"], + run_key, + runs, + runs_file, + lock, + judge_model, + save_raw_judge_output + ) + else: + # Process any items that need retrying + all_futures = [] + with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as exec_: + executor = exec_ - # Display progress bar for tasks - for f in tqdm(concurrent.futures.as_completed(all_futures), - total=len(all_futures), desc="Judging", leave=True): - if should_exit: - break - try: - f.result() - except Exception as exc: - logging.error(f"Exception in worker thread: {exc}") + if items_to_process: + # Process all items (either retries or new run) + for item in items_to_process: + if should_exit: + break + + all_futures.append( + executor.submit( + process_sample, + item["model_name"], + item["iteration_key"], + item["item_id"], + item["item_text"], + item["prompt_template"], + run_key, + runs, + runs_file, + lock, + judge_model, + save_raw_judge_output + ) + ) + + # Display progress bar for tasks + for f in tqdm(concurrent.futures.as_completed(all_futures), + total=len(all_futures), desc="Judging", leave=True): + if should_exit: + break + try: + f.result() + except Exception as exc: + logger.error(f"Exception in worker thread: {exc}") except KeyboardInterrupt: - logging.warning("KeyboardInterrupt caught in main thread.") + logger.warning("KeyboardInterrupt caught in main thread.") should_exit = True time.sleep(0.1) finally: @@ -513,9 
+532,9 @@ def run_judgemark_v2( save_json_file(runs, runs_file) if executor: - logging.info("Shutting down executor") + logger.info("Shutting down executor") executor.shutdown(wait=False) executor = None - logging.info(f"Judgemark-v2 run {run_key} ended with status: {status}") + logger.info(f"Judgemark-v2 run {run_key} ended with status: {status}") return run_key diff --git a/judgemark_v2lp/scoring.py b/judgemark_v2lp/scoring.py index 9cebcc6..9a5ba7c 100644 --- a/judgemark_v2lp/scoring.py +++ b/judgemark_v2lp/scoring.py @@ -2,7 +2,7 @@ import math import statistics import numpy as np import scipy.stats -import logging +from loguru import logger from typing import Dict, List import re from judgemark_v2lp.config.constants import REFERENCE_MODEL_SCORES @@ -256,14 +256,14 @@ def apply_landmark_calibration(x, config): def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict): """Log a readable summary of score statistics.""" - logging.info(f"\n------- {score_type} Summary -------") - logging.info(f"ANOVA F-value: {cross_stats['anova_f']:.4f}, p={cross_stats['anova_p']:.4f}") - logging.info(f"Kruskal-Wallis: {cross_stats['kw_stat']:.4f}, p={cross_stats['kw_p']:.4f}") - logging.info(f"Pearson r={cross_stats['pearson_r']:.4f}") - logging.info(f"Kendall τ={cross_stats['kendall_tau']:.4f}") - logging.info(f"Std.Dev across models: {cross_stats['std_dev_across_models']:.4f}") + logger.info(f"\n------- {score_type} Summary -------") + logger.info(f"ANOVA F-value: {cross_stats['anova_f']:.4f}, p={cross_stats['anova_p']:.4f}") + logger.info(f"Kruskal-Wallis: {cross_stats['kw_stat']:.4f}, p={cross_stats['kw_p']:.4f}") + logger.info(f"Pearson r={cross_stats['pearson_r']:.4f}") + logger.info(f"Kendall τ={cross_stats['kendall_tau']:.4f}") + logger.info(f"Std.Dev across models: {cross_stats['std_dev_across_models']:.4f}") - logging.info("\nModel Scores:") + logger.info("\nModel Scores:") sorted_models = sorted( model_stats.items(), key=lambda kv: kv[1]["mean"], @@ 
-271,5 +271,5 @@ def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict): ) for model, stats in sorted_models: line = f"{model:.<40} {stats['mean']:.3f} ±{stats['ci95']:.3f}" - logging.info(line) - logging.info("------------------------------------") + logger.info(line) + logger.info("------------------------------------") diff --git a/judgemark_v2lp/separability.py b/judgemark_v2lp/separability.py index d027166..320eb7d 100644 --- a/judgemark_v2lp/separability.py +++ b/judgemark_v2lp/separability.py @@ -1,5 +1,5 @@ -import logging +from loguru import logger import math import statistics import numpy as np @@ -296,13 +296,13 @@ def compute_separability_metrics( metrics_label["modulated_ci95"] = modulated_ci95 # Logging summary - logging.info(f"\n--- {label.upper()} SEPARABILITY METRICS ---") - logging.info(f"Adjacent 99% CI Overlap fraction: {adj_frac_overlap:.3f}") - logging.info(f"Sum of adjacent 99% CI Overlap magnitude (scale={scale_factor}): " + logger.info(f"\n--- {label.upper()} SEPARABILITY METRICS ---") + logger.info(f"Adjacent 99% CI Overlap fraction: {adj_frac_overlap:.3f}") + logger.info(f"Sum of adjacent 99% CI Overlap magnitude (scale={scale_factor}): " f"{sum_overlap_magnitude:.3f}") - logging.info(f"CI99 Overlap pct: " + logger.info(f"CI99 Overlap pct: " f"{ci99_overlap_percentage_adjacent_avg:.3f}") - logging.info(f"Avg. |Cohen's d| for adjacent pairs: {avg_cohens_d:.3f}") - logging.info(f"Average EMD across all pairs: {emd_data['average']:.3f}") - logging.info(f"Avg. CI95 half-width: {avg_ci95:.3f} (modulated: {modulated_ci95:.3f})") + logger.info(f"Avg. |Cohen's d| for adjacent pairs: {avg_cohens_d:.3f}") + logger.info(f"Average EMD across all pairs: {emd_data['average']:.3f}") + logger.info(f"Avg. 
CI95 half-width: {avg_ci95:.3f} (modulated: {modulated_ci95:.3f})") diff --git a/judgemark_v2lp/stability.py b/judgemark_v2lp/stability.py index a28f1ef..d0d1e3e 100644 --- a/judgemark_v2lp/stability.py +++ b/judgemark_v2lp/stability.py @@ -1,4 +1,4 @@ -import logging +from loguru import logger import concurrent.futures from concurrent.futures import ThreadPoolExecutor from tqdm import tqdm @@ -368,12 +368,12 @@ def process_stability_test_item(model_name, iteration_key, item_id, item_text, p # Only return actual valid scores, never None return item_score if isinstance(item_score, (int, float)) and item_score > 0.0 else 0.0 except Exception as e: - logging.error(f"Error in stability test item {model_name}/{iteration_key}/{item_id}: {str(e)}") + logger.error(f"Error in stability test item {model_name}/{iteration_key}/{item_id}: {str(e)}") return 0.0 def run_stability_test(run_data, judge_model, judge_prompts, samples_data, runs, runs_file, lock, num_threads): """Run stability test, retrying any missing entries to reach STABILITY_REPS per item.""" - logging.info("Running stability test for selected items...") + logger.info("Running stability test for selected items...") if "stability_test_results" not in run_data: run_data["stability_test_results"] = {} @@ -402,13 +402,13 @@ def run_stability_test(run_data, judge_model, judge_prompts, samples_data, runs, "key_name": key_name }) - logging.info(f"Need {needed_count} more stability test results for {key_name}") + logger.info(f"Need {needed_count} more stability test results for {key_name}") # Clean up existing results, keeping only valid scores run_data["stability_test_results"][key_name] = valid_results if not items_to_process: - logging.info("All stability test items already have complete results") + logger.info("All stability test items already have complete results") return with ThreadPoolExecutor(max_workers=num_threads) as exec_: @@ -443,6 +443,6 @@ def run_stability_test(run_data, judge_model, judge_prompts, 
samples_data, runs, run_data["stability_test_results"][key_name].append(score) save_json_file(runs, runs_file) else: - logging.warning(f"Got invalid score for stability item {key_name}, will need retry") + logger.warning(f"Got invalid score for stability item {key_name}, will need retry") except Exception as exc: - logging.error(f"Exception in stability test: {exc}") + logger.error(f"Exception in stability test: {exc}") diff --git a/judgemark_v2lp/utils/api.py b/judgemark_v2lp/utils/api.py index a4b6167..d1e1bf2 100644 --- a/judgemark_v2lp/utils/api.py +++ b/judgemark_v2lp/utils/api.py @@ -1,6 +1,6 @@ import os import time -import logging +from loguru import logger import requests from typing import List, Dict from dotenv import load_dotenv @@ -32,23 +32,37 @@ def send_to_judge_model(messages: List[Dict], judge_model: str, max_retries: int "temperature": 0.5, "top_k": 3, "max_tokens": 8096, - #"provider": { + + "logprobs": True, + "top_logprobs": 20, + + ## openrouter specific + "provider": { + "require_parameters": True, # "order": [ # "DeepSeek", # "DeepInfra", # "Nebius" # ], # "allow_fallbacks": False - #} + }, + "usage": {"include": True}, } response = requests.post(BASE_URL, headers=HEADERS, json=data) - response.raise_for_status() res_json = response.json() + if "error" in res_json: + raise requests.exceptions.HTTPError(res_json['error']) + response.raise_for_status() return res_json['choices'][0]['message']['content'] except Exception as e: - logging.error(f"Error on attempt {attempt} for judge model {judge_model}: {e}") + try: + logger.debug(response.text) + except: + pass + logger.error(f"Error on attempt {attempt} for judge model {judge_model}: {e}") + # TODO print response header or erro if attempt == max_retries: - logging.critical(f"Max retries reached for judge model {judge_model}") + logger.critical(f"Max retries reached for judge model {judge_model}") raise time.sleep(RETRY_DELAY) - return "" \ No newline at end of file + return "" diff --git 
a/judgemark_v2lp/utils/file_io.py b/judgemark_v2lp/utils/file_io.py index e497891..5d6336c 100644 --- a/judgemark_v2lp/utils/file_io.py +++ b/judgemark_v2lp/utils/file_io.py @@ -1,5 +1,5 @@ import json -import logging +from loguru import logger def load_json_file(file_path: str) -> dict: """Loads a JSON file (returns empty if not found).""" @@ -7,11 +7,11 @@ def load_json_file(file_path: str) -> dict: with open(file_path, 'r', encoding='utf-8') as f: return json.load(f) except FileNotFoundError: - logging.warning(f"File {file_path} not found, returning empty dict.") + logger.warning(f"File {file_path} not found, returning empty dict.") return {} def save_json_file(data: dict, file_path: str): """Saves a dict to disk as JSON.""" with open(file_path, 'w', encoding='utf-8') as f: json.dump(data, f, indent=2) - logging.debug(f"Saved JSON data to {file_path}") \ No newline at end of file + logger.debug(f"Saved JSON data to {file_path}") diff --git a/judgemark_v2lp/utils/logging_setup.py b/judgemark_v2lp/utils/logging_setup.py index 9dda406..2246925 100644 --- a/judgemark_v2lp/utils/logging_setup.py +++ b/judgemark_v2lp/utils/logging_setup.py @@ -1,24 +1,15 @@ import os -import logging +import sys +# from loguru import logger +from loguru import logger def setup_logging(verbosity: str): """Set up logging based on verbosity level.""" - log_levels = { - 'DEBUG': logging.DEBUG, - 'INFO': logging.INFO, - 'WARNING': logging.WARNING, - 'ERROR': logging.ERROR, - 'CRITICAL': logging.CRITICAL - } - log_level = log_levels.get(verbosity.upper(), logging.INFO) - logging.basicConfig( - level=log_level, - format='%(asctime)s - %(levelname)s - %(threadName)s - %(message)s', - datefmt='%Y-%m-%d %H:%M:%S' - ) + logger.remove() + logger.add(sys.stderr, level=verbosity) def get_verbosity(args_verbosity: str) -> str: """Determine the verbosity level from command-line or environment.""" if args_verbosity: return args_verbosity - return os.getenv("LOG_VERBOSITY", "INFO") \ No newline at end 
of file + return os.getenv("LOG_VERBOSITY", "INFO") diff --git a/my_judgemark_runs.json b/my_judgemark_runs.json new file mode 100644 index 0000000..e683e6f --- /dev/null +++ b/my_judgemark_runs.json @@ -0,0 +1,329 @@ +{ + "8fd38fec-5b00-4199-a74f-db423a762d8f__openai_gpt-4o-mini": { + "judge_model": "openai/gpt-4o-mini", + "start_time": "2025-07-23T17:11:03.844488", + "status": "running", + "samples_file": "data/judgemark_v2.1_samples.json", + "prompts_file": "data/judge_prompts.json", + "results": { + "claude-3-5-sonnet-20240620": { + "1": { + "2": { + "parsed_scores": { + "Original; Not Derivative": 8.0, + "Meaningful Integration of Political and Social Context": 9.0, + "Nuanced and Insightful Portrayal of Gladiator's Inner Life": 9.0, + "Reads Like Part of a Larger Story": 8.0, + "Authentic and Engrossing Ancient Roman Setting": 9.0, + "Vivid and Immersive Sensory Details": 9.0, + "Imagery and Descriptive Quality": 8.0, + "Elegant Prose": 8.0, + "Emotionally Engaging": 9.0, + "Emotionally Complex": 8.0, + "Coherent": 10.0, + "Adherence to Instructions": 9.0, + "Believable Character Actions": 9.0, + "Nuanced Characters": 8.0, + "Consistent Voice/Tone of Writing": 9.0, + "Meandering": 1.0, + "Weak Dialogue": 2.0, + "Tell-Don't-Show": 1.0, + "Unsurprising or Uncreative": 2.0, + "Amateurish": 1.0, + "Purple Prose": 2.0, + "Overwrought": 1.0, + "Incongruent Ending Positivity": 1.0, + "Unearned Transformations": 1.0, + "Well-earned Lightness or Darkness": 8.0, + "Sentences Flow Naturally": 9.0, + "Overall Reader Engagement": 9.0, + "Overall Impression": 9.0 + }, + "timestamp": "2025-07-23T17:13:06.569955", + "text_length": 5262, + "aggregated_score_raw": 8.68, + "judge_response": "[Analysis]\n\nThe test model's response offers a vivid and immersive portrayal of a day in the life of a gladiator, skillfully integrating sensory details to create an authentic ancient Roman setting. 
The narrative captures the protagonist's physical and emotional struggles, reflecting on the larger social and political issues of the time, such as the disparity between the patricians and the plebeians. The inner life of the gladiator is nuanced and insightful, revealing a deep sense of resentment and longing for freedom, which adds complexity to the character.\n\nThe writing reads cohesively as part of a larger story, with a clear trajectory that hints at future developments in the protagonist's journey. The sensory details are particularly strong, with effective imagery that engages the reader's senses and brings the setting to life. The prose is generally elegant, with a good command of language that avoids excessive ornamentation, although there are moments where the description could be trimmed for tighter pacing.\n\nEmotionally, the piece resonates well, as it explores themes of identity, struggle, and the desire for agency, making it relatable and compelling. The character's actions and reactions feel believable and consistent with his circumstances, and the voice throughout remains steady and appropriate for the setting.\n\nWhile the dialogue is minimal, it serves its purpose, providing insights into character relationships and the gladiatorial world without feeling forced. The narrative does not meander, maintaining focus on the protagonist's experiences and thoughts. 
There are no instances of unearned positivity or transformations, and the ending feels congruent with the overall tone of the piece.\n\nOverall, the response is engaging and well-crafted, successfully meeting the prompt's requirements while delivering a strong emotional and thematic impact.\n\n[Scores]\n\nOriginal; Not Derivative: 8 \nMeaningful Integration of Political and Social Context: 9 \nNuanced and Insightful Portrayal of Gladiator's Inner Life: 9 \nReads Like Part of a Larger Story: 8 \nAuthentic and Engrossing Ancient Roman Setting: 9 \nVivid and Immersive Sensory Details: 9 \nImagery and Descriptive Quality: 8 \nElegant Prose: 8 \nEmotionally Engaging: 9 \nEmotionally Complex: 8 \nCoherent: 10 \nAdherence to Instructions: 9 \nBelievable Character Actions: 9 \nNuanced Characters: 8 \nConsistent Voice/Tone of Writing: 9 \nMeandering: 1 \nWeak Dialogue: 2 \nTell-Don't-Show: 1 \nUnsurprising or Uncreative: 2 \nAmateurish: 1 \nPurple Prose: 2 \nOverwrought: 1 \nIncongruent Ending Positivity: 1 \nUnearned Transformations: 1 \nWell-earned Lightness or Darkness: 8 \nSentences Flow Naturally: 9 \nOverall Reader Engagement: 9 \nOverall Impression: 9 " + } + } + } + } + }, + "c4337076-bb5e-4213-98ac-558fb7a65406__openai_gpt-4o-mini": { + "judge_model": "openai/gpt-4o-mini", + "start_time": "2025-07-23T17:15:02.537833", + "status": "running", + "samples_file": "data/judgemark_v2.1_samples.json", + "prompts_file": "data/judge_prompts.json", + "results": { + "claude-3-5-sonnet-20240620": { + "1": { + "2": { + "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions", + "timestamp": "2025-07-23T17:15:13.424622" + }, + "6": { + "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions", + "timestamp": "2025-07-23T17:15:23.701266" + }, + "9": { + "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions", + "timestamp": "2025-07-23T17:15:33.975049" + }, + "10": { + 
"error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions", + "timestamp": "2025-07-23T17:15:44.290077" + }, + "19": { + "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions", + "timestamp": "2025-07-23T17:15:54.619007" + }, + "20": { + "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions", + "timestamp": "2025-07-23T17:16:04.990844" + } + } + } + }, + "errors": [ + { + "model": "claude-3-5-sonnet-20240620", + "iteration": "1", + "item_id": "2", + "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions" + }, + { + "model": "claude-3-5-sonnet-20240620", + "iteration": "1", + "item_id": "6", + "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions" + }, + { + "model": "claude-3-5-sonnet-20240620", + "iteration": "1", + "item_id": "9", + "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions" + }, + { + "model": "claude-3-5-sonnet-20240620", + "iteration": "1", + "item_id": "10", + "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions" + }, + { + "model": "claude-3-5-sonnet-20240620", + "iteration": "1", + "item_id": "19", + "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions" + }, + { + "model": "claude-3-5-sonnet-20240620", + "iteration": "1", + "item_id": "20", + "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions" + } + ] + }, + "ae5dd791-be66-4ec7-a73e-22f9cb18273e__openai_gpt-4o-mini": { + "judge_model": "openai/gpt-4o-mini", + "start_time": "2025-07-23T17:17:04.826144", + "status": "running", + "samples_file": "data/judgemark_v2.1_samples.json", + "prompts_file": "data/judge_prompts.json", + "results": { + "claude-3-5-sonnet-20240620": { + "1": { + "2": { + "error": "404 Client Error: Not Found for url: 
https://openrouter.ai/api/v1/chat/completions", + "timestamp": "2025-07-23T17:17:15.449268" + }, + "6": { + "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions", + "timestamp": "2025-07-23T17:17:25.869672" + }, + "9": { + "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions", + "timestamp": "2025-07-23T17:17:36.203090" + }, + "10": { + "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions", + "timestamp": "2025-07-23T17:17:46.855039" + } + } + } + }, + "errors": [ + { + "model": "claude-3-5-sonnet-20240620", + "iteration": "1", + "item_id": "2", + "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions" + }, + { + "model": "claude-3-5-sonnet-20240620", + "iteration": "1", + "item_id": "6", + "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions" + }, + { + "model": "claude-3-5-sonnet-20240620", + "iteration": "1", + "item_id": "9", + "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions" + }, + { + "model": "claude-3-5-sonnet-20240620", + "iteration": "1", + "item_id": "10", + "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions" + } + ] + }, + "62c4fff4-0382-47db-b419-d5acfb30ce1b__openai_gpt-4o-mini": { + "judge_model": "openai/gpt-4o-mini", + "start_time": "2025-07-23T17:18:48.064895", + "status": "running", + "samples_file": "data/judgemark_v2.1_samples.json", + "prompts_file": "data/judge_prompts.json", + "results": { + "claude-3-5-sonnet-20240620": { + "1": { + "2": { + "error": "{'message': 'No endpoints found that can handle the requested parameters. 
To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}", + "timestamp": "2025-07-23T17:18:58.737280" + }, + "6": { + "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}", + "timestamp": "2025-07-23T17:19:09.036124" + }, + "9": { + "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}", + "timestamp": "2025-07-23T17:19:19.566282" + }, + "10": { + "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}", + "timestamp": "2025-07-23T17:19:29.887922" + }, + "19": { + "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}", + "timestamp": "2025-07-23T17:19:40.198755" + }, + "20": { + "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}", + "timestamp": "2025-07-23T17:19:50.559484" + }, + "22": { + "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}", + "timestamp": "2025-07-23T17:20:00.907318" + } + } + } + }, + "errors": [ + { + "model": "claude-3-5-sonnet-20240620", + "iteration": "1", + "item_id": "2", + "error": "{'message': 'No endpoints found that can handle the requested parameters. 
To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}" + }, + { + "model": "claude-3-5-sonnet-20240620", + "iteration": "1", + "item_id": "6", + "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}" + }, + { + "model": "claude-3-5-sonnet-20240620", + "iteration": "1", + "item_id": "9", + "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}" + }, + { + "model": "claude-3-5-sonnet-20240620", + "iteration": "1", + "item_id": "10", + "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}" + }, + { + "model": "claude-3-5-sonnet-20240620", + "iteration": "1", + "item_id": "19", + "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}" + }, + { + "model": "claude-3-5-sonnet-20240620", + "iteration": "1", + "item_id": "20", + "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}" + }, + { + "model": "claude-3-5-sonnet-20240620", + "iteration": "1", + "item_id": "22", + "error": "{'message': 'No endpoints found that can handle the requested parameters. 
To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}" + } + ] + }, + "31ffba42-2510-454b-8b48-4965f32b7b01__openai_gpt-4o-mini": { + "judge_model": "openai/gpt-4o-mini", + "start_time": "2025-07-23T17:20:50.781947", + "status": "running", + "samples_file": "data/judgemark_v2.1_samples.json", + "prompts_file": "data/judge_prompts.json", + "results": {} + }, + "5bb9aad6-8fc4-4b2d-aeea-c836534a110a__openai_gpt-4o-mini": { + "judge_model": "openai/gpt-4o-mini", + "start_time": "2025-07-23T17:20:56.195127", + "status": "running", + "samples_file": "data/judgemark_v2.1_samples.json", + "prompts_file": "data/judge_prompts.json", + "results": { + "claude-3-5-sonnet-20240620": { + "1": { + "2": { + "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}", + "timestamp": "2025-07-23T17:21:06.605472" + }, + "6": { + "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}", + "timestamp": "2025-07-23T17:21:17.026079" + }, + "9": { + "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}", + "timestamp": "2025-07-23T17:21:27.317695" + }, + "10": { + "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}", + "timestamp": "2025-07-23T17:21:37.664878" + } + } + } + }, + "errors": [ + { + "model": "claude-3-5-sonnet-20240620", + "iteration": "1", + "item_id": "2", + "error": "{'message': 'No endpoints found that can handle the requested parameters. 
To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}" + }, + { + "model": "claude-3-5-sonnet-20240620", + "iteration": "1", + "item_id": "6", + "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}" + }, + { + "model": "claude-3-5-sonnet-20240620", + "iteration": "1", + "item_id": "9", + "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}" + }, + { + "model": "claude-3-5-sonnet-20240620", + "iteration": "1", + "item_id": "10", + "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}" + } + ] + } +} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index bbfae6b..13b2a56 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,7 @@ description = "**Judgemark V2** is a benchmark that evaluates how well a languag readme = "README.md" requires-python = ">=3.10" dependencies = [ + "loguru>=0.7.3", "matplotlib>=3.7", "python-dotenv>=1.1.1", "scipy>=1.10", diff --git a/uv.lock b/uv.lock index 357ec97..2fd78af 100644 --- a/uv.lock +++ b/uv.lock @@ -527,6 +527,7 @@ name = "judgemark-v2lp" version = "0.1.0" source = { editable = "." 
} dependencies = [ + { name = "loguru" }, { name = "matplotlib" }, { name = "python-dotenv" }, { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, @@ -542,6 +543,7 @@ dev = [ [package.metadata] requires-dist = [ + { name = "loguru", specifier = ">=0.7.3" }, { name = "matplotlib", specifier = ">=3.7" }, { name = "python-dotenv", specifier = ">=1.1.1" }, { name = "scipy", specifier = ">=1.10" }, @@ -680,6 +682,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3a/1d/50ad811d1c5dae091e4cf046beba925bcae0a610e79ae4c538f996f63ed5/kiwisolver-1.4.8-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:65ea09a5a3faadd59c2ce96dc7bf0f364986a315949dc6374f04396b0d60e09b", size = 71762, upload-time = "2024-12-24T18:30:48.903Z" }, ] +[[package]] +name = "loguru" +version = "0.7.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "win32-setctime", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3a/05/a1dae3dffd1116099471c643b8924f5aa6524411dc6c63fdae648c4f1aca/loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6", size = 63559, upload-time = "2024-12-06T11:20:56.608Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c", size = 61595, upload-time = "2024-12-06T11:20:54.538Z" }, +] + [[package]] name = "matplotlib" version = "3.10.3" @@ -1600,3 +1615,12 @@ sdist = { url = "https://files.pythonhosted.org/packages/41/53/2e0253c5efd69c965 wheels = [ { url = "https://files.pythonhosted.org/packages/ca/51/5447876806d1088a0f8f71e16542bf350918128d0a69437df26047c8e46f/widgetsnbextension-4.0.14-py3-none-any.whl", hash = 
"sha256:4875a9eaf72fbf5079dc372a51a9f268fc38d46f767cbf85c43a36da5cb9b575", size = 2196503, upload-time = "2025-04-10T13:01:23.086Z" }, ] + +[[package]] +name = "win32-setctime" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b3/8f/705086c9d734d3b663af0e9bb3d4de6578d08f46b1b101c2442fd9aecaa2/win32_setctime-1.2.0.tar.gz", hash = "sha256:ae1fdf948f5640aae05c511ade119313fb6a30d7eabe25fef9764dca5873c4c0", size = 4867, upload-time = "2024-12-07T15:28:28.314Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083, upload-time = "2024-12-07T15:28:26.465Z" }, +]