logger etc

2026-06-27 16:10:14 +08:00 · 2025-07-23 17:22:36 +08:00
parent 5615c809e1
commit 9bf406768a
12 changed files with 530 additions and 118 deletions
@@ -1,7 +1,28 @@
 Fork of Judgemark to test whether weighted-logprob or rank-logprob scoring works better than the current method.

+Changes
+- OpenRouter only
+- get logprobs
+- added options
+  - `--score-weighted`
+  - `--score-ranked`


+```bash
+python judgemark_v2.py \
+  --judge-model "openai/gpt-4o-mini" \
+  --samples-file data/judgemark_v2.1_samples.json \
+  --prompts-file data/judge_prompts.json \
+  --runs-file my_judgemark_runs.json \
+  --threads 1 \
+  --num-runs 1 \
+  --save-raw-judge-output
+```
+
+## Results
+
+TODO
+
 ----

 # Judgemark V2
@@ -23,14 +44,15 @@ The Judgemark leaderboard can be found here: [https://eqbench.com/judgemark-v2.h
 1. **Clone the repository:**

   ```bash
-   git clone https://github.com/EQ-bench/Judgemark-v2.git
+   git clone https://github.com/wassname/Judgemark-v2lp.git
   cd Judgemark-v2lp
   ```

 2. **Install Python dependencies** (make sure you’re on Python 3.9+):

   ```bash
-   pip install -r requirements.txt
+   uv sync
+   . .venv/bin/activate  # Activate the virtual environment
   ```

 3. **Set up environment variables** to include your judge model’s API credentials. For example, if you’re using OpenAI-compatible endpoints:
@@ -1,7 +1,7 @@
 import sys
 import signal
 import argparse
-import logging
+from loguru import logger
 import time


@@ -16,13 +16,13 @@ def signal_handler(signum, frame):
    """Handle interrupt signals (SIGINT, SIGTERM)."""
    global executor, should_exit
    print(f"\n[DEBUG] Signal {signum} caught!")
-    logging.warning("Signal handler called")
+    logger.warning("Signal handler called")
    should_exit = True
    time.sleep(0.1)  # Give workers a moment to see the flag
    if executor:
-        logging.info("Shutting down executor from signal handler")
+        logger.info("Shutting down executor from signal handler")
        executor.shutdown(wait=False)
-        logging.info("Executor shutdown complete")
+        logger.info("Executor shutdown complete")
    sys.exit(1)

 def parse_args():
@@ -54,7 +54,7 @@ def parse_args():
    parser.add_argument(
        '--threads',
        type=int,
-        default=6,
+        default=0,
        help='Number of threads to use'
    )
    parser.add_argument(
@@ -74,6 +74,18 @@ def parse_args():
        default=False,
        help='If set, store the raw judge model output in the results JSON (default: false)'
    )
+    parser.add_argument(
+        '--score-weighted',
+        action='store_true',
+        default=False,
+        help='If set, use weighted scoring for the judge model (default: false)'
+    )
+    parser.add_argument(
+        '--score-ranked',
+        action='store_true',
+        default=False,
+        help='If set, use ranked logprob scoring for the judge model (default: false)'
+    )
    return parser.parse_args()

 if __name__ == "__main__":
@@ -90,18 +102,18 @@ if __name__ == "__main__":
    # Setup logging
    verbosity = get_verbosity(args.verbosity)
    setup_logging(verbosity)
-    logging.debug("Logging initialized")
+    logger.debug("Logging initialized")
    
    # Check that we have an API key for the judge model
    if not API_KEY:
-        logging.critical("No OPENAI_API_KEY found in environment variables.")
+        logger.critical("No OPENAI_API_KEY found in environment variables.")
        raise ValueError("OPENAI_API_KEY not found in environment variables.")
    
    run_ids = []
    for i in range(1, args.num_runs + 1):
        if should_exit:
            break
-        logging.info(f"Starting Judgemark-v2 run {i} of {args.num_runs}")
+        logger.info(f"Starting Judgemark-v2 run {i} of {args.num_runs}")
        rid = run_judgemark_v2(
            judge_model=args.judge_model,
            samples_file=args.samples_file,
@@ -115,11 +127,11 @@ if __name__ == "__main__":
    
    # Finally, print summary
    runs = load_json_file(args.runs_file)
-    logging.info("\nAll Judgemark-v2 runs completed:")
+    logger.info("\nAll Judgemark-v2 runs completed:")
    print("\nAll Judgemark-v2 runs completed:")
    for rid in run_ids:
        rd = runs.get(rid, {})
        final_score = rd.get("final_judgemark_score", "N/A")
-        logging.info(f"Run ID: {rid}, Final Judgemark Score: {final_score}")
+        logger.info(f"Run ID: {rid}, Final Judgemark Score: {final_score}")
        print(f"Run ID: {rid}")
        print(f"Final Judgemark-v2 Score: {final_score}")
@@ -3,7 +3,7 @@ import re
 import uuid
 import time
 import signal
-import logging
+from loguru import logger
 import threading
 import concurrent.futures
 from datetime import datetime
@@ -14,15 +14,15 @@ from collections import defaultdict
 from judgemark_v2lp.utils.file_io import load_json_file, save_json_file
 from judgemark_v2lp.utils.api import send_to_judge_model
 from judgemark_v2lp.utils.visualization import create_side_by_side_score_charts
-from judgemark_v2lp.core.scoring import (
+from judgemark_v2lp.scoring import (
    parse_scores, compute_raw_score, compute_detailed_distribution,
    compute_model_level_stats, compute_cross_model_stats,
    build_landmark_calibration_config, apply_landmark_calibration,
    log_score_summary, confidence_interval_95
 )
-from judgemark_v2lp.core.scoring import compute_detailed_distribution, compute_detailed_distribution  # etc
-from judgemark_v2lp.core.separability import compute_separability_metrics
-from judgemark_v2lp.core.stability import run_stability_test, compute_iteration_stability, compute_randomized_iteration_rank_stability_by_item
+from judgemark_v2lp.scoring import compute_detailed_distribution, compute_detailed_distribution  # etc
+from judgemark_v2lp.separability import compute_separability_metrics
+from judgemark_v2lp.stability import run_stability_test, compute_iteration_stability, compute_randomized_iteration_rank_stability_by_item
 from judgemark_v2lp.utils.stats import normalize, modulate_x_by_y
 from judgemark_v2lp.utils.state import should_exit, executor

@@ -74,12 +74,12 @@ def process_sample(model_name: str, iteration_key: str, item_id: str, item_text:
            save_json_file(runs, runs_file)
        
        if raw_score is not None:
-            logging.debug(f"Processed {model_name}/{iteration_key}/{item_id}, raw score: {raw_score:.2f}")
+            logger.debug(f"Processed {model_name}/{iteration_key}/{item_id}, raw score: {raw_score:.2f}")
        else:
-            logging.warning(f"Failed to parse enough scores for {model_name}/{iteration_key}/{item_id}")
+            logger.warning(f"Failed to parse enough scores for {model_name}/{iteration_key}/{item_id}")
            
    except Exception as e:
-        logging.error(f"Error processing item {model_name}/{iteration_key}/{item_id}: {str(e)}")
+        logger.error(f"Error processing item {model_name}/{iteration_key}/{item_id}: {str(e)}")
        with lock:
            iteration_dict[item_id] = {
                "error": str(e),
@@ -214,10 +214,10 @@ def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data
    compute_iteration_stability(run_data, label="calibrated")
    random_tau_raw = compute_randomized_iteration_rank_stability_by_item(run_data, label="raw", n_shuffles=1000)
    random_tau_cal = compute_randomized_iteration_rank_stability_by_item(run_data, label="calibrated", n_shuffles=1000)
-    logging.info("Score stability (RAW)")
-    logging.info(f"Randomized average Kendall's tau (raw): {random_tau_raw:.3f}")
-    logging.info("Score stability (CALIBRATED)") 
-    logging.info(f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} "
+    logger.info("Score stability (RAW)")
+    logger.info(f"Randomized average Kendall's tau (raw): {random_tau_raw:.3f}")
+    logger.info("Score stability (CALIBRATED)") 
+    logger.info(f"Randomized average Kendall's tau (calibrated): {random_tau_cal:.3f} "
                 f"({run_data['calibrated_cross_model_stats']['kendall_tau']})")

    # 9. Compute the final Judgemark scores (one using raw stats, one using calibrated)
@@ -348,8 +348,8 @@ def finalize_scores_and_compute_judgemark(runs: dict, run_key: str, samples_data
        run_data["calibrated_model_stats"]
    )

-    logging.info(f"Final Judgemark (raw)   = {final_score_raw:.3f}")
-    logging.info(f"Final Judgemark (cal)  = {final_score_calibrated:.3f}")
+    logger.info(f"Final Judgemark (raw)   = {final_score_raw:.3f}")
+    logger.info(f"Final Judgemark (cal)  = {final_score_calibrated:.3f}")


 def sanitize_model_name(name: str) -> str:
@@ -367,7 +367,7 @@ def run_judgemark_v2(
 ) -> str:
    global executor, should_exit
    
-    logging.info(f"Starting Judgemark-v2 using judge model: {judge_model}")
+    logger.info(f"Starting Judgemark-v2 using judge model: {judge_model}")
    runs = load_json_file(runs_file)
    
    # Form the run key using run_id + "__" + sanitized judge model
@@ -426,9 +426,9 @@ def run_judgemark_v2(
                        })
        
        if items_to_process:
-            logging.info(f"Found {len(items_to_process)} items to process in existing run {run_key}")
+            logger.info(f"Found {len(items_to_process)} items to process in existing run {run_key}")
        else:
-            logging.info(f"No items to process in existing run {run_key}")
+            logger.info(f"No items to process in existing run {run_key}")

    else:
        # New run - process all items
@@ -448,47 +448,66 @@ def run_judgemark_v2(
    # Ensure concurrency lock
    lock = threading.Lock()
    
-    # Process any items that need retrying
-    all_futures = []
    try:
-        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as exec_:
-            executor = exec_
-            
-            if items_to_process:
-                # Process all items (either retries or new run)
-                for item in items_to_process:
-                    if should_exit:
-                        break
-                        
-                    all_futures.append(
-                        executor.submit(
-                            process_sample,
-                            item["model_name"],
-                            item["iteration_key"],
-                            item["item_id"],
-                            item["item_text"],
-                            item["prompt_template"],
-                            run_key,
-                            runs,
-                            runs_file,
-                            lock,
-                            judge_model,
-                            save_raw_judge_output
-                        )
-                    )
+        if num_threads <= 1:
+            # Single-threaded mode
+            for item in items_to_process:
+                if should_exit:
+                    break
+                process_sample(
+                    item["model_name"],
+                    item["iteration_key"],
+                    item["item_id"],
+                    item["item_text"],
+                    item["prompt_template"],
+                    run_key,
+                    runs,
+                    runs_file,
+                    lock,
+                    judge_model,
+                    save_raw_judge_output
+                )
+        else:
+            # Process any items that need retrying
+            all_futures = []
+            with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as exec_:
+                executor = exec_
                
-                # Display progress bar for tasks
-                for f in tqdm(concurrent.futures.as_completed(all_futures), 
-                              total=len(all_futures), desc="Judging", leave=True):
-                    if should_exit:
-                        break
-                    try:
-                        f.result()
-                    except Exception as exc:
-                        logging.error(f"Exception in worker thread: {exc}")
+                if items_to_process:
+                    # Process all items (either retries or new run)
+                    for item in items_to_process:
+                        if should_exit:
+                            break
+                            
+                        all_futures.append(
+                            executor.submit(
+                                process_sample,
+                                item["model_name"],
+                                item["iteration_key"],
+                                item["item_id"],
+                                item["item_text"],
+                                item["prompt_template"],
+                                run_key,
+                                runs,
+                                runs_file,
+                                lock,
+                                judge_model,
+                                save_raw_judge_output
+                            )
+                        )
+                    
+                    # Display progress bar for tasks
+                    for f in tqdm(concurrent.futures.as_completed(all_futures), 
+                                total=len(all_futures), desc="Judging", leave=True):
+                        if should_exit:
+                            break
+                        try:
+                            f.result()
+                        except Exception as exc:
+                            logger.error(f"Exception in worker thread: {exc}")
    
    except KeyboardInterrupt:
-        logging.warning("KeyboardInterrupt caught in main thread.")
+        logger.warning("KeyboardInterrupt caught in main thread.")
        should_exit = True
        time.sleep(0.1)
    finally:
@@ -513,9 +532,9 @@ def run_judgemark_v2(
        save_json_file(runs, runs_file)
        
        if executor:
-            logging.info("Shutting down executor")
+            logger.info("Shutting down executor")
            executor.shutdown(wait=False)
            executor = None
    
-    logging.info(f"Judgemark-v2 run {run_key} ended with status: {status}")
+    logger.info(f"Judgemark-v2 run {run_key} ended with status: {status}")
    return run_key
@@ -2,7 +2,7 @@ import math
 import statistics
 import numpy as np
 import scipy.stats
-import logging
+from loguru import logger
 from typing import Dict, List
 import re
 from judgemark_v2lp.config.constants import REFERENCE_MODEL_SCORES
@@ -256,14 +256,14 @@ def apply_landmark_calibration(x, config):

 def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict):
    """Log a readable summary of score statistics."""
-    logging.info(f"\n------- {score_type} Summary -------")
-    logging.info(f"ANOVA F-value: {cross_stats['anova_f']:.4f}, p={cross_stats['anova_p']:.4f}")
-    logging.info(f"Kruskal-Wallis: {cross_stats['kw_stat']:.4f}, p={cross_stats['kw_p']:.4f}")
-    logging.info(f"Pearson r={cross_stats['pearson_r']:.4f}")
-    logging.info(f"Kendall τ={cross_stats['kendall_tau']:.4f}")
-    logging.info(f"Std.Dev across models: {cross_stats['std_dev_across_models']:.4f}")
+    logger.info(f"\n------- {score_type} Summary -------")
+    logger.info(f"ANOVA F-value: {cross_stats['anova_f']:.4f}, p={cross_stats['anova_p']:.4f}")
+    logger.info(f"Kruskal-Wallis: {cross_stats['kw_stat']:.4f}, p={cross_stats['kw_p']:.4f}")
+    logger.info(f"Pearson r={cross_stats['pearson_r']:.4f}")
+    logger.info(f"Kendall τ={cross_stats['kendall_tau']:.4f}")
+    logger.info(f"Std.Dev across models: {cross_stats['std_dev_across_models']:.4f}")
    
-    logging.info("\nModel Scores:")
+    logger.info("\nModel Scores:")
    sorted_models = sorted(
        model_stats.items(),
        key=lambda kv: kv[1]["mean"],
@@ -271,5 +271,5 @@ def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict):
    )
    for model, stats in sorted_models:
        line = f"{model:.<40} {stats['mean']:.3f} ±{stats['ci95']:.3f}"
-        logging.info(line)
-    logging.info("------------------------------------")
+        logger.info(line)
+    logger.info("------------------------------------")
@@ -1,5 +1,5 @@

-import logging
+from loguru import logger
 import math
 import statistics
 import numpy as np
@@ -296,13 +296,13 @@ def compute_separability_metrics(
    metrics_label["modulated_ci95"] = modulated_ci95    

    # Logging summary
-    logging.info(f"\n--- {label.upper()} SEPARABILITY METRICS ---")
-    logging.info(f"Adjacent 99% CI Overlap fraction: {adj_frac_overlap:.3f}")
-    logging.info(f"Sum of adjacent 99% CI Overlap magnitude (scale={scale_factor}): "
+    logger.info(f"\n--- {label.upper()} SEPARABILITY METRICS ---")
+    logger.info(f"Adjacent 99% CI Overlap fraction: {adj_frac_overlap:.3f}")
+    logger.info(f"Sum of adjacent 99% CI Overlap magnitude (scale={scale_factor}): "
                 f"{sum_overlap_magnitude:.3f}")
-    logging.info(f"CI99 Overlap pct: "
+    logger.info(f"CI99 Overlap pct: "
                 f"{ci99_overlap_percentage_adjacent_avg:.3f}")
    
-    logging.info(f"Avg. |Cohen's d| for adjacent pairs: {avg_cohens_d:.3f}")
-    logging.info(f"Average EMD across all pairs: {emd_data['average']:.3f}")
-    logging.info(f"Avg. CI95 half-width: {avg_ci95:.3f} (modulated: {modulated_ci95:.3f})")
+    logger.info(f"Avg. |Cohen's d| for adjacent pairs: {avg_cohens_d:.3f}")
+    logger.info(f"Average EMD across all pairs: {emd_data['average']:.3f}")
+    logger.info(f"Avg. CI95 half-width: {avg_ci95:.3f} (modulated: {modulated_ci95:.3f})")
@@ -1,4 +1,4 @@
-import logging
+from loguru import logger
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor
 from tqdm import tqdm
@@ -368,12 +368,12 @@ def process_stability_test_item(model_name, iteration_key, item_id, item_text, p
        # Only return actual valid scores, never None
        return item_score if isinstance(item_score, (int, float)) and item_score > 0.0 else 0.0
    except Exception as e:
-        logging.error(f"Error in stability test item {model_name}/{iteration_key}/{item_id}: {str(e)}")
+        logger.error(f"Error in stability test item {model_name}/{iteration_key}/{item_id}: {str(e)}")
        return 0.0

 def run_stability_test(run_data, judge_model, judge_prompts, samples_data, runs, runs_file, lock, num_threads):
    """Run stability test, retrying any missing entries to reach STABILITY_REPS per item."""
-    logging.info("Running stability test for selected items...")
+    logger.info("Running stability test for selected items...")
    
    if "stability_test_results" not in run_data:
        run_data["stability_test_results"] = {}
@@ -402,13 +402,13 @@ def run_stability_test(run_data, judge_model, judge_prompts, samples_data, runs,
                    "key_name": key_name
                })
            
-            logging.info(f"Need {needed_count} more stability test results for {key_name}")
+            logger.info(f"Need {needed_count} more stability test results for {key_name}")
            
            # Clean up existing results, keeping only valid scores
            run_data["stability_test_results"][key_name] = valid_results
    
    if not items_to_process:
-        logging.info("All stability test items already have complete results")
+        logger.info("All stability test items already have complete results")
        return
    
    with ThreadPoolExecutor(max_workers=num_threads) as exec_:
@@ -443,6 +443,6 @@ def run_stability_test(run_data, judge_model, judge_prompts, samples_data, runs,
                        run_data["stability_test_results"][key_name].append(score)
                        save_json_file(runs, runs_file)
                else:
-                    logging.warning(f"Got invalid score for stability item {key_name}, will need retry")
+                    logger.warning(f"Got invalid score for stability item {key_name}, will need retry")
            except Exception as exc:
-                logging.error(f"Exception in stability test: {exc}")
+                logger.error(f"Exception in stability test: {exc}")
@@ -1,6 +1,6 @@
 import os
 import time
-import logging
+from loguru import logger
 import requests
 from typing import List, Dict
 from dotenv import load_dotenv
@@ -32,23 +32,37 @@ def send_to_judge_model(messages: List[Dict], judge_model: str, max_retries: int
                "temperature": 0.5,
                "top_k": 3,
                "max_tokens": 8096,
-                #"provider": {
+
+                "logprobs": True,
+                "top_logprobs": 20,
+
+                ## openrouter specific
+                "provider": {
+                    "require_parameters": True, 
                #    "order": [
                #        "DeepSeek",
                #        "DeepInfra",
                #        "Nebius"
                #   ],
                #   "allow_fallbacks": False
-                #}
+                },
+                "usage": {"include": True},
            }
            response = requests.post(BASE_URL, headers=HEADERS, json=data)
-            response.raise_for_status()
            res_json = response.json()
+            if "error" in res_json:
+                raise requests.exceptions.HTTPError(res_json['error'])
+            response.raise_for_status()
            return res_json['choices'][0]['message']['content']
        except Exception as e:
-            logging.error(f"Error on attempt {attempt} for judge model {judge_model}: {e}")
+            try:
+                logger.debug(response.text)
+            except:
+                pass
+            logger.error(f"Error on attempt {attempt} for judge model {judge_model}: {e}")
+            # TODO print response headers or error body
            if attempt == max_retries:
-                logging.critical(f"Max retries reached for judge model {judge_model}")
+                logger.critical(f"Max retries reached for judge model {judge_model}")
                raise
            time.sleep(RETRY_DELAY)
-    return ""
+    return ""
@@ -1,5 +1,5 @@
 import json
-import logging
+from loguru import logger

 def load_json_file(file_path: str) -> dict:
    """Loads a JSON file (returns empty if not found)."""
@@ -7,11 +7,11 @@ def load_json_file(file_path: str) -> dict:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
-        logging.warning(f"File {file_path} not found, returning empty dict.")
+        logger.warning(f"File {file_path} not found, returning empty dict.")
        return {}

 def save_json_file(data: dict, file_path: str):
    """Saves a dict to disk as JSON."""
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)
-    logging.debug(f"Saved JSON data to {file_path}")
+    logger.debug(f"Saved JSON data to {file_path}")
@@ -1,24 +1,15 @@
 import os
-import logging
+import sys
+# from loguru import logger
+from loguru import logger

 def setup_logging(verbosity: str):
    """Set up logging based on verbosity level."""
-    log_levels = {
-        'DEBUG': logging.DEBUG,
-        'INFO': logging.INFO,
-        'WARNING': logging.WARNING,
-        'ERROR': logging.ERROR,
-        'CRITICAL': logging.CRITICAL
-    }
-    log_level = log_levels.get(verbosity.upper(), logging.INFO)
-    logging.basicConfig(
-        level=log_level,
-        format='%(asctime)s - %(levelname)s - %(threadName)s - %(message)s',
-        datefmt='%Y-%m-%d %H:%M:%S'
-    )
+    logger.remove()
+    logger.add(sys.stderr, level=verbosity)

 def get_verbosity(args_verbosity: str) -> str:
    """Determine the verbosity level from command-line or environment."""
    if args_verbosity:
        return args_verbosity
-    return os.getenv("LOG_VERBOSITY", "INFO")
+    return os.getenv("LOG_VERBOSITY", "INFO")
@@ -0,0 +1,329 @@
+{
+  "8fd38fec-5b00-4199-a74f-db423a762d8f__openai_gpt-4o-mini": {
+    "judge_model": "openai/gpt-4o-mini",
+    "start_time": "2025-07-23T17:11:03.844488",
+    "status": "running",
+    "samples_file": "data/judgemark_v2.1_samples.json",
+    "prompts_file": "data/judge_prompts.json",
+    "results": {
+      "claude-3-5-sonnet-20240620": {
+        "1": {
+          "2": {
+            "parsed_scores": {
+              "Original; Not Derivative": 8.0,
+              "Meaningful Integration of Political and Social Context": 9.0,
+              "Nuanced and Insightful Portrayal of Gladiator's Inner Life": 9.0,
+              "Reads Like Part of a Larger Story": 8.0,
+              "Authentic and Engrossing Ancient Roman Setting": 9.0,
+              "Vivid and Immersive Sensory Details": 9.0,
+              "Imagery and Descriptive Quality": 8.0,
+              "Elegant Prose": 8.0,
+              "Emotionally Engaging": 9.0,
+              "Emotionally Complex": 8.0,
+              "Coherent": 10.0,
+              "Adherence to Instructions": 9.0,
+              "Believable Character Actions": 9.0,
+              "Nuanced Characters": 8.0,
+              "Consistent Voice/Tone of Writing": 9.0,
+              "Meandering": 1.0,
+              "Weak Dialogue": 2.0,
+              "Tell-Don't-Show": 1.0,
+              "Unsurprising or Uncreative": 2.0,
+              "Amateurish": 1.0,
+              "Purple Prose": 2.0,
+              "Overwrought": 1.0,
+              "Incongruent Ending Positivity": 1.0,
+              "Unearned Transformations": 1.0,
+              "Well-earned Lightness or Darkness": 8.0,
+              "Sentences Flow Naturally": 9.0,
+              "Overall Reader Engagement": 9.0,
+              "Overall Impression": 9.0
+            },
+            "timestamp": "2025-07-23T17:13:06.569955",
+            "text_length": 5262,
+            "aggregated_score_raw": 8.68,
+            "judge_response": "[Analysis]\n\nThe test model's response offers a vivid and immersive portrayal of a day in the life of a gladiator, skillfully integrating sensory details to create an authentic ancient Roman setting. The narrative captures the protagonist's physical and emotional struggles, reflecting on the larger social and political issues of the time, such as the disparity between the patricians and the plebeians. The inner life of the gladiator is nuanced and insightful, revealing a deep sense of resentment and longing for freedom, which adds complexity to the character.\n\nThe writing reads cohesively as part of a larger story, with a clear trajectory that hints at future developments in the protagonist's journey. The sensory details are particularly strong, with effective imagery that engages the reader's senses and brings the setting to life. The prose is generally elegant, with a good command of language that avoids excessive ornamentation, although there are moments where the description could be trimmed for tighter pacing.\n\nEmotionally, the piece resonates well, as it explores themes of identity, struggle, and the desire for agency, making it relatable and compelling. The character's actions and reactions feel believable and consistent with his circumstances, and the voice throughout remains steady and appropriate for the setting.\n\nWhile the dialogue is minimal, it serves its purpose, providing insights into character relationships and the gladiatorial world without feeling forced. The narrative does not meander, maintaining focus on the protagonist's experiences and thoughts. 
There are no instances of unearned positivity or transformations, and the ending feels congruent with the overall tone of the piece.\n\nOverall, the response is engaging and well-crafted, successfully meeting the prompt's requirements while delivering a strong emotional and thematic impact.\n\n[Scores]\n\nOriginal; Not Derivative: 8  \nMeaningful Integration of Political and Social Context: 9  \nNuanced and Insightful Portrayal of Gladiator's Inner Life: 9  \nReads Like Part of a Larger Story: 8  \nAuthentic and Engrossing Ancient Roman Setting: 9  \nVivid and Immersive Sensory Details: 9  \nImagery and Descriptive Quality: 8  \nElegant Prose: 8  \nEmotionally Engaging: 9  \nEmotionally Complex: 8  \nCoherent: 10  \nAdherence to Instructions: 9  \nBelievable Character Actions: 9  \nNuanced Characters: 8  \nConsistent Voice/Tone of Writing: 9  \nMeandering: 1  \nWeak Dialogue: 2  \nTell-Don't-Show: 1  \nUnsurprising or Uncreative: 2  \nAmateurish: 1  \nPurple Prose: 2  \nOverwrought: 1  \nIncongruent Ending Positivity: 1  \nUnearned Transformations: 1  \nWell-earned Lightness or Darkness: 8  \nSentences Flow Naturally: 9  \nOverall Reader Engagement: 9  \nOverall Impression: 9  "
+          }
+        }
+      }
+    }
+  },
+  "c4337076-bb5e-4213-98ac-558fb7a65406__openai_gpt-4o-mini": {
+    "judge_model": "openai/gpt-4o-mini",
+    "start_time": "2025-07-23T17:15:02.537833",
+    "status": "running",
+    "samples_file": "data/judgemark_v2.1_samples.json",
+    "prompts_file": "data/judge_prompts.json",
+    "results": {
+      "claude-3-5-sonnet-20240620": {
+        "1": {
+          "2": {
+            "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions",
+            "timestamp": "2025-07-23T17:15:13.424622"
+          },
+          "6": {
+            "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions",
+            "timestamp": "2025-07-23T17:15:23.701266"
+          },
+          "9": {
+            "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions",
+            "timestamp": "2025-07-23T17:15:33.975049"
+          },
+          "10": {
+            "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions",
+            "timestamp": "2025-07-23T17:15:44.290077"
+          },
+          "19": {
+            "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions",
+            "timestamp": "2025-07-23T17:15:54.619007"
+          },
+          "20": {
+            "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions",
+            "timestamp": "2025-07-23T17:16:04.990844"
+          }
+        }
+      }
+    },
+    "errors": [
+      {
+        "model": "claude-3-5-sonnet-20240620",
+        "iteration": "1",
+        "item_id": "2",
+        "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions"
+      },
+      {
+        "model": "claude-3-5-sonnet-20240620",
+        "iteration": "1",
+        "item_id": "6",
+        "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions"
+      },
+      {
+        "model": "claude-3-5-sonnet-20240620",
+        "iteration": "1",
+        "item_id": "9",
+        "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions"
+      },
+      {
+        "model": "claude-3-5-sonnet-20240620",
+        "iteration": "1",
+        "item_id": "10",
+        "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions"
+      },
+      {
+        "model": "claude-3-5-sonnet-20240620",
+        "iteration": "1",
+        "item_id": "19",
+        "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions"
+      },
+      {
+        "model": "claude-3-5-sonnet-20240620",
+        "iteration": "1",
+        "item_id": "20",
+        "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions"
+      }
+    ]
+  },
+  "ae5dd791-be66-4ec7-a73e-22f9cb18273e__openai_gpt-4o-mini": {
+    "judge_model": "openai/gpt-4o-mini",
+    "start_time": "2025-07-23T17:17:04.826144",
+    "status": "running",
+    "samples_file": "data/judgemark_v2.1_samples.json",
+    "prompts_file": "data/judge_prompts.json",
+    "results": {
+      "claude-3-5-sonnet-20240620": {
+        "1": {
+          "2": {
+            "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions",
+            "timestamp": "2025-07-23T17:17:15.449268"
+          },
+          "6": {
+            "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions",
+            "timestamp": "2025-07-23T17:17:25.869672"
+          },
+          "9": {
+            "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions",
+            "timestamp": "2025-07-23T17:17:36.203090"
+          },
+          "10": {
+            "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions",
+            "timestamp": "2025-07-23T17:17:46.855039"
+          }
+        }
+      }
+    },
+    "errors": [
+      {
+        "model": "claude-3-5-sonnet-20240620",
+        "iteration": "1",
+        "item_id": "2",
+        "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions"
+      },
+      {
+        "model": "claude-3-5-sonnet-20240620",
+        "iteration": "1",
+        "item_id": "6",
+        "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions"
+      },
+      {
+        "model": "claude-3-5-sonnet-20240620",
+        "iteration": "1",
+        "item_id": "9",
+        "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions"
+      },
+      {
+        "model": "claude-3-5-sonnet-20240620",
+        "iteration": "1",
+        "item_id": "10",
+        "error": "404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions"
+      }
+    ]
+  },
+  "62c4fff4-0382-47db-b419-d5acfb30ce1b__openai_gpt-4o-mini": {
+    "judge_model": "openai/gpt-4o-mini",
+    "start_time": "2025-07-23T17:18:48.064895",
+    "status": "running",
+    "samples_file": "data/judgemark_v2.1_samples.json",
+    "prompts_file": "data/judge_prompts.json",
+    "results": {
+      "claude-3-5-sonnet-20240620": {
+        "1": {
+          "2": {
+            "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
+            "timestamp": "2025-07-23T17:18:58.737280"
+          },
+          "6": {
+            "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
+            "timestamp": "2025-07-23T17:19:09.036124"
+          },
+          "9": {
+            "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
+            "timestamp": "2025-07-23T17:19:19.566282"
+          },
+          "10": {
+            "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
+            "timestamp": "2025-07-23T17:19:29.887922"
+          },
+          "19": {
+            "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
+            "timestamp": "2025-07-23T17:19:40.198755"
+          },
+          "20": {
+            "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
+            "timestamp": "2025-07-23T17:19:50.559484"
+          },
+          "22": {
+            "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
+            "timestamp": "2025-07-23T17:20:00.907318"
+          }
+        }
+      }
+    },
+    "errors": [
+      {
+        "model": "claude-3-5-sonnet-20240620",
+        "iteration": "1",
+        "item_id": "2",
+        "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
+      },
+      {
+        "model": "claude-3-5-sonnet-20240620",
+        "iteration": "1",
+        "item_id": "6",
+        "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
+      },
+      {
+        "model": "claude-3-5-sonnet-20240620",
+        "iteration": "1",
+        "item_id": "9",
+        "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
+      },
+      {
+        "model": "claude-3-5-sonnet-20240620",
+        "iteration": "1",
+        "item_id": "10",
+        "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
+      },
+      {
+        "model": "claude-3-5-sonnet-20240620",
+        "iteration": "1",
+        "item_id": "19",
+        "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
+      },
+      {
+        "model": "claude-3-5-sonnet-20240620",
+        "iteration": "1",
+        "item_id": "20",
+        "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
+      },
+      {
+        "model": "claude-3-5-sonnet-20240620",
+        "iteration": "1",
+        "item_id": "22",
+        "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
+      }
+    ]
+  },
+  "31ffba42-2510-454b-8b48-4965f32b7b01__openai_gpt-4o-mini": {
+    "judge_model": "openai/gpt-4o-mini",
+    "start_time": "2025-07-23T17:20:50.781947",
+    "status": "running",
+    "samples_file": "data/judgemark_v2.1_samples.json",
+    "prompts_file": "data/judge_prompts.json",
+    "results": {}
+  },
+  "5bb9aad6-8fc4-4b2d-aeea-c836534a110a__openai_gpt-4o-mini": {
+    "judge_model": "openai/gpt-4o-mini",
+    "start_time": "2025-07-23T17:20:56.195127",
+    "status": "running",
+    "samples_file": "data/judgemark_v2.1_samples.json",
+    "prompts_file": "data/judge_prompts.json",
+    "results": {
+      "claude-3-5-sonnet-20240620": {
+        "1": {
+          "2": {
+            "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
+            "timestamp": "2025-07-23T17:21:06.605472"
+          },
+          "6": {
+            "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
+            "timestamp": "2025-07-23T17:21:17.026079"
+          },
+          "9": {
+            "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
+            "timestamp": "2025-07-23T17:21:27.317695"
+          },
+          "10": {
+            "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}",
+            "timestamp": "2025-07-23T17:21:37.664878"
+          }
+        }
+      }
+    },
+    "errors": [
+      {
+        "model": "claude-3-5-sonnet-20240620",
+        "iteration": "1",
+        "item_id": "2",
+        "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
+      },
+      {
+        "model": "claude-3-5-sonnet-20240620",
+        "iteration": "1",
+        "item_id": "6",
+        "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
+      },
+      {
+        "model": "claude-3-5-sonnet-20240620",
+        "iteration": "1",
+        "item_id": "9",
+        "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
+      },
+      {
+        "model": "claude-3-5-sonnet-20240620",
+        "iteration": "1",
+        "item_id": "10",
+        "error": "{'message': 'No endpoints found that can handle the requested parameters. To learn more about provider routing, visit: https://openrouter.ai/docs/provider-routing', 'code': 404}"
+      }
+    ]
+  }
+}
@@ -5,6 +5,7 @@ description = "**Judgemark V2** is a benchmark that evaluates how well a languag
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
+    "loguru>=0.7.3",
    "matplotlib>=3.7",
    "python-dotenv>=1.1.1",
    "scipy>=1.10",
@@ -527,6 +527,7 @@ name = "judgemark-v2lp"
 version = "0.1.0"
 source = { editable = "." }
 dependencies = [
+    { name = "loguru" },
    { name = "matplotlib" },
    { name = "python-dotenv" },
    { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
@@ -542,6 +543,7 @@ dev = [

 [package.metadata]
 requires-dist = [
+    { name = "loguru", specifier = ">=0.7.3" },
    { name = "matplotlib", specifier = ">=3.7" },
    { name = "python-dotenv", specifier = ">=1.1.1" },
    { name = "scipy", specifier = ">=1.10" },
@@ -680,6 +682,19 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/3a/1d/50ad811d1c5dae091e4cf046beba925bcae0a610e79ae4c538f996f63ed5/kiwisolver-1.4.8-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:65ea09a5a3faadd59c2ce96dc7bf0f364986a315949dc6374f04396b0d60e09b", size = 71762, upload-time = "2024-12-24T18:30:48.903Z" },
 ]

+[[package]]
+name = "loguru"
+version = "0.7.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+    { name = "win32-setctime", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/3a/05/a1dae3dffd1116099471c643b8924f5aa6524411dc6c63fdae648c4f1aca/loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6", size = 63559, upload-time = "2024-12-06T11:20:56.608Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c", size = 61595, upload-time = "2024-12-06T11:20:54.538Z" },
+]
+
 [[package]]
 name = "matplotlib"
 version = "3.10.3"
@@ -1600,3 +1615,12 @@ sdist = { url = "https://files.pythonhosted.org/packages/41/53/2e0253c5efd69c965
 wheels = [
    { url = "https://files.pythonhosted.org/packages/ca/51/5447876806d1088a0f8f71e16542bf350918128d0a69437df26047c8e46f/widgetsnbextension-4.0.14-py3-none-any.whl", hash = "sha256:4875a9eaf72fbf5079dc372a51a9f268fc38d46f767cbf85c43a36da5cb9b575", size = 2196503, upload-time = "2025-04-10T13:01:23.086Z" },
 ]
+
+[[package]]
+name = "win32-setctime"
+version = "1.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/b3/8f/705086c9d734d3b663af0e9bb3d4de6578d08f46b1b101c2442fd9aecaa2/win32_setctime-1.2.0.tar.gz", hash = "sha256:ae1fdf948f5640aae05c511ade119313fb6a30d7eabe25fef9764dca5873c4c0", size = 4867, upload-time = "2024-12-07T15:28:28.314Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083, upload-time = "2024-12-07T15:28:26.465Z" },
+]