refactor to uv package

2026-06-26 16:00:34 +08:00 · 2025-07-23 16:58:35 +08:00
parent 372bbaa072
commit 5615c809e1
20 changed files with 1670 additions and 56 deletions
@@ -5,4 +5,6 @@ judgemark_v2_runs.json
 judgemark_v2.1_runs.json
 ministral-repetition-results.json
 *.pyc
-dev
+dev
+*.egg-info
+__pycache__
@@ -0,0 +1 @@
+3.10
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2025 Samuel Paech
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
@@ -1,3 +1,9 @@
+Fork of Judgemark to see whether using weighted logprob or ranklogprob works better than the current method.
+
+
+
+----
+
 # Judgemark V2

 **Judgemark V2** is a benchmark that evaluates how well a language model can judge creative writing. Instead of relying on simple pairwise preferences, Judgemark V2 prompts the judge model to assign numeric scores for multiple literary criteria (e.g., “Nuanced Characters,” “Overwrought,” “Emotionally Engaging”). It then aggregates those scores, measures how consistent and discriminative they are, and derives a final numeric rating of the judge model’s performance.
@@ -117,4 +123,4 @@ This project is licensed under an [MIT License](LICENSE). See the `LICENSE` file

 ---

-**Happy Judging!** If you have any questions, reach out via [GitHub Issues](https://github.com/EQ-bench/judgemark-v2/issues) or contact the maintainers.
+**Happy Judging!** If you have any questions, reach out via [GitHub Issues](https://github.com/EQ-bench/judgemark-v2/issues) or contact the maintainers.
@@ -5,11 +5,11 @@ import logging
 import time


-from utils.logging_setup import setup_logging, get_verbosity
-from utils.file_io import load_json_file
-from core.benchmark import run_judgemark_v2
-from utils.api import API_KEY
-from utils.state import should_exit, executor
+from judgemark_v2lp.utils.logging_setup import setup_logging, get_verbosity
+from judgemark_v2lp.utils.file_io import load_json_file
+from judgemark_v2lp.benchmark import run_judgemark_v2
+from judgemark_v2lp.utils.api import API_KEY
+from judgemark_v2lp.utils.state import should_exit, executor


 def signal_handler(signum, frame):
@@ -122,4 +122,4 @@ if __name__ == "__main__":
        final_score = rd.get("final_judgemark_score", "N/A")
        logging.info(f"Run ID: {rid}, Final Judgemark Score: {final_score}")
        print(f"Run ID: {rid}")
-        print(f"Final Judgemark-v2 Score: {final_score}")
+        print(f"Final Judgemark-v2 Score: {final_score}")
@@ -11,21 +11,20 @@ from typing import Dict, List
 from tqdm import tqdm
 from collections import defaultdict

-from utils.file_io import load_json_file, save_json_file
-from utils.api import send_to_judge_model
-from utils.visualization import create_side_by_side_score_charts
-import statistics
-from core.scoring import (
+from judgemark_v2lp.utils.file_io import load_json_file, save_json_file
+from judgemark_v2lp.utils.api import send_to_judge_model
+from judgemark_v2lp.utils.visualization import create_side_by_side_score_charts
+from judgemark_v2lp.core.scoring import (
    parse_scores, compute_raw_score, compute_detailed_distribution,
    compute_model_level_stats, compute_cross_model_stats,
    build_landmark_calibration_config, apply_landmark_calibration,
    log_score_summary, confidence_interval_95
 )
-from core.scoring import compute_detailed_distribution, compute_detailed_distribution  # etc
-from core.separability import compute_separability_metrics
-from core.stability import run_stability_test, compute_iteration_stability, compute_randomized_iteration_rank_stability_by_item
-from utils.stats import normalize, modulate_x_by_y
-from utils.state import should_exit, executor
+from judgemark_v2lp.core.scoring import compute_detailed_distribution, compute_detailed_distribution  # etc
+from judgemark_v2lp.core.separability import compute_separability_metrics
+from judgemark_v2lp.core.stability import run_stability_test, compute_iteration_stability, compute_randomized_iteration_rank_stability_by_item
+from judgemark_v2lp.utils.stats import normalize, modulate_x_by_y
+from judgemark_v2lp.utils.state import should_exit, executor

 def process_sample(model_name: str, iteration_key: str, item_id: str, item_text: str, 
                  prompt_template: str, run_key: str, runs: Dict, runs_file: str,
@@ -519,4 +518,4 @@ def run_judgemark_v2(
            executor = None
    
    logging.info(f"Judgemark-v2 run {run_key} ended with status: {status}")
-    return run_key
+    return run_key
@@ -55,4 +55,4 @@ MODEL_NAME_REPLACEMENTS = {
    "cohere/command-r-08-2024": "CohereForAI/c4ai-command-r-08-2024",
    "google/gemini-pro-1.5": "gemini-pro-1_5",
    "openai/o3-mini": "o3-mini",
-}
+}
@@ -5,8 +5,8 @@ import scipy.stats
 import logging
 from typing import Dict, List
 import re
-from config.constants import REFERENCE_MODEL_SCORES
-from utils.stats import normalize
+from judgemark_v2lp.config.constants import REFERENCE_MODEL_SCORES
+from judgemark_v2lp.utils.stats import normalize

 def parse_scores(judge_model_response: str) -> Dict[str,float]:
    """
@@ -272,4 +272,4 @@ def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict):
    for model, stats in sorted_models:
        line = f"{model:.<40} {stats['mean']:.3f} ±{stats['ci95']:.3f}"
        logging.info(line)
-    logging.info("------------------------------------")
+    logging.info("------------------------------------")
@@ -5,7 +5,7 @@ import statistics
 import numpy as np
 import scipy.stats
 from typing import Dict, List, Tuple
-from utils.stats import normalize, modulate_x_by_y
+from judgemark_v2lp.utils.stats import normalize, modulate_x_by_y

 try:
    from scipy.stats import wasserstein_distance
@@ -4,11 +4,11 @@ from concurrent.futures import ThreadPoolExecutor
 from tqdm import tqdm
 from typing import Dict

-from utils.api import send_to_judge_model
-from utils.file_io import save_json_file
-from core.scoring import parse_scores, compute_raw_score
-from config.constants import STABILITY_ITEMS, STABILITY_REPS
-from utils.state import should_exit, executor
+from judgemark_v2lp.utils.api import send_to_judge_model
+from judgemark_v2lp.utils.file_io import save_json_file
+from judgemark_v2lp.scoring import parse_scores, compute_raw_score
+from judgemark_v2lp.config.constants import STABILITY_ITEMS, STABILITY_REPS
+from judgemark_v2lp.utils.state import should_exit, executor
 from collections import defaultdict
 import statistics
 import math
@@ -445,4 +445,4 @@ def run_stability_test(run_data, judge_model, judge_prompts, samples_data, runs,
                else:
                    logging.warning(f"Got invalid score for stability item {key_name}, will need retry")
            except Exception as exc:
-                logging.error(f"Exception in stability test: {exc}")
+                logging.error(f"Exception in stability test: {exc}")
@@ -5,7 +5,7 @@ import matplotlib.ticker as ticker
 from typing import Dict
 from scipy.stats import linregress
 from scipy.stats import spearmanr, theilslopes
-from config.constants import NEGATIVE_MARKERS, MODEL_NAME_REPLACEMENTS
+from judgemark_v2lp.config.constants import NEGATIVE_MARKERS, MODEL_NAME_REPLACEMENTS


 def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_data: Dict):
@@ -237,4 +237,4 @@ def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_d
    
    plt.tight_layout()
    plt.savefig(f"results/charts/judgemark_scattergrid_{sanitized_judge}.png", bbox_inches='tight', dpi=200)
-    plt.close(fig2)
+    plt.close(fig2)
@@ -0,0 +1,28 @@
+[project]
+name = "judgemark_v2lp"
+version = "0.1.0"
+description = "Judgemark V2 is a benchmark that evaluates how well a language model can judge creative writing"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "matplotlib>=3.7",
+    "python-dotenv>=1.1.1",
+    "scipy>=1.10",
+    "transformers>=4.26",
+]
+
+
+[build-system]
+requires = ["setuptools>=61"]
+build-backend = "setuptools.build_meta"
+
+[dependency-groups]
+dev = [
+    "ipykernel>=6.29.5",
+    "ipywidgets>=8.1.7",
+]
+
+
+[tool.setuptools.packages.find]
+where = ["."]  # search the root directory
+include = ["judgemark_v2lp*"]
@@ -1,3 +0,0 @@
-matplotlib>=3.7
-transformers>=4.26
-scipy>=1.10