refactor to uv package

This commit is contained in:
wassname
2025-07-23 16:58:35 +08:00
parent 372bbaa072
commit 5615c809e1
20 changed files with 1670 additions and 56 deletions
+3 -1
View File
@@ -5,4 +5,6 @@ judgemark_v2_runs.json
judgemark_v2.1_runs.json
ministral-repetition-results.json
*.pyc
dev
dev
*.egg-info
__pycache__
+1
View File
@@ -0,0 +1 @@
3.10
-21
View File
@@ -1,21 +0,0 @@
MIT License
Copyright (c) 2025 Samuel Paech
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
+7 -1
View File
@@ -1,3 +1,9 @@
Fork of Judgemark to see whether using weighted logprob or ranklogprob works better than the current method.
----
# Judgemark V2
**Judgemark V2** is a benchmark that evaluates how well a language model can judge creative writing. Instead of relying on simple pairwise preferences, Judgemark V2 prompts the judge model to assign numeric scores for multiple literary criteria (e.g., “Nuanced Characters,” “Overwrought,” “Emotionally Engaging”). It then aggregates those scores, measures how consistent and discriminative they are, and derives a final numeric rating of the judge model’s performance.
@@ -117,4 +123,4 @@ This project is licensed under an [MIT License](LICENSE). See the `LICENSE` file
---
**Happy Judging!** If you have any questions, reach out via [GitHub Issues](https://github.com/EQ-bench/judgemark-v2/issues) or contact the maintainers.
**Happy Judging!** If you have any questions, reach out via [GitHub Issues](https://github.com/EQ-bench/judgemark-v2/issues) or contact the maintainers.
+6 -6
View File
@@ -5,11 +5,11 @@ import logging
import time
from utils.logging_setup import setup_logging, get_verbosity
from utils.file_io import load_json_file
from core.benchmark import run_judgemark_v2
from utils.api import API_KEY
from utils.state import should_exit, executor
from judgemark_v2lp.utils.logging_setup import setup_logging, get_verbosity
from judgemark_v2lp.utils.file_io import load_json_file
from judgemark_v2lp.benchmark import run_judgemark_v2
from judgemark_v2lp.utils.api import API_KEY
from judgemark_v2lp.utils.state import should_exit, executor
def signal_handler(signum, frame):
@@ -122,4 +122,4 @@ if __name__ == "__main__":
final_score = rd.get("final_judgemark_score", "N/A")
logging.info(f"Run ID: {rid}, Final Judgemark Score: {final_score}")
print(f"Run ID: {rid}")
print(f"Final Judgemark-v2 Score: {final_score}")
print(f"Final Judgemark-v2 Score: {final_score}")
@@ -11,21 +11,20 @@ from typing import Dict, List
from tqdm import tqdm
from collections import defaultdict
from utils.file_io import load_json_file, save_json_file
from utils.api import send_to_judge_model
from utils.visualization import create_side_by_side_score_charts
import statistics
from core.scoring import (
from judgemark_v2lp.utils.file_io import load_json_file, save_json_file
from judgemark_v2lp.utils.api import send_to_judge_model
from judgemark_v2lp.utils.visualization import create_side_by_side_score_charts
from judgemark_v2lp.core.scoring import (
parse_scores, compute_raw_score, compute_detailed_distribution,
compute_model_level_stats, compute_cross_model_stats,
build_landmark_calibration_config, apply_landmark_calibration,
log_score_summary, confidence_interval_95
)
from core.scoring import compute_detailed_distribution, compute_detailed_distribution # etc
from core.separability import compute_separability_metrics
from core.stability import run_stability_test, compute_iteration_stability, compute_randomized_iteration_rank_stability_by_item
from utils.stats import normalize, modulate_x_by_y
from utils.state import should_exit, executor
from judgemark_v2lp.core.scoring import compute_detailed_distribution, compute_detailed_distribution # etc
from judgemark_v2lp.core.separability import compute_separability_metrics
from judgemark_v2lp.core.stability import run_stability_test, compute_iteration_stability, compute_randomized_iteration_rank_stability_by_item
from judgemark_v2lp.utils.stats import normalize, modulate_x_by_y
from judgemark_v2lp.utils.state import should_exit, executor
def process_sample(model_name: str, iteration_key: str, item_id: str, item_text: str,
prompt_template: str, run_key: str, runs: Dict, runs_file: str,
@@ -519,4 +518,4 @@ def run_judgemark_v2(
executor = None
logging.info(f"Judgemark-v2 run {run_key} ended with status: {status}")
return run_key
return run_key
@@ -55,4 +55,4 @@ MODEL_NAME_REPLACEMENTS = {
"cohere/command-r-08-2024": "CohereForAI/c4ai-command-r-08-2024",
"google/gemini-pro-1.5": "gemini-pro-1_5",
"openai/o3-mini": "o3-mini",
}
}
@@ -5,8 +5,8 @@ import scipy.stats
import logging
from typing import Dict, List
import re
from config.constants import REFERENCE_MODEL_SCORES
from utils.stats import normalize
from judgemark_v2lp.config.constants import REFERENCE_MODEL_SCORES
from judgemark_v2lp.utils.stats import normalize
def parse_scores(judge_model_response: str) -> Dict[str,float]:
"""
@@ -272,4 +272,4 @@ def log_score_summary(score_type: str, cross_stats: Dict, model_stats: Dict):
for model, stats in sorted_models:
line = f"{model:.<40} {stats['mean']:.3f} ±{stats['ci95']:.3f}"
logging.info(line)
logging.info("------------------------------------")
logging.info("------------------------------------")
@@ -5,7 +5,7 @@ import statistics
import numpy as np
import scipy.stats
from typing import Dict, List, Tuple
from utils.stats import normalize, modulate_x_by_y
from judgemark_v2lp.utils.stats import normalize, modulate_x_by_y
try:
from scipy.stats import wasserstein_distance
@@ -4,11 +4,11 @@ from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from typing import Dict
from utils.api import send_to_judge_model
from utils.file_io import save_json_file
from core.scoring import parse_scores, compute_raw_score
from config.constants import STABILITY_ITEMS, STABILITY_REPS
from utils.state import should_exit, executor
from judgemark_v2lp.utils.api import send_to_judge_model
from judgemark_v2lp.utils.file_io import save_json_file
from judgemark_v2lp.scoring import parse_scores, compute_raw_score
from judgemark_v2lp.config.constants import STABILITY_ITEMS, STABILITY_REPS
from judgemark_v2lp.utils.state import should_exit, executor
from collections import defaultdict
import statistics
import math
@@ -445,4 +445,4 @@ def run_stability_test(run_data, judge_model, judge_prompts, samples_data, runs,
else:
logging.warning(f"Got invalid score for stability item {key_name}, will need retry")
except Exception as exc:
logging.error(f"Exception in stability test: {exc}")
logging.error(f"Exception in stability test: {exc}")
View File
@@ -5,7 +5,7 @@ import matplotlib.ticker as ticker
from typing import Dict
from scipy.stats import linregress
from scipy.stats import spearmanr, theilslopes
from config.constants import NEGATIVE_MARKERS, MODEL_NAME_REPLACEMENTS
from judgemark_v2lp.config.constants import NEGATIVE_MARKERS, MODEL_NAME_REPLACEMENTS
def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_data: Dict):
@@ -237,4 +237,4 @@ def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_d
plt.tight_layout()
plt.savefig(f"results/charts/judgemark_scattergrid_{sanitized_judge}.png", bbox_inches='tight', dpi=200)
plt.close(fig2)
plt.close(fig2)
+28
View File
@@ -0,0 +1,28 @@
# Packaging metadata for the judgemark_v2lp distribution.
[project]
name = "judgemark_v2lp"
version = "0.1.0"
description = "**Judgemark V2** is a benchmark that evaluates how well a language model can judge creative writing"
readme = "README.md"
requires-python = ">=3.10"
# Runtime dependencies.
# NOTE(review): modules in this package also import tqdm and numpy directly;
# presumably they arrive transitively via the packages below -- confirm, and
# consider declaring them explicitly.
dependencies = [
"matplotlib>=3.7",
"python-dotenv>=1.1.1",
"scipy>=1.10",
"transformers>=4.26",
]
# Build backend: the package is built with setuptools.
[build-system]
requires = ["setuptools>=61"]
build-backend = "setuptools.build_meta"
# Development-only dependency group (notebook tooling); kept separate from
# the runtime `dependencies` list above.
[dependency-groups]
dev = [
"ipykernel>=6.29.5",
"ipywidgets>=8.1.7",
]
# Package discovery: only packages whose dotted name matches
# "judgemark_v2lp*" are included in the build.
[tool.setuptools.packages.find]
where = ["."] # search the root directory
include = ["judgemark_v2lp*"]
-3
View File
@@ -1,3 +0,0 @@
matplotlib>=3.7
transformers>=4.26
scipy>=1.10
Generated
+1602
View File
File diff suppressed because it is too large Load Diff