mirror of
https://github.com/wassname/Judgemark-v2lp.git
synced 2026-06-27 16:10:14 +08:00
refactor to uv package
This commit is contained in:
@@ -6,3 +6,5 @@ judgemark_v2.1_runs.json
|
||||
ministral-repetition-results.json
|
||||
*.pyc
|
||||
dev
|
||||
*.egg-info
|
||||
__pycache__
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
3.10
|
||||
@@ -1,21 +0,0 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2025 Samuel Paech
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
@@ -1,3 +1,9 @@
|
||||
Fork of Judgemark to see whether using weighted logprobs or rank logprobs works better than the current method.
|
||||
|
||||
|
||||
|
||||
----
|
||||
|
||||
# Judgemark V2
|
||||
|
||||
**Judgemark V2** is a benchmark that evaluates how well a language model can judge creative writing. Instead of relying on simple pairwise preferences, Judgemark V2 prompts the judge model to assign numeric scores for multiple literary criteria (e.g., “Nuanced Characters,” “Overwrought,” “Emotionally Engaging”). It then aggregates those scores, measures how consistent and discriminative they are, and derives a final numeric rating of the judge model’s performance.
|
||||
|
||||
+5
-5
@@ -5,11 +5,11 @@ import logging
|
||||
import time
|
||||
|
||||
|
||||
from utils.logging_setup import setup_logging, get_verbosity
|
||||
from utils.file_io import load_json_file
|
||||
from core.benchmark import run_judgemark_v2
|
||||
from utils.api import API_KEY
|
||||
from utils.state import should_exit, executor
|
||||
from judgemark_v2lp.utils.logging_setup import setup_logging, get_verbosity
|
||||
from judgemark_v2lp.utils.file_io import load_json_file
|
||||
from judgemark_v2lp.benchmark import run_judgemark_v2
|
||||
from judgemark_v2lp.utils.api import API_KEY
|
||||
from judgemark_v2lp.utils.state import should_exit, executor
|
||||
|
||||
|
||||
def signal_handler(signum, frame):
|
||||
|
||||
@@ -11,21 +11,20 @@ from typing import Dict, List
|
||||
from tqdm import tqdm
|
||||
from collections import defaultdict
|
||||
|
||||
from utils.file_io import load_json_file, save_json_file
|
||||
from utils.api import send_to_judge_model
|
||||
from utils.visualization import create_side_by_side_score_charts
|
||||
import statistics
|
||||
from core.scoring import (
|
||||
from judgemark_v2lp.utils.file_io import load_json_file, save_json_file
|
||||
from judgemark_v2lp.utils.api import send_to_judge_model
|
||||
from judgemark_v2lp.utils.visualization import create_side_by_side_score_charts
|
||||
from judgemark_v2lp.core.scoring import (
|
||||
parse_scores, compute_raw_score, compute_detailed_distribution,
|
||||
compute_model_level_stats, compute_cross_model_stats,
|
||||
build_landmark_calibration_config, apply_landmark_calibration,
|
||||
log_score_summary, confidence_interval_95
|
||||
)
|
||||
from core.scoring import compute_detailed_distribution, compute_detailed_distribution # etc
|
||||
from core.separability import compute_separability_metrics
|
||||
from core.stability import run_stability_test, compute_iteration_stability, compute_randomized_iteration_rank_stability_by_item
|
||||
from utils.stats import normalize, modulate_x_by_y
|
||||
from utils.state import should_exit, executor
|
||||
from judgemark_v2lp.core.scoring import compute_detailed_distribution, compute_detailed_distribution # etc
|
||||
from judgemark_v2lp.core.separability import compute_separability_metrics
|
||||
from judgemark_v2lp.core.stability import run_stability_test, compute_iteration_stability, compute_randomized_iteration_rank_stability_by_item
|
||||
from judgemark_v2lp.utils.stats import normalize, modulate_x_by_y
|
||||
from judgemark_v2lp.utils.state import should_exit, executor
|
||||
|
||||
def process_sample(model_name: str, iteration_key: str, item_id: str, item_text: str,
|
||||
prompt_template: str, run_key: str, runs: Dict, runs_file: str,
|
||||
@@ -5,8 +5,8 @@ import scipy.stats
|
||||
import logging
|
||||
from typing import Dict, List
|
||||
import re
|
||||
from config.constants import REFERENCE_MODEL_SCORES
|
||||
from utils.stats import normalize
|
||||
from judgemark_v2lp.config.constants import REFERENCE_MODEL_SCORES
|
||||
from judgemark_v2lp.utils.stats import normalize
|
||||
|
||||
def parse_scores(judge_model_response: str) -> Dict[str,float]:
|
||||
"""
|
||||
@@ -5,7 +5,7 @@ import statistics
|
||||
import numpy as np
|
||||
import scipy.stats
|
||||
from typing import Dict, List, Tuple
|
||||
from utils.stats import normalize, modulate_x_by_y
|
||||
from judgemark_v2lp.utils.stats import normalize, modulate_x_by_y
|
||||
|
||||
try:
|
||||
from scipy.stats import wasserstein_distance
|
||||
@@ -4,11 +4,11 @@ from concurrent.futures import ThreadPoolExecutor
|
||||
from tqdm import tqdm
|
||||
from typing import Dict
|
||||
|
||||
from utils.api import send_to_judge_model
|
||||
from utils.file_io import save_json_file
|
||||
from core.scoring import parse_scores, compute_raw_score
|
||||
from config.constants import STABILITY_ITEMS, STABILITY_REPS
|
||||
from utils.state import should_exit, executor
|
||||
from judgemark_v2lp.utils.api import send_to_judge_model
|
||||
from judgemark_v2lp.utils.file_io import save_json_file
|
||||
from judgemark_v2lp.scoring import parse_scores, compute_raw_score
|
||||
from judgemark_v2lp.config.constants import STABILITY_ITEMS, STABILITY_REPS
|
||||
from judgemark_v2lp.utils.state import should_exit, executor
|
||||
from collections import defaultdict
|
||||
import statistics
|
||||
import math
|
||||
@@ -5,7 +5,7 @@ import matplotlib.ticker as ticker
|
||||
from typing import Dict
|
||||
from scipy.stats import linregress
|
||||
from scipy.stats import spearmanr, theilslopes
|
||||
from config.constants import NEGATIVE_MARKERS, MODEL_NAME_REPLACEMENTS
|
||||
from judgemark_v2lp.config.constants import NEGATIVE_MARKERS, MODEL_NAME_REPLACEMENTS
|
||||
|
||||
|
||||
def create_side_by_side_score_charts(run_data: Dict, judge_model: str, samples_data: Dict):
|
||||
@@ -0,0 +1,28 @@
|
||||
[project]
|
||||
name = "judgemark_v2lp"
|
||||
version = "0.1.0"
|
||||
description = "Judgemark V2 is a benchmark that evaluates how well a language model can judge creative writing"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"matplotlib>=3.7",
|
||||
"python-dotenv>=1.1.1",
|
||||
"scipy>=1.10",
|
||||
"transformers>=4.26",
|
||||
]
|
||||
|
||||
|
||||
[build-system]
|
||||
requires = ["setuptools>=61"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"ipykernel>=6.29.5",
|
||||
"ipywidgets>=8.1.7",
|
||||
]
|
||||
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
where = ["."] # search the root directory
|
||||
include = ["judgemark_v2lp*"]
|
||||
@@ -1,3 +0,0 @@
|
||||
matplotlib>=3.7
|
||||
transformers>=4.26
|
||||
scipy>=1.10
|
||||
Reference in New Issue
Block a user