"""Command-line entry point for running the Judgemark-v2 benchmark.

Parses CLI arguments, configures logging, installs SIGINT/SIGTERM handlers,
executes the requested number of benchmark runs and prints a score summary.
"""

import argparse
import signal
import sys
import time
from pathlib import Path

from loguru import logger

from judgemark_v2lp.benchmark import run_judgemark_v2
from judgemark_v2lp.benchmark import sanitize_model_name
from judgemark_v2lp.utils import state
from judgemark_v2lp.utils.api import API_KEY
from judgemark_v2lp.utils.file_io import load_json_file
from judgemark_v2lp.utils.logging_setup import setup_logging, get_verbosity
from judgemark_v2lp.utils.state import should_exit, executor


def signal_handler(signum, frame):
    """Handle interrupt signals (SIGINT, SIGTERM).

    Raises the exit flag, shuts down the worker executor if one is
    registered, and terminates the process with exit status 1.

    Args:
        signum: Number of the signal that was delivered.
        frame: Stack frame that was executing when the signal arrived
            (unused).
    """
    global executor, should_exit
    print(f"\n[DEBUG] Signal {signum} caught!")
    logger.warning("Signal handler called")

    # ``from ...state import should_exit`` only copies the binding into this
    # module, so assigning the local name alone never changes the flag that
    # lives in the ``state`` module. Set both: the shared one for any code
    # reading ``state.should_exit`` and the local one for this script.
    # NOTE(review): presumably the benchmark workers poll the flag on the
    # ``state`` module -- confirm against judgemark_v2lp.benchmark.
    state.should_exit = True
    should_exit = True
    time.sleep(0.1)  # Give workers a moment to see the flag

    # Look the executor up at signal time: the name imported at module load
    # is a snapshot and does not follow later assignments to
    # ``state.executor``. Fall back to the local name for compatibility.
    active_executor = getattr(state, "executor", None) or executor
    if active_executor:
        logger.info("Shutting down executor from signal handler")
        active_executor.shutdown(wait=False)
        logger.info("Executor shutdown complete")
    sys.exit(1)


def parse_args():
    """Parse the command-line arguments for a Judgemark-v2 run.

    When ``--runs-file`` is not given, a default path of the form
    ``outputs/<run_id>_<sanitized judge model>_<timestamp>.json`` is filled in
    (``<run_id>`` is empty when ``--run-id`` was not supplied).

    Returns:
        argparse.Namespace: The parsed arguments, with ``runs_file`` always
        set to a ``Path``.
    """
    parser = argparse.ArgumentParser(description='Run Judgemark-v2 Benchmark')
    parser.add_argument(
        '--judge-model',
        required=True,
        help='Judge model identifier (e.g., openai/gpt-4)'
    )
    parser.add_argument(
        '--samples-file',
        default="data/judgemark_v2.1_samples.json",
        help='JSON file containing pre-generated samples from various writer models'
    )
    parser.add_argument(
        '--prompts-file',
        default="data/judge_prompts.json",
        help='JSON file containing the partial judge prompts to be filled with test responses'
    )
    parser.add_argument(
        '--runs-file',
        default=None,
        help='Path to store the Judgemark run results',
        type=Path,
    )
    parser.add_argument(
        '--run-id',
        help='Resume (or create) a run using this base ID, to be combined with the judge model name',
    )
    parser.add_argument(
        '--threads',
        type=int,
        default=0,
        help='Number of threads to use'
    )
    parser.add_argument(
        '--verbosity',
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
        help='Set the logging verbosity level',
        default='INFO'  # Default to INFO if not specified
    )
    parser.add_argument(
        '--num-runs',
        type=int,
        default=1,
        help='Number of benchmark runs to execute'
    )
    # NOTE: despite its name this flag is a ``store_false`` switch: passing it
    # turns raw-output saving OFF (the default is to save), as the help says.
    parser.add_argument(
        '--save-raw-judge-output',
        dest='save_raw_judge_output',
        action='store_false',
        default=True,
        help='If provided, do NOT store the raw judge model output in the results JSON (default: store raw judge output)'
    )
    # parser.add_argument(
    #     '--score-weighted',
    #     action='store_true',
    #     default=False,
    #     help='If set, use weighted scoring for the judge model (default: false)'
    # )
    # parser.add_argument(
    #     '--score-ranked',
    #     action='store_true',
    #     default=False,
    #     help='If set, use ranked logprob scoring for the judge model (default: false)'
    # )

    args = parser.parse_args()
    if args.runs_file is None:
        # Derive a unique, timestamped results path from the judge model name.
        sanitized_jm = sanitize_model_name(args.judge_model)
        ts = time.strftime("%Y%m%d_%H%M%S")
        args.runs_file = Path(f"outputs/{args.run_id or ''}_{sanitized_jm}_{ts}.json")
    return args


def main():
    """Run the benchmark the requested number of times and print a summary.

    Raises:
        ValueError: If no API key is available for the judge model.
    """
    global should_exit

    # Register signal handlers
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    # Reset sentinel (both the local copy and the shared flag on ``state``)
    should_exit = False
    state.should_exit = False

    # Parse args
    args = parse_args()

    # Setup logging
    verbosity = get_verbosity(args.verbosity)
    setup_logging(verbosity)
    logger.debug("Logging initialized")
    run_name = args.runs_file.stem
    logger.add(f"outputs/{run_name}.log", level=verbosity, rotation="10 MB", enqueue=True)
    logger.info(f"Logging to outputs/{run_name}.log")

    # Check that we have an API key for the judge model
    if not API_KEY:
        logger.critical("No OPENAI_API_KEY found in environment variables.")
        raise ValueError("OPENAI_API_KEY not found in environment variables.")

    run_ids = []
    for i in range(1, args.num_runs + 1):
        # Stop scheduling further runs once an exit has been requested,
        # whichever copy of the flag was raised.
        if should_exit or state.should_exit:
            break
        logger.info(f"Starting Judgemark-v2 run {i} of {args.num_runs}")
        rid = run_judgemark_v2(
            judge_model=args.judge_model,
            samples_file=args.samples_file,
            prompts_file=args.prompts_file,
            runs_file=args.runs_file,
            num_threads=args.threads,
            run_id=args.run_id,
            save_raw_judge_output=args.save_raw_judge_output
        )
        run_ids.append(rid)

    # Finally, print summary
    runs = load_json_file(args.runs_file)
    logger.info("\nAll Judgemark-v2 runs completed:")
    print("\nAll Judgemark-v2 runs completed:")
    for rid in run_ids:
        rd = runs.get(rid, {})
        final_score = rd.get("final_judgemark_score", "N/A")
        logger.info(f"Run ID: {rid}, Final Judgemark Score: {final_score}")
        print(f"Run ID: {rid}")
        print(f"Final Judgemark-v2 Score: {final_score}")


if __name__ == "__main__":
    main()