Judgemark-v2lp/nbs/02_recomp_multi.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "426cbec8",
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "bdc690c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "from judgemark_v2lp.utils.file_io import load_json_file, save_json_file\n",
    "from judgemark_v2lp.benchmark import sanitize_model_name, finalize_scores_and_compute_judgemark\n",
    "import uuid\n",
    "from tqdm import tqdm\n",
    "from judgemark_v2lp.scoring import compute_ranked_score, compute_raw_score, compute_weighted_score\n",
    "import numpy as np\n",
    "from tqdm.auto import tqdm\n",
    "import polars as pl\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "150573a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "samples_file = \"../data/judgemark_v2.1_samples.json\"\n",
    "samples_data = load_json_file(samples_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "bfedfcdc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from loguru import logger\n",
    "import sys\n",
    "logger.remove()\n",
    "logger.add(sys.stderr, level=\"INFO\", format=\"{message}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "edd6567d",
   "metadata": {},
   "outputs": [],
   "source": [
    "fs = sorted(Path(\"../outputs\").glob(\"my_judgemark_runs*.json\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "9569d5be",
   "metadata": {},
   "outputs": [],
   "source": [
    "def recompute_scores_with_choice_norm(run):\n",
    "\n",
    "    for model_name in (run['results'].keys()):\n",
    "        logprobs = []\n",
    "        for iteration_key in (run['results'][model_name].keys()):\n",
    "            for item_id in (run['results'][model_name][iteration_key].keys()):\n",
    "                storage_dict = run['results'][model_name][iteration_key][item_id]\n",
    "                logp = storage_dict['logp']\n",
    "                lpv = list(logp.values())\n",
    "                if len(lpv) == 0:\n",
    "                    continue\n",
    "                logprobs.append(np.stack(lpv))\n",
    "        \n",
    "        # get the log prob mean so we can normalise on a per choice basis, this lets us avoid e.g. a bias for the number 1\n",
    "        logprobs2 = np.concatenate(logprobs)\n",
    "        log_prob_mean = logprobs2.mean(0)\n",
    "\n",
    "        for iteration_key in (run['results'][model_name].keys()):\n",
    "            for item_id in (run['results'][model_name][iteration_key].keys()):\n",
    "                storage_dict = run['results'][model_name][iteration_key][item_id]\n",
    "                logp_norm = {k: v - log_prob_mean for i, (k, v) in enumerate(storage_dict['logp'].items())}\n",
    "\n",
    "                def store_or_delete(storage_dict, key, value):\n",
    "                    if value is not None:\n",
    "                        assert np.isfinite(value), f\"Score for {model_name} {iteration_key} {item_id} is not finite: {value}\"\n",
    "                        storage_dict[key] = value\n",
    "                    elif key in storage_dict:\n",
    "                        del storage_dict[key]\n",
    "\n",
    "                extracted_rscores_norm = compute_ranked_score(logp_norm)\n",
    "                ranked_score_norm = compute_raw_score(extracted_rscores_norm)\n",
    "                store_or_delete(storage_dict, \"aggregated_score_ranked_norm\", ranked_score_norm)\n",
    "\n",
    "                extracted_rscores = compute_ranked_score(logp)\n",
    "                ranked_score = compute_raw_score(extracted_rscores)\n",
    "                store_or_delete(storage_dict, \"aggregated_score_ranked\", ranked_score)\n",
    "\n",
    "                extracted_wscore = compute_weighted_score(logp_norm)\n",
    "                weighted_score_norm = compute_raw_score(extracted_wscore)\n",
    "                store_or_delete(storage_dict, \"aggregated_score_weighted_norm\", weighted_score_norm)\n",
    "\n",
    "                run['results'][model_name][iteration_key][item_id] = storage_dict\n",
    "\n",
    "    return run"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f5e72051",
   "metadata": {},
   "outputs": [],
   "source": [
    "# len(run.get('results', {}))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "05173e4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import display, Markdown\n",
    "\n",
    "\n",
    "def df2md(df):\n",
    "\n",
    "    # Round numeric columns and configure display for markdown\n",
    "    df_display = df.select([\n",
    "        pl.col(\"name\"),\n",
    "        *[pl.col(c).round(3) for c in df.columns if c != \"name\" and df[c].dtype in [pl.Float64, pl.Float32]]\n",
    "    ])\n",
    "\n",
    "    with pl.Config(\n",
    "        tbl_formatting=\"MARKDOWN\",\n",
    "        tbl_hide_column_data_types=True,\n",
    "        tbl_hide_dataframe_shape=True,\n",
    "        tbl_width_chars=240,  # Allow wider table\n",
    "        tbl_cols=-1,  # Show all columns\n",
    "    ) as cfg:\n",
    "        print(df_display)\n",
    "        s = str(df_display)\n",
    "    return s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "b5b5b0d2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "Processing ../outputs/my_judgemark_runs.json...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c42ed9c80b524ec2b2ebbba2028bc12e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Recomputing scores with choice norm:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Error processing run 8039a352-2be5-43ca-9360-df818aa5f214__meta-llama_llama-3_2-3b-instruct: At least two samples are required; got 1.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Processing run 8039a352-2be5-43ca-9360-df818aa5f214__meta-llama_llama-3_2-3b-instruct...\n",
      "\n",
      "Processing run ab20b598-845b-4da7-9f4c-56ec05405e28__meta-llama_llama-3_2-3b-instruct...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 1.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "7.600\n",
      "CI99 Overlap pct: \n",
      "0.915\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.034\n",
      "Average EMD across all pairs: 0.162\n",
      "Avg. CI95 half-width: 0.130 (modulated: 0.231)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 1.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "23.922\n",
      "CI99 Overlap pct: \n",
      "0.916\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.036\n",
      "Average EMD across all pairs: 0.476\n",
      "Avg. CI95 half-width: 0.403 (modulated: 0.035)\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "k_tau (7.394347826086957, 7.170416666666666, 7.290909090909091, 7.494090909090909, 7.406363636363636, 7.224347826086957, 7.2669565217391305, 7.414166666666667, 7.310909090909091, 7.384761904761905, 7.469130434782609, 7.541428571428572, 7.461304347826087, 7.485217391304348, 7.8304347826086955, 7.137727272727273, 7.265833333333333) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.5147058823529411\n",
      "k_tau (5.32518268176836, 4.366125662221411, 4.742696673531074, 5.496266233766234, 5.095086690802804, 4.951616137948563, 4.536200378071834, 5.082064075630252, 4.8599640449065005, 4.77386193607878, 5.105923853473337, 5.596322442020287, 5.452791853981669, 5.608248677542852, 6.489364346872965, 4.247853904407614, 4.663997609913531) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.6323529411764706\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 2.7756, p=0.0002\n",
      "Kruskal-Wallis: 51.2527, p=0.0000\n",
      "Pearson r=0.4689\n",
      "Kendall τ=0.3029\n",
      "Std.Dev across models: 0.1081\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 7.612 ±0.145\n",
      "gpt-4o-2024-11-20....................... 7.602 ±0.128\n",
      "claude-3-5-sonnet-20240620.............. 7.572 ±0.120\n",
      "gemini-1.5-pro-001...................... 7.510 ±0.131\n",
      "Llama-3-70b-chat-hf..................... 7.477 ±0.124\n",
      "gemini-1.5-pro-002...................... 7.434 ±0.120\n",
      "Mixtral-8x22B-Instruct-v0.1............. 7.418 ±0.138\n",
      "claude-3-haiku-20240307................. 7.408 ±0.147\n",
      "gemma-7b-it............................. 7.398 ±0.128\n",
      "Mistral-Large-Instruct-2411............. 7.391 ±0.140\n",
      "gemma-2b-it............................. 7.387 ±0.121\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 7.352 ±0.126\n",
      "claude-3-opus-20240229.................. 7.342 ±0.118\n",
      "c4ai-command-r-08-2024.................. 7.319 ±0.157\n",
      "Llama-2-13b-chat-hf..................... 7.283 ±0.124\n",
      "gpt-3.5-turbo-0125...................... 7.283 ±0.128\n",
      "databricks/dbrx-instruct................ 7.237 ±0.114\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 2.9541, p=0.0001\n",
      "Kruskal-Wallis: 51.2527, p=0.0000\n",
      "Pearson r=0.4784\n",
      "Kendall τ=0.3353\n",
      "Std.Dev across models: 0.3448\n",
      "\n",
      "Model Scores:\n",
      "gpt-4o-2024-11-20....................... 5.856 ±0.396\n",
      "DeepSeek-R1............................. 5.776 ±0.437\n",
      "claude-3-5-sonnet-20240620.............. 5.754 ±0.382\n",
      "gemini-1.5-pro-001...................... 5.485 ±0.411\n",
      "gemini-1.5-pro-002...................... 5.351 ±0.390\n",
      "Llama-3-70b-chat-hf..................... 5.349 ±0.395\n",
      "Mistral-Large-Instruct-2411............. 5.137 ±0.408\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.122 ±0.435\n",
      "claude-3-haiku-20240307................. 5.120 ±0.439\n",
      "gemma-2b-it............................. 5.112 ±0.399\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.101 ±0.378\n",
      "gemma-7b-it............................. 5.062 ±0.406\n",
      "claude-3-opus-20240229.................. 4.957 ±0.378\n",
      "c4ai-command-r-08-2024.................. 4.952 ±0.434\n",
      "gpt-3.5-turbo-0125...................... 4.819 ±0.402\n",
      "Llama-2-13b-chat-hf..................... 4.775 ±0.391\n",
      "databricks/dbrx-instruct................ 4.628 ±0.377\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 1.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "7.600\n",
      "CI99 Overlap pct: \n",
      "0.915\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.034\n",
      "Average EMD across all pairs: 0.162\n",
      "Avg. CI95 half-width: 0.130 (modulated: 0.231)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 1.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "23.922\n",
      "CI99 Overlap pct: \n",
      "0.916\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.036\n",
      "Average EMD across all pairs: 0.476\n",
      "Avg. CI95 half-width: 0.403 (modulated: 0.035)\n",
      "\n",
      "Score stability (RAW)\n",
      "Randomized average Kendall's tau (raw): 0.163\n",
      "Score stability (CALIBRATED)\n",
      "Randomized average Kendall's tau (calibrated): 0.201 \n",
      "(0.3352941176470588)\n",
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 2.7756, p=0.0002\n",
      "Kruskal-Wallis: 51.2527, p=0.0000\n",
      "Pearson r=0.4689\n",
      "Kendall τ=0.3029\n",
      "Std.Dev across models: 0.1081\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 7.612 ±0.145\n",
      "gpt-4o-2024-11-20....................... 7.602 ±0.128\n",
      "claude-3-5-sonnet-20240620.............. 7.572 ±0.120\n",
      "gemini-1.5-pro-001...................... 7.510 ±0.131\n",
      "Llama-3-70b-chat-hf..................... 7.477 ±0.124\n",
      "gemini-1.5-pro-002...................... 7.434 ±0.120\n",
      "Mixtral-8x22B-Instruct-v0.1............. 7.418 ±0.138\n",
      "claude-3-haiku-20240307................. 7.408 ±0.147\n",
      "gemma-7b-it............................. 7.398 ±0.128\n",
      "Mistral-Large-Instruct-2411............. 7.391 ±0.140\n",
      "gemma-2b-it............................. 7.387 ±0.121\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 7.352 ±0.126\n",
      "claude-3-opus-20240229.................. 7.342 ±0.118\n",
      "c4ai-command-r-08-2024.................. 7.319 ±0.157\n",
      "Llama-2-13b-chat-hf..................... 7.283 ±0.124\n",
      "gpt-3.5-turbo-0125...................... 7.283 ±0.128\n",
      "databricks/dbrx-instruct................ 7.237 ±0.114\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 2.9541, p=0.0001\n",
      "Kruskal-Wallis: 51.2527, p=0.0000\n",
      "Pearson r=0.4784\n",
      "Kendall τ=0.3353\n",
      "Std.Dev across models: 0.3448\n",
      "\n",
      "Model Scores:\n",
      "gpt-4o-2024-11-20....................... 5.856 ±0.396\n",
      "DeepSeek-R1............................. 5.776 ±0.437\n",
      "claude-3-5-sonnet-20240620.............. 5.754 ±0.382\n",
      "gemini-1.5-pro-001...................... 5.485 ±0.411\n",
      "gemini-1.5-pro-002...................... 5.351 ±0.390\n",
      "Llama-3-70b-chat-hf..................... 5.349 ±0.395\n",
      "Mistral-Large-Instruct-2411............. 5.137 ±0.408\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.122 ±0.435\n",
      "claude-3-haiku-20240307................. 5.120 ±0.439\n",
      "gemma-2b-it............................. 5.112 ±0.399\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.101 ±0.378\n",
      "gemma-7b-it............................. 5.062 ±0.406\n",
      "claude-3-opus-20240229.................. 4.957 ±0.378\n",
      "c4ai-command-r-08-2024.................. 4.952 ±0.434\n",
      "gpt-3.5-turbo-0125...................... 4.819 ±0.402\n",
      "Llama-2-13b-chat-hf..................... 4.775 ±0.391\n",
      "databricks/dbrx-instruct................ 4.628 ±0.377\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "Final Judgemark (raw)   = 0.089\n",
      "Final Judgemark (cal)  = 0.102\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 1.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "8.434\n",
      "CI99 Overlap pct: \n",
      "0.917\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.033\n",
      "Average EMD across all pairs: 0.170\n",
      "Avg. CI95 half-width: 0.144 (modulated: 0.211)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 1.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "24.701\n",
      "CI99 Overlap pct: \n",
      "0.924\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.034\n",
      "Average EMD across all pairs: 0.458\n",
      "Avg. CI95 half-width: 0.415 (modulated: 0.025)\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "k_tau (7.49695652173913, 7.37375, 7.327727272727273, 7.600909090909091, 7.5613636363636365, 7.259130434782609, 7.366521739130435, 7.5125, 7.449545454545454, 7.444761904761905, 7.543913043478261, 7.605714285714286, 7.558695652173913, 7.660434782608696, 7.950434782608696, 7.205, 7.307083333333333) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.5735294117647058\n",
      "k_tau (5.354307000871201, 4.78338200673005, 4.6573036784959765, 5.4964744974917235, 5.33343943539336, 4.791503446433092, 4.788533130312425, 5.273446096159618, 5.057543893481109, 4.906686420812179, 5.181487491605263, 5.531508629397641, 5.445050034559183, 5.851051163102335, 6.59915625600216, 4.24549639085251, 4.51276063915144) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.6029411764705882\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 2.5850, p=0.0005\n",
      "Kruskal-Wallis: 43.1496, p=0.0003\n",
      "Pearson r=0.4548\n",
      "Kendall τ=0.3059\n",
      "Std.Dev across models: 0.1154\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 7.730 ±0.160\n",
      "gpt-4o-2024-11-20....................... 7.723 ±0.149\n",
      "claude-3-5-sonnet-20240620.............. 7.645 ±0.134\n",
      "gemini-1.5-pro-001...................... 7.627 ±0.146\n",
      "Llama-3-70b-chat-hf..................... 7.571 ±0.131\n",
      "claude-3-haiku-20240307................. 7.543 ±0.164\n",
      "gemini-1.5-pro-002...................... 7.532 ±0.128\n",
      "gemma-7b-it............................. 7.528 ±0.139\n",
      "Mixtral-8x22B-Instruct-v0.1............. 7.518 ±0.156\n",
      "Mistral-Large-Instruct-2411............. 7.499 ±0.150\n",
      "gemma-2b-it............................. 7.475 ±0.137\n",
      "claude-3-opus-20240229.................. 7.424 ±0.134\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 7.421 ±0.138\n",
      "c4ai-command-r-08-2024.................. 7.404 ±0.168\n",
      "gpt-3.5-turbo-0125...................... 7.382 ±0.142\n",
      "Llama-2-13b-chat-hf..................... 7.374 ±0.141\n",
      "databricks/dbrx-instruct................ 7.330 ±0.130\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 2.5740, p=0.0006\n",
      "Kruskal-Wallis: 43.1496, p=0.0003\n",
      "Pearson r=0.4371\n",
      "Kendall τ=0.2912\n",
      "Std.Dev across models: 0.3311\n",
      "\n",
      "Model Scores:\n",
      "gpt-4o-2024-11-20....................... 5.916 ±0.419\n",
      "DeepSeek-R1............................. 5.833 ±0.447\n",
      "claude-3-5-sonnet-20240620.............. 5.665 ±0.402\n",
      "gemini-1.5-pro-001...................... 5.520 ±0.433\n",
      "Llama-3-70b-chat-hf..................... 5.401 ±0.391\n",
      "gemini-1.5-pro-002...................... 5.343 ±0.398\n",
      "claude-3-haiku-20240307................. 5.309 ±0.451\n",
      "gemma-7b-it............................. 5.289 ±0.408\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.235 ±0.449\n",
      "Mistral-Large-Instruct-2411............. 5.212 ±0.414\n",
      "gemma-2b-it............................. 5.114 ±0.408\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.086 ±0.386\n",
      "claude-3-opus-20240229.................. 5.003 ±0.393\n",
      "c4ai-command-r-08-2024.................. 4.988 ±0.441\n",
      "gpt-3.5-turbo-0125...................... 4.867 ±0.415\n",
      "Llama-2-13b-chat-hf..................... 4.818 ±0.407\n",
      "databricks/dbrx-instruct................ 4.713 ±0.391\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 1.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "8.434\n",
      "CI99 Overlap pct: \n",
      "0.917\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.033\n",
      "Average EMD across all pairs: 0.170\n",
      "Avg. CI95 half-width: 0.144 (modulated: 0.211)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 1.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "24.701\n",
      "CI99 Overlap pct: \n",
      "0.924\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.034\n",
      "Average EMD across all pairs: 0.458\n",
      "Avg. CI95 half-width: 0.415 (modulated: 0.025)\n",
      "\n",
      "Score stability (RAW)\n",
      "Randomized average Kendall's tau (raw): 0.165\n",
      "Score stability (CALIBRATED)\n",
      "Randomized average Kendall's tau (calibrated): 0.158 \n",
      "(0.29117647058823526)\n",
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 2.5850, p=0.0005\n",
      "Kruskal-Wallis: 43.1496, p=0.0003\n",
      "Pearson r=0.4548\n",
      "Kendall τ=0.3059\n",
      "Std.Dev across models: 0.1154\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 7.730 ±0.160\n",
      "gpt-4o-2024-11-20....................... 7.723 ±0.149\n",
      "claude-3-5-sonnet-20240620.............. 7.645 ±0.134\n",
      "gemini-1.5-pro-001...................... 7.627 ±0.146\n",
      "Llama-3-70b-chat-hf..................... 7.571 ±0.131\n",
      "claude-3-haiku-20240307................. 7.543 ±0.164\n",
      "gemini-1.5-pro-002...................... 7.532 ±0.128\n",
      "gemma-7b-it............................. 7.528 ±0.139\n",
      "Mixtral-8x22B-Instruct-v0.1............. 7.518 ±0.156\n",
      "Mistral-Large-Instruct-2411............. 7.499 ±0.150\n",
      "gemma-2b-it............................. 7.475 ±0.137\n",
      "claude-3-opus-20240229.................. 7.424 ±0.134\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 7.421 ±0.138\n",
      "c4ai-command-r-08-2024.................. 7.404 ±0.168\n",
      "gpt-3.5-turbo-0125...................... 7.382 ±0.142\n",
      "Llama-2-13b-chat-hf..................... 7.374 ±0.141\n",
      "databricks/dbrx-instruct................ 7.330 ±0.130\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 2.5740, p=0.0006\n",
      "Kruskal-Wallis: 43.1496, p=0.0003\n",
      "Pearson r=0.4371\n",
      "Kendall τ=0.2912\n",
      "Std.Dev across models: 0.3311\n",
      "\n",
      "Model Scores:\n",
      "gpt-4o-2024-11-20....................... 5.916 ±0.419\n",
      "DeepSeek-R1............................. 5.833 ±0.447\n",
      "claude-3-5-sonnet-20240620.............. 5.665 ±0.402\n",
      "gemini-1.5-pro-001...................... 5.520 ±0.433\n",
      "Llama-3-70b-chat-hf..................... 5.401 ±0.391\n",
      "gemini-1.5-pro-002...................... 5.343 ±0.398\n",
      "claude-3-haiku-20240307................. 5.309 ±0.451\n",
      "gemma-7b-it............................. 5.289 ±0.408\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.235 ±0.449\n",
      "Mistral-Large-Instruct-2411............. 5.212 ±0.414\n",
      "gemma-2b-it............................. 5.114 ±0.408\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.086 ±0.386\n",
      "claude-3-opus-20240229.................. 5.003 ±0.393\n",
      "c4ai-command-r-08-2024.................. 4.988 ±0.441\n",
      "gpt-3.5-turbo-0125...................... 4.867 ±0.415\n",
      "Llama-2-13b-chat-hf..................... 4.818 ±0.407\n",
      "databricks/dbrx-instruct................ 4.713 ±0.391\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "Final Judgemark (raw)   = 0.088\n",
      "Final Judgemark (cal)  = 0.089\n",
      "\n",
      "/media/wassname/SGIronWolf/projects5/2025/judge/Judgemark-v2/judgemark_v2lp/scoring.py:159: ConstantInputWarning: An input array is constant; the correlation coefficient is not defined.\n",
      "  corr, _ = scipy.stats.pearsonr(lengths, scores)\n",
      "/media/wassname/SGIronWolf/projects5/2025/judge/Judgemark-v2/.venv/lib/python3.10/site-packages/scipy/stats/_axis_nan_policy.py:586: ConstantInputWarning: Each of the input arrays is constant; the F statistic is not defined or infinite\n",
      "  res = hypotest_fun_out(*samples, **kwds)\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 0.672\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 3.514\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "k_tau (8.58, 9.48, 8.4, 8.0, 8.09, 9.08, 8.6, 9.68, 9.45, 8.85, 8.04, 9.42, 9.62, 8.44, 8.42, 9.23, 8.87) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) -0.2941176470588235\n",
      "k_tau (3.7441860465116292, 7.692307692307699, 2.8571428571428603, 0.0, 0.642857142857142, 5.80701754385965, 3.8372093023255807, 10.0, 7.346153846153839, 5.0, 0.28571428571427965, 7.0, 9.307692307692301, 3.0930232558139514, 3.0, 6.333333333333336, 5.07017543859649) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) -0.2941176470588235\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=-0.4014\n",
      "Kendall τ=-0.2941\n",
      "Std.Dev across models: 0.5534\n",
      "\n",
      "Model Scores:\n",
      "gemma-7b-it............................. 9.680 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 9.620 ±0.000\n",
      "claude-3-haiku-20240307................. 9.480 ±0.000\n",
      "gemma-2b-it............................. 9.450 ±0.000\n",
      "gemini-1.5-pro-002...................... 9.420 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 9.230 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 9.080 ±0.000\n",
      "databricks/dbrx-instruct................ 8.870 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 8.850 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 8.600 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 8.580 ±0.000\n",
      "gpt-4o-2024-11-20....................... 8.440 ±0.000\n",
      "DeepSeek-R1............................. 8.420 ±0.000\n",
      "claude-3-opus-20240229.................. 8.400 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 8.090 ±0.000\n",
      "c4ai-command-r-08-2024.................. 8.040 ±0.000\n",
      "gemini-1.5-pro-001...................... 8.000 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=-0.3828\n",
      "Kendall τ=-0.2941\n",
      "Std.Dev across models: 2.9021\n",
      "\n",
      "Model Scores:\n",
      "gemma-7b-it............................. 10.000 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 9.308 ±0.000\n",
      "claude-3-haiku-20240307................. 7.692 ±0.000\n",
      "gemma-2b-it............................. 7.346 ±0.000\n",
      "gemini-1.5-pro-002...................... 7.000 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 6.333 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.807 ±0.000\n",
      "databricks/dbrx-instruct................ 5.070 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.000 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 3.837 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 3.744 ±0.000\n",
      "gpt-4o-2024-11-20....................... 3.093 ±0.000\n",
      "DeepSeek-R1............................. 3.000 ±0.000\n",
      "claude-3-opus-20240229.................. 2.857 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 0.643 ±0.000\n",
      "c4ai-command-r-08-2024.................. 0.286 ±0.000\n",
      "gemini-1.5-pro-001...................... 0.000 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 0.672\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 3.514\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n",
      "Score stability (RAW)\n",
      "Randomized average Kendall's tau (raw): 0.164\n",
      "Score stability (CALIBRATED)\n",
      "Randomized average Kendall's tau (calibrated): 1.000 \n",
      "(-0.2941176470588235)\n",
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=-0.4014\n",
      "Kendall τ=-0.2941\n",
      "Std.Dev across models: 0.5534\n",
      "\n",
      "Model Scores:\n",
      "gemma-7b-it............................. 9.680 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 9.620 ±0.000\n",
      "claude-3-haiku-20240307................. 9.480 ±0.000\n",
      "gemma-2b-it............................. 9.450 ±0.000\n",
      "gemini-1.5-pro-002...................... 9.420 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 9.230 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 9.080 ±0.000\n",
      "databricks/dbrx-instruct................ 8.870 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 8.850 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 8.600 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 8.580 ±0.000\n",
      "gpt-4o-2024-11-20....................... 8.440 ±0.000\n",
      "DeepSeek-R1............................. 8.420 ±0.000\n",
      "claude-3-opus-20240229.................. 8.400 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 8.090 ±0.000\n",
      "c4ai-command-r-08-2024.................. 8.040 ±0.000\n",
      "gemini-1.5-pro-001...................... 8.000 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=-0.3828\n",
      "Kendall τ=-0.2941\n",
      "Std.Dev across models: 2.9021\n",
      "\n",
      "Model Scores:\n",
      "gemma-7b-it............................. 10.000 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 9.308 ±0.000\n",
      "claude-3-haiku-20240307................. 7.692 ±0.000\n",
      "gemma-2b-it............................. 7.346 ±0.000\n",
      "gemini-1.5-pro-002...................... 7.000 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 6.333 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.807 ±0.000\n",
      "databricks/dbrx-instruct................ 5.070 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.000 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 3.837 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 3.744 ±0.000\n",
      "gpt-4o-2024-11-20....................... 3.093 ±0.000\n",
      "DeepSeek-R1............................. 3.000 ±0.000\n",
      "claude-3-opus-20240229.................. 2.857 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 0.643 ±0.000\n",
      "c4ai-command-r-08-2024.................. 0.286 ±0.000\n",
      "gemini-1.5-pro-001...................... 0.000 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "Final Judgemark (raw)   = 0.283\n",
      "Final Judgemark (cal)  = 0.709\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 1.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "14.315\n",
      "CI99 Overlap pct: \n",
      "0.920\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.035\n",
      "Average EMD across all pairs: 0.253\n",
      "Avg. CI95 half-width: 0.240 (modulated: 0.157)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 1.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "25.512\n",
      "CI99 Overlap pct: \n",
      "0.919\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.037\n",
      "Average EMD across all pairs: 0.445\n",
      "Avg. CI95 half-width: 0.427 (modulated: 0.018)\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "k_tau (6.010869565217392, 5.303333333333334, 6.055909090909091, 6.029090909090909, 5.775, 5.939565217391304, 5.89695652173913, 6.005833333333333, 5.462727272727273, 5.847142857142857, 6.115652173913044, 6.19952380952381, 6.196521739130435, 6.1408695652173915, 6.793478260869565, 5.9790909090909095, 5.6025) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.5147058823529411\n",
      "k_tau (5.050190145517103, 3.8384175436659156, 5.079622660935998, 5.081691035311112, 4.586936036289148, 4.869566096661701, 4.873835561898968, 5.109187984064588, 4.098156252592856, 4.760467081509795, 5.226464812683731, 5.395083882983372, 5.416759752043439, 5.355703532412083, 6.522080228169213, 4.991275010498031, 4.282586662905359) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.45588235294117646\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 2.1939, p=0.0041\n",
      "Kruskal-Wallis: 36.7729, p=0.0023\n",
      "Pearson r=0.4641\n",
      "Kendall τ=0.3324\n",
      "Std.Dev across models: 0.1761\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 6.346 ±0.206\n",
      "claude-3-5-sonnet-20240620.............. 6.223 ±0.209\n",
      "gpt-4o-2024-11-20....................... 6.195 ±0.226\n",
      "Llama-3-70b-chat-hf..................... 6.067 ±0.237\n",
      "gemini-1.5-pro-002...................... 6.040 ±0.232\n",
      "gemini-1.5-pro-001...................... 6.026 ±0.260\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.945 ±0.253\n",
      "Mistral-Large-Instruct-2411............. 5.921 ±0.241\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.912 ±0.234\n",
      "claude-3-opus-20240229.................. 5.912 ±0.249\n",
      "gpt-3.5-turbo-0125...................... 5.907 ±0.250\n",
      "gemma-2b-it............................. 5.899 ±0.239\n",
      "c4ai-command-r-08-2024.................. 5.872 ±0.248\n",
      "gemma-7b-it............................. 5.842 ±0.244\n",
      "Llama-2-13b-chat-hf..................... 5.756 ±0.239\n",
      "claude-3-haiku-20240307................. 5.751 ±0.263\n",
      "databricks/dbrx-instruct................ 5.626 ±0.243\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 2.3353, p=0.0020\n",
      "Kruskal-Wallis: 36.7729, p=0.0023\n",
      "Pearson r=0.4830\n",
      "Kendall τ=0.3382\n",
      "Std.Dev across models: 0.3237\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 5.700 ±0.373\n",
      "claude-3-5-sonnet-20240620.............. 5.452 ±0.381\n",
      "gpt-4o-2024-11-20....................... 5.441 ±0.406\n",
      "gemini-1.5-pro-002...................... 5.128 ±0.416\n",
      "Llama-3-70b-chat-hf..................... 5.128 ±0.428\n",
      "gemini-1.5-pro-001...................... 5.068 ±0.457\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.965 ±0.447\n",
      "Mistral-Large-Instruct-2411............. 4.914 ±0.429\n",
      "claude-3-opus-20240229.................. 4.910 ±0.437\n",
      "gpt-3.5-turbo-0125...................... 4.900 ±0.442\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 4.882 ±0.420\n",
      "gemma-2b-it............................. 4.848 ±0.426\n",
      "c4ai-command-r-08-2024.................. 4.808 ±0.442\n",
      "gemma-7b-it............................. 4.797 ±0.434\n",
      "claude-3-haiku-20240307................. 4.610 ±0.462\n",
      "Llama-2-13b-chat-hf..................... 4.605 ±0.426\n",
      "databricks/dbrx-instruct................ 4.352 ±0.436\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 1.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "14.315\n",
      "CI99 Overlap pct: \n",
      "0.920\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.035\n",
      "Average EMD across all pairs: 0.253\n",
      "Avg. CI95 half-width: 0.240 (modulated: 0.157)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 1.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "25.512\n",
      "CI99 Overlap pct: \n",
      "0.919\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.037\n",
      "Average EMD across all pairs: 0.445\n",
      "Avg. CI95 half-width: 0.427 (modulated: 0.018)\n",
      "\n",
      "Score stability (RAW)\n",
      "Randomized average Kendall's tau (raw): 0.165\n",
      "Score stability (CALIBRATED)\n",
      "Randomized average Kendall's tau (calibrated): 0.195 \n",
      "(0.338235294117647)\n",
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 2.1939, p=0.0041\n",
      "Kruskal-Wallis: 36.7729, p=0.0023\n",
      "Pearson r=0.4641\n",
      "Kendall τ=0.3324\n",
      "Std.Dev across models: 0.1761\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 6.346 ±0.206\n",
      "claude-3-5-sonnet-20240620.............. 6.223 ±0.209\n",
      "gpt-4o-2024-11-20....................... 6.195 ±0.226\n",
      "Llama-3-70b-chat-hf..................... 6.067 ±0.237\n",
      "gemini-1.5-pro-002...................... 6.040 ±0.232\n",
      "gemini-1.5-pro-001...................... 6.026 ±0.260\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.945 ±0.253\n",
      "Mistral-Large-Instruct-2411............. 5.921 ±0.241\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.912 ±0.234\n",
      "claude-3-opus-20240229.................. 5.912 ±0.249\n",
      "gpt-3.5-turbo-0125...................... 5.907 ±0.250\n",
      "gemma-2b-it............................. 5.899 ±0.239\n",
      "c4ai-command-r-08-2024.................. 5.872 ±0.248\n",
      "gemma-7b-it............................. 5.842 ±0.244\n",
      "Llama-2-13b-chat-hf..................... 5.756 ±0.239\n",
      "claude-3-haiku-20240307................. 5.751 ±0.263\n",
      "databricks/dbrx-instruct................ 5.626 ±0.243\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 2.3353, p=0.0020\n",
      "Kruskal-Wallis: 36.7729, p=0.0023\n",
      "Pearson r=0.4830\n",
      "Kendall τ=0.3382\n",
      "Std.Dev across models: 0.3237\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 5.700 ±0.373\n",
      "claude-3-5-sonnet-20240620.............. 5.452 ±0.381\n",
      "gpt-4o-2024-11-20....................... 5.441 ±0.406\n",
      "gemini-1.5-pro-002...................... 5.128 ±0.416\n",
      "Llama-3-70b-chat-hf..................... 5.128 ±0.428\n",
      "gemini-1.5-pro-001...................... 5.068 ±0.457\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.965 ±0.447\n",
      "Mistral-Large-Instruct-2411............. 4.914 ±0.429\n",
      "claude-3-opus-20240229.................. 4.910 ±0.437\n",
      "gpt-3.5-turbo-0125...................... 4.900 ±0.442\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 4.882 ±0.420\n",
      "gemma-2b-it............................. 4.848 ±0.426\n",
      "c4ai-command-r-08-2024.................. 4.808 ±0.442\n",
      "gemma-7b-it............................. 4.797 ±0.434\n",
      "claude-3-haiku-20240307................. 4.610 ±0.462\n",
      "Llama-2-13b-chat-hf..................... 4.605 ±0.426\n",
      "databricks/dbrx-instruct................ 4.352 ±0.436\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "Final Judgemark (raw)   = 0.094\n",
      "Final Judgemark (cal)  = 0.099\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 0.672\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 3.514\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "k_tau (8.58, 9.48, 8.4, 8.0, 8.09, 9.08, 8.6, 9.68, 9.45, 8.85, 8.04, 9.42, 9.62, 8.44, 8.42, 9.23, 8.87) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) -0.2941176470588235\n",
      "k_tau (3.7441860465116292, 7.692307692307699, 2.8571428571428603, 0.0, 0.642857142857142, 5.80701754385965, 3.8372093023255807, 10.0, 7.346153846153839, 5.0, 0.28571428571427965, 7.0, 9.307692307692301, 3.0930232558139514, 3.0, 6.333333333333336, 5.07017543859649) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) -0.2941176470588235\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=-0.4014\n",
      "Kendall τ=-0.2941\n",
      "Std.Dev across models: 0.5534\n",
      "\n",
      "Model Scores:\n",
      "gemma-7b-it............................. 9.680 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 9.620 ±0.000\n",
      "claude-3-haiku-20240307................. 9.480 ±0.000\n",
      "gemma-2b-it............................. 9.450 ±0.000\n",
      "gemini-1.5-pro-002...................... 9.420 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 9.230 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 9.080 ±0.000\n",
      "databricks/dbrx-instruct................ 8.870 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 8.850 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 8.600 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 8.580 ±0.000\n",
      "gpt-4o-2024-11-20....................... 8.440 ±0.000\n",
      "DeepSeek-R1............................. 8.420 ±0.000\n",
      "claude-3-opus-20240229.................. 8.400 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 8.090 ±0.000\n",
      "c4ai-command-r-08-2024.................. 8.040 ±0.000\n",
      "gemini-1.5-pro-001...................... 8.000 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=-0.3828\n",
      "Kendall τ=-0.2941\n",
      "Std.Dev across models: 2.9021\n",
      "\n",
      "Model Scores:\n",
      "gemma-7b-it............................. 10.000 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 9.308 ±0.000\n",
      "claude-3-haiku-20240307................. 7.692 ±0.000\n",
      "gemma-2b-it............................. 7.346 ±0.000\n",
      "gemini-1.5-pro-002...................... 7.000 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 6.333 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.807 ±0.000\n",
      "databricks/dbrx-instruct................ 5.070 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.000 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 3.837 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 3.744 ±0.000\n",
      "gpt-4o-2024-11-20....................... 3.093 ±0.000\n",
      "DeepSeek-R1............................. 3.000 ±0.000\n",
      "claude-3-opus-20240229.................. 2.857 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 0.643 ±0.000\n",
      "c4ai-command-r-08-2024.................. 0.286 ±0.000\n",
      "gemini-1.5-pro-001...................... 0.000 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 0.672\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 3.514\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n",
      "Score stability (RAW)\n",
      "Randomized average Kendall's tau (raw): 0.165\n",
      "Score stability (CALIBRATED)\n",
      "Randomized average Kendall's tau (calibrated): 1.000 \n",
      "(-0.2941176470588235)\n",
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=-0.4014\n",
      "Kendall τ=-0.2941\n",
      "Std.Dev across models: 0.5534\n",
      "\n",
      "Model Scores:\n",
      "gemma-7b-it............................. 9.680 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 9.620 ±0.000\n",
      "claude-3-haiku-20240307................. 9.480 ±0.000\n",
      "gemma-2b-it............................. 9.450 ±0.000\n",
      "gemini-1.5-pro-002...................... 9.420 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 9.230 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 9.080 ±0.000\n",
      "databricks/dbrx-instruct................ 8.870 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 8.850 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 8.600 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 8.580 ±0.000\n",
      "gpt-4o-2024-11-20....................... 8.440 ±0.000\n",
      "DeepSeek-R1............................. 8.420 ±0.000\n",
      "claude-3-opus-20240229.................. 8.400 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 8.090 ±0.000\n",
      "c4ai-command-r-08-2024.................. 8.040 ±0.000\n",
      "gemini-1.5-pro-001...................... 8.000 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=-0.3828\n",
      "Kendall τ=-0.2941\n",
      "Std.Dev across models: 2.9021\n",
      "\n",
      "Model Scores:\n",
      "gemma-7b-it............................. 10.000 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 9.308 ±0.000\n",
      "claude-3-haiku-20240307................. 7.692 ±0.000\n",
      "gemma-2b-it............................. 7.346 ±0.000\n",
      "gemini-1.5-pro-002...................... 7.000 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 6.333 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.807 ±0.000\n",
      "databricks/dbrx-instruct................ 5.070 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.000 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 3.837 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 3.744 ±0.000\n",
      "gpt-4o-2024-11-20....................... 3.093 ±0.000\n",
      "DeepSeek-R1............................. 3.000 ±0.000\n",
      "claude-3-opus-20240229.................. 2.857 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 0.643 ±0.000\n",
      "c4ai-command-r-08-2024.................. 0.286 ±0.000\n",
      "gemini-1.5-pro-001...................... 0.000 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "Final Judgemark (raw)   = 0.283\n",
      "Final Judgemark (cal)  = 0.709\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 1.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "5.877\n",
      "CI99 Overlap pct: \n",
      "0.896\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.012\n",
      "Average EMD across all pairs: 0.135\n",
      "Avg. CI95 half-width: 0.106 (modulated: 0.087)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 1.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "23.356\n",
      "CI99 Overlap pct: \n",
      "0.936\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.023\n",
      "Average EMD across all pairs: 0.468\n",
      "Avg. CI95 half-width: 0.395 (modulated: 0.027)\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "k_tau (4.7669565217391305, 4.73375, 4.8545454545454545, 4.856363636363636, 4.885454545454546, 5.011304347826087, 5.017826086956521, 4.8575, 4.884090909090909, 5.019047619047619, 4.932608695652174, 4.835238095238095, 4.980869565217391, 4.84, 4.968260869565217, 4.793181818181818, 4.810833333333333) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) -0.10294117647058822\n",
      "k_tau (4.728394744625539, 4.316466016231055, 4.774279855482863, 4.9538139179116625, 4.977688539530645, 5.11049206868102, 5.604746337894426, 5.078467746747822, 4.576428896917618, 5.1108093362658344, 5.236884135805353, 5.004356128416279, 5.55380454241193, 4.964332414888151, 5.54026603776522, 4.79348045438271, 4.168156253729562) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.1176470588235294\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 0.3384, p=0.9932\n",
      "Kruskal-Wallis: 14.4745, p=0.5634\n",
      "Pearson r=-0.0647\n",
      "Kendall τ=-0.0485\n",
      "Std.Dev across models: 0.0312\n",
      "\n",
      "Model Scores:\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 4.906 ±0.125\n",
      "Llama-3-70b-chat-hf..................... 4.895 ±0.090\n",
      "claude-3-opus-20240229.................. 4.886 ±0.117\n",
      "DeepSeek-R1............................. 4.885 ±0.088\n",
      "databricks/dbrx-instruct................ 4.876 ±0.161\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.873 ±0.126\n",
      "Llama-2-13b-chat-hf..................... 4.871 ±0.120\n",
      "gpt-3.5-turbo-0125...................... 4.857 ±0.115\n",
      "claude-3-haiku-20240307................. 4.855 ±0.111\n",
      "gemma-7b-it............................. 4.844 ±0.081\n",
      "c4ai-command-r-08-2024.................. 4.832 ±0.125\n",
      "Mistral-Large-Instruct-2411............. 4.832 ±0.080\n",
      "gemini-1.5-pro-001...................... 4.827 ±0.093\n",
      "gemma-2b-it............................. 4.823 ±0.111\n",
      "gpt-4o-2024-11-20....................... 4.822 ±0.082\n",
      "claude-3-5-sonnet-20240620.............. 4.805 ±0.087\n",
      "gemini-1.5-pro-002...................... 4.801 ±0.086\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 0.9463, p=0.5147\n",
      "Kruskal-Wallis: 14.4745, p=0.5634\n",
      "Pearson r=0.2421\n",
      "Kendall τ=0.1294\n",
      "Std.Dev across models: 0.1922\n",
      "\n",
      "Model Scores:\n",
      "Llama-3-70b-chat-hf..................... 5.245 ±0.336\n",
      "DeepSeek-R1............................. 5.241 ±0.347\n",
      "gemma-7b-it............................. 5.093 ±0.333\n",
      "gpt-4o-2024-11-20....................... 5.038 ±0.345\n",
      "Mistral-Large-Instruct-2411............. 5.019 ±0.333\n",
      "claude-3-opus-20240229.................. 4.927 ±0.416\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 4.921 ±0.441\n",
      "gpt-3.5-turbo-0125...................... 4.912 ±0.412\n",
      "gemini-1.5-pro-002...................... 4.890 ±0.365\n",
      "claude-3-5-sonnet-20240620.............. 4.888 ±0.332\n",
      "gemini-1.5-pro-001...................... 4.877 ±0.376\n",
      "Llama-2-13b-chat-hf..................... 4.869 ±0.440\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.845 ±0.418\n",
      "claude-3-haiku-20240307................. 4.811 ±0.399\n",
      "gemma-2b-it............................. 4.689 ±0.409\n",
      "c4ai-command-r-08-2024.................. 4.630 ±0.474\n",
      "databricks/dbrx-instruct................ 4.462 ±0.531\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 1.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "5.877\n",
      "CI99 Overlap pct: \n",
      "0.896\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.012\n",
      "Average EMD across all pairs: 0.135\n",
      "Avg. CI95 half-width: 0.106 (modulated: 0.087)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 1.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "23.356\n",
      "CI99 Overlap pct: \n",
      "0.936\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.023\n",
      "Average EMD across all pairs: 0.468\n",
      "Avg. CI95 half-width: 0.395 (modulated: 0.027)\n",
      "\n",
      "Score stability (RAW)\n",
      "Randomized average Kendall's tau (raw): 0.164\n",
      "Score stability (CALIBRATED)\n",
      "Randomized average Kendall's tau (calibrated): 0.084 \n",
      "(0.12941176470588234)\n",
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 0.3384, p=0.9932\n",
      "Kruskal-Wallis: 14.4745, p=0.5634\n",
      "Pearson r=-0.0647\n",
      "Kendall τ=-0.0485\n",
      "Std.Dev across models: 0.0312\n",
      "\n",
      "Model Scores:\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 4.906 ±0.125\n",
      "Llama-3-70b-chat-hf..................... 4.895 ±0.090\n",
      "claude-3-opus-20240229.................. 4.886 ±0.117\n",
      "DeepSeek-R1............................. 4.885 ±0.088\n",
      "databricks/dbrx-instruct................ 4.876 ±0.161\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.873 ±0.126\n",
      "Llama-2-13b-chat-hf..................... 4.871 ±0.120\n",
      "gpt-3.5-turbo-0125...................... 4.857 ±0.115\n",
      "claude-3-haiku-20240307................. 4.855 ±0.111\n",
      "gemma-7b-it............................. 4.844 ±0.081\n",
      "c4ai-command-r-08-2024.................. 4.832 ±0.125\n",
      "Mistral-Large-Instruct-2411............. 4.832 ±0.080\n",
      "gemini-1.5-pro-001...................... 4.827 ±0.093\n",
      "gemma-2b-it............................. 4.823 ±0.111\n",
      "gpt-4o-2024-11-20....................... 4.822 ±0.082\n",
      "claude-3-5-sonnet-20240620.............. 4.805 ±0.087\n",
      "gemini-1.5-pro-002...................... 4.801 ±0.086\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 0.9463, p=0.5147\n",
      "Kruskal-Wallis: 14.4745, p=0.5634\n",
      "Pearson r=0.2421\n",
      "Kendall τ=0.1294\n",
      "Std.Dev across models: 0.1922\n",
      "\n",
      "Model Scores:\n",
      "Llama-3-70b-chat-hf..................... 5.245 ±0.336\n",
      "DeepSeek-R1............................. 5.241 ±0.347\n",
      "gemma-7b-it............................. 5.093 ±0.333\n",
      "gpt-4o-2024-11-20....................... 5.038 ±0.345\n",
      "Mistral-Large-Instruct-2411............. 5.019 ±0.333\n",
      "claude-3-opus-20240229.................. 4.927 ±0.416\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 4.921 ±0.441\n",
      "gpt-3.5-turbo-0125...................... 4.912 ±0.412\n",
      "gemini-1.5-pro-002...................... 4.890 ±0.365\n",
      "claude-3-5-sonnet-20240620.............. 4.888 ±0.332\n",
      "gemini-1.5-pro-001...................... 4.877 ±0.376\n",
      "Llama-2-13b-chat-hf..................... 4.869 ±0.440\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.845 ±0.418\n",
      "claude-3-haiku-20240307................. 4.811 ±0.399\n",
      "gemma-2b-it............................. 4.689 ±0.409\n",
      "c4ai-command-r-08-2024.................. 4.630 ±0.474\n",
      "databricks/dbrx-instruct................ 4.462 ±0.531\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "Final Judgemark (raw)   = 0.028\n",
      "Final Judgemark (cal)  = 0.046\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saving results to ../outputs/my_judgemark_runs_ab20b598-845b-4da7-9f4c-56ec05405e28__meta-llama_llama-3_2-3b-instruct.parquet\n",
      "| name          | judgemark_score | judgemark_score_calib | stability | stability_calib | separability | separability_calib | human_correlation | human_correlation_calib |\n",
      "|---------------|-----------------|-----------------------|-----------|-----------------|--------------|--------------------|-------------------|-------------------------|\n",
      "| weighted      | 0.089           | 0.102                 | 0.0       | 0.0             | 0.077        | 0.087              | 0.225             | 0.261                   |\n",
      "| raw           | 0.088           | 0.089                 | 0.0       | 0.0             | 0.074        | 0.081              | 0.229             | 0.212                   |\n",
      "| ranked        | 0.283           | 0.709                 | 0.0       | 1.0             | 0.425        | 0.813              | 0.0               | 0.0                     |\n",
      "| ranked_norm   | 0.094           | 0.099                 | 0.0       | 0.0             | 0.077        | 0.082              | 0.258             | 0.265                   |\n",
      "| ranked_scaled | 0.283           | 0.709                 | 0.0       | 1.0             | 0.425        | 0.813              | 0.0               | 0.0                     |\n",
      "| weighted_norm | 0.028           | 0.046                 | 0.0       | 0.0             | 0.042        | 0.061              | 0.0               | 0.033                   |\n",
      "| name          | judgemark_score | judgemark_score_calib | stability | stability_calib | separability | separability_calib | human_correlation | human_correlation_calib |\n",
      "|---------------|-----------------|-----------------------|-----------|-----------------|--------------|--------------------|-------------------|-------------------------|\n",
      "| weighted      | 0.089           | 0.102                 | 0.0       | 0.0             | 0.077        | 0.087              | 0.225             | 0.261                   |\n",
      "| raw           | 0.088           | 0.089                 | 0.0       | 0.0             | 0.074        | 0.081              | 0.229             | 0.212                   |\n",
      "| ranked        | 0.283           | 0.709                 | 0.0       | 1.0             | 0.425        | 0.813              | 0.0               | 0.0                     |\n",
      "| ranked_norm   | 0.094           | 0.099                 | 0.0       | 0.0             | 0.077        | 0.082              | 0.258             | 0.265                   |\n",
      "| ranked_scaled | 0.283           | 0.709                 | 0.0       | 1.0             | 0.425        | 0.813              | 0.0               | 0.0                     |\n",
      "| weighted_norm | 0.028           | 0.046                 | 0.0       | 0.0             | 0.042        | 0.061              | 0.0               | 0.033                   |\n",
      "\n",
      "Run ID: ab20b598-845b-4da7-9f4c-56ec05405e28__meta-llama_llama-3_2-3b-instruct\n",
      "\n",
      "\n",
      "Results saved to ../outputs/my_judgemark_runs_ab20b598-845b-4da7-9f4c-56ec05405e28__meta-llama_llama-3_2-3b-instruct.parquet and ../outputs/my_judgemark_runs.md\n",
      "\n",
      "\n",
      "Processing ../outputs/my_judgemark_runs_20250728_184232.json...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "733428a3ddfd4611864bef9464c2f9ef",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Recomputing scores with choice norm:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Processing run db38b659-de7d-4bda-a749-86f9bea79dcf__qwen_qwen3-235b-a22b...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Error processing run db38b659-de7d-4bda-a749-86f9bea79dcf__qwen_qwen3-235b-a22b: 'logp'\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "Processing ../outputs/my_judgemark_runs_20250728_184300.json...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4b9147f2e3414a0c90530cede3381000",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Recomputing scores with choice norm:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Processing run 3c750455-b8f0-485b-9ed8-9dd8b689e219__deepseek_deepseek-r1...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.688\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "4.773\n",
      "CI99 Overlap pct: \n",
      "0.424\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.318\n",
      "Average EMD across all pairs: 1.334\n",
      "Avg. CI95 half-width: 0.140 (modulated: 0.833)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.750\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "8.078\n",
      "CI99 Overlap pct: \n",
      "0.431\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.310\n",
      "Average EMD across all pairs: 2.183\n",
      "Avg. CI95 half-width: 0.224 (modulated: 0.604)\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "k_tau (7.0808333333333335, 5.87625, 6.635, 7.180833333333333, 5.866666666666667, 5.3025, 4.8875, 4.43, 3.9030434782608694, 5.773333333333333, 5.466666666666667, 7.305, 5.864166666666667, 7.527083333333334, 8.100416666666666, 4.836666666666667, 5.0529166666666665) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.8970588235294118\n",
      "k_tau (6.951754223152208, 4.970476620729527, 6.273177331840759, 7.17187232376618, 4.928121138576916, 3.937711758999844, 3.2270374377404214, 2.649601392780112, 1.9500847247478919, 4.756880110894291, 4.213826212394102, 7.328561990750936, 4.963653738500655, 7.596515405756703, 8.370669089040593, 3.161911209225535, 3.5256337830476054) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.9117647058823529\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 246.9409, p=0.0000\n",
      "Kruskal-Wallis: 1340.4629, p=0.0000\n",
      "Pearson r=0.9651\n",
      "Kendall τ=0.9118\n",
      "Std.Dev across models: 1.0962\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.009 ±0.130\n",
      "gpt-4o-2024-11-20....................... 7.515 ±0.128\n",
      "gemini-1.5-pro-002...................... 7.168 ±0.133\n",
      "claude-3-5-sonnet-20240620.............. 7.120 ±0.129\n",
      "gemini-1.5-pro-001...................... 7.025 ±0.113\n",
      "claude-3-opus-20240229.................. 6.502 ±0.156\n",
      "Llama-3-70b-chat-hf..................... 5.970 ±0.135\n",
      "claude-3-haiku-20240307................. 5.818 ±0.141\n",
      "Mistral-Large-Instruct-2411............. 5.813 ±0.193\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.626 ±0.155\n",
      "c4ai-command-r-08-2024.................. 5.458 ±0.124\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.313 ±0.136\n",
      "databricks/dbrx-instruct................ 5.067 ±0.166\n",
      "gpt-3.5-turbo-0125...................... 4.871 ±0.110\n",
      "Llama-2-13b-chat-hf..................... 4.790 ±0.132\n",
      "gemma-7b-it............................. 4.585 ±0.141\n",
      "gemma-2b-it............................. 4.081 ±0.150\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 251.5504, p=0.0000\n",
      "Kruskal-Wallis: 1340.4629, p=0.0000\n",
      "Pearson r=0.9610\n",
      "Kendall τ=0.9118\n",
      "Std.Dev across models: 1.7917\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.236 ±0.185\n",
      "gpt-4o-2024-11-20....................... 7.595 ±0.188\n",
      "gemini-1.5-pro-002...................... 7.098 ±0.206\n",
      "claude-3-5-sonnet-20240620.............. 7.035 ±0.197\n",
      "gemini-1.5-pro-001...................... 6.907 ±0.174\n",
      "claude-3-opus-20240229.................. 6.018 ±0.259\n",
      "Llama-3-70b-chat-hf..................... 5.122 ±0.236\n",
      "Mistral-Large-Instruct-2411............. 4.856 ±0.329\n",
      "claude-3-haiku-20240307................. 4.846 ±0.252\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.510 ±0.277\n",
      "c4ai-command-r-08-2024.................. 4.182 ±0.227\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 3.961 ±0.239\n",
      "databricks/dbrx-instruct................ 3.562 ±0.264\n",
      "gpt-3.5-turbo-0125...................... 3.180 ±0.174\n",
      "Llama-2-13b-chat-hf..................... 3.091 ±0.204\n",
      "gemma-7b-it............................. 2.792 ±0.206\n",
      "gemma-2b-it............................. 2.136 ±0.200\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.688\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "4.773\n",
      "CI99 Overlap pct: \n",
      "0.424\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.318\n",
      "Average EMD across all pairs: 1.334\n",
      "Avg. CI95 half-width: 0.140 (modulated: 0.833)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.750\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "8.078\n",
      "CI99 Overlap pct: \n",
      "0.431\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.310\n",
      "Average EMD across all pairs: 2.183\n",
      "Avg. CI95 half-width: 0.224 (modulated: 0.604)\n",
      "\n",
      "Score stability (RAW)\n",
      "Randomized average Kendall's tau (raw): 0.937\n",
      "Score stability (CALIBRATED)\n",
      "Randomized average Kendall's tau (calibrated): 0.923 \n",
      "(0.9117647058823529)\n",
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 246.9409, p=0.0000\n",
      "Kruskal-Wallis: 1340.4629, p=0.0000\n",
      "Pearson r=0.9651\n",
      "Kendall τ=0.9118\n",
      "Std.Dev across models: 1.0962\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.009 ±0.130\n",
      "gpt-4o-2024-11-20....................... 7.515 ±0.128\n",
      "gemini-1.5-pro-002...................... 7.168 ±0.133\n",
      "claude-3-5-sonnet-20240620.............. 7.120 ±0.129\n",
      "gemini-1.5-pro-001...................... 7.025 ±0.113\n",
      "claude-3-opus-20240229.................. 6.502 ±0.156\n",
      "Llama-3-70b-chat-hf..................... 5.970 ±0.135\n",
      "claude-3-haiku-20240307................. 5.818 ±0.141\n",
      "Mistral-Large-Instruct-2411............. 5.813 ±0.193\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.626 ±0.155\n",
      "c4ai-command-r-08-2024.................. 5.458 ±0.124\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.313 ±0.136\n",
      "databricks/dbrx-instruct................ 5.067 ±0.166\n",
      "gpt-3.5-turbo-0125...................... 4.871 ±0.110\n",
      "Llama-2-13b-chat-hf..................... 4.790 ±0.132\n",
      "gemma-7b-it............................. 4.585 ±0.141\n",
      "gemma-2b-it............................. 4.081 ±0.150\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 251.5504, p=0.0000\n",
      "Kruskal-Wallis: 1340.4629, p=0.0000\n",
      "Pearson r=0.9610\n",
      "Kendall τ=0.9118\n",
      "Std.Dev across models: 1.7917\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.236 ±0.185\n",
      "gpt-4o-2024-11-20....................... 7.595 ±0.188\n",
      "gemini-1.5-pro-002...................... 7.098 ±0.206\n",
      "claude-3-5-sonnet-20240620.............. 7.035 ±0.197\n",
      "gemini-1.5-pro-001...................... 6.907 ±0.174\n",
      "claude-3-opus-20240229.................. 6.018 ±0.259\n",
      "Llama-3-70b-chat-hf..................... 5.122 ±0.236\n",
      "Mistral-Large-Instruct-2411............. 4.856 ±0.329\n",
      "claude-3-haiku-20240307................. 4.846 ±0.252\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.510 ±0.277\n",
      "c4ai-command-r-08-2024.................. 4.182 ±0.227\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 3.961 ±0.239\n",
      "databricks/dbrx-instruct................ 3.562 ±0.264\n",
      "gpt-3.5-turbo-0125...................... 3.180 ±0.174\n",
      "Llama-2-13b-chat-hf..................... 3.091 ±0.204\n",
      "gemma-7b-it............................. 2.792 ±0.206\n",
      "gemma-2b-it............................. 2.136 ±0.200\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "Final Judgemark (raw)   = 0.666\n",
      "Final Judgemark (cal)  = 0.714\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.688\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "4.514\n",
      "CI99 Overlap pct: \n",
      "0.406\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.340\n",
      "Average EMD across all pairs: 1.361\n",
      "Avg. CI95 half-width: 0.134 (modulated: 0.849)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.688\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "7.505\n",
      "CI99 Overlap pct: \n",
      "0.410\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.337\n",
      "Average EMD across all pairs: 2.224\n",
      "Avg. CI95 half-width: 0.214 (modulated: 0.636)\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "k_tau (7.2075, 5.891666666666667, 6.645833333333333, 7.17, 5.872083333333333, 5.295416666666667, 4.9225, 4.42375, 3.9130434782608696, 5.767916666666666, 5.505833333333333, 7.350416666666667, 5.902083333333334, 7.7004166666666665, 8.099166666666667, 4.839166666666666, 5.04625) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.9117647058823529\n",
      "k_tau (7.116312056737589, 4.941891289390237, 6.232299054373523, 7.107176996608079, 4.875130409086237, 3.880235484714111, 3.2264773280337447, 2.6151016881173845, 1.9454358641574738, 4.691218682067615, 4.2111636112913695, 7.342314215232809, 4.98624091614085, 7.830943570767808, 8.34094973789701, 3.125561954719158, 3.4652997467373283) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.9117647058823529\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 275.3132, p=0.0000\n",
      "Kruskal-Wallis: 1397.5808, p=0.0000\n",
      "Pearson r=0.9663\n",
      "Kendall τ=0.9118\n",
      "Std.Dev across models: 1.1194\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.078 ±0.108\n",
      "gpt-4o-2024-11-20....................... 7.603 ±0.107\n",
      "gemini-1.5-pro-002...................... 7.237 ±0.114\n",
      "claude-3-5-sonnet-20240620.............. 7.197 ±0.115\n",
      "gemini-1.5-pro-001...................... 7.042 ±0.113\n",
      "claude-3-opus-20240229.................. 6.534 ±0.150\n",
      "Llama-3-70b-chat-hf..................... 6.039 ±0.129\n",
      "Mistral-Large-Instruct-2411............. 5.857 ±0.189\n",
      "claude-3-haiku-20240307................. 5.822 ±0.143\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.636 ±0.155\n",
      "c4ai-command-r-08-2024.................. 5.477 ±0.126\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.327 ±0.136\n",
      "databricks/dbrx-instruct................ 5.065 ±0.167\n",
      "gpt-3.5-turbo-0125...................... 4.879 ±0.109\n",
      "Llama-2-13b-chat-hf..................... 4.797 ±0.132\n",
      "gemma-7b-it............................. 4.585 ±0.143\n",
      "gemma-2b-it............................. 4.097 ±0.151\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 283.9974, p=0.0000\n",
      "Kruskal-Wallis: 1397.5808, p=0.0000\n",
      "Pearson r=0.9624\n",
      "Kendall τ=0.9088\n",
      "Std.Dev across models: 1.8273\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.316 ±0.141\n",
      "gpt-4o-2024-11-20....................... 7.692 ±0.146\n",
      "gemini-1.5-pro-002...................... 7.165 ±0.173\n",
      "claude-3-5-sonnet-20240620.............. 7.107 ±0.170\n",
      "gemini-1.5-pro-001...................... 6.881 ±0.176\n",
      "claude-3-opus-20240229.................. 6.019 ±0.248\n",
      "Llama-3-70b-chat-hf..................... 5.178 ±0.229\n",
      "Mistral-Large-Instruct-2411............. 4.879 ±0.321\n",
      "claude-3-haiku-20240307................. 4.801 ±0.254\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.476 ±0.275\n",
      "c4ai-command-r-08-2024.................. 4.158 ±0.229\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 3.933 ±0.237\n",
      "databricks/dbrx-instruct................ 3.517 ±0.262\n",
      "gpt-3.5-turbo-0125...................... 3.146 ±0.169\n",
      "Llama-2-13b-chat-hf..................... 3.063 ±0.201\n",
      "gemma-7b-it............................. 2.762 ±0.205\n",
      "gemma-2b-it............................. 2.135 ±0.196\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.688\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "4.514\n",
      "CI99 Overlap pct: \n",
      "0.406\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.340\n",
      "Average EMD across all pairs: 1.361\n",
      "Avg. CI95 half-width: 0.134 (modulated: 0.849)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.688\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "7.505\n",
      "CI99 Overlap pct: \n",
      "0.410\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.337\n",
      "Average EMD across all pairs: 2.224\n",
      "Avg. CI95 half-width: 0.214 (modulated: 0.636)\n",
      "\n",
      "Score stability (RAW)\n",
      "Randomized average Kendall's tau (raw): 0.937\n",
      "Score stability (CALIBRATED)\n",
      "Randomized average Kendall's tau (calibrated): 0.935 \n",
      "(0.9088235294117647)\n",
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 275.3132, p=0.0000\n",
      "Kruskal-Wallis: 1397.5808, p=0.0000\n",
      "Pearson r=0.9663\n",
      "Kendall τ=0.9118\n",
      "Std.Dev across models: 1.1194\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.078 ±0.108\n",
      "gpt-4o-2024-11-20....................... 7.603 ±0.107\n",
      "gemini-1.5-pro-002...................... 7.237 ±0.114\n",
      "claude-3-5-sonnet-20240620.............. 7.197 ±0.115\n",
      "gemini-1.5-pro-001...................... 7.042 ±0.113\n",
      "claude-3-opus-20240229.................. 6.534 ±0.150\n",
      "Llama-3-70b-chat-hf..................... 6.039 ±0.129\n",
      "Mistral-Large-Instruct-2411............. 5.857 ±0.189\n",
      "claude-3-haiku-20240307................. 5.822 ±0.143\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.636 ±0.155\n",
      "c4ai-command-r-08-2024.................. 5.477 ±0.126\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.327 ±0.136\n",
      "databricks/dbrx-instruct................ 5.065 ±0.167\n",
      "gpt-3.5-turbo-0125...................... 4.879 ±0.109\n",
      "Llama-2-13b-chat-hf..................... 4.797 ±0.132\n",
      "gemma-7b-it............................. 4.585 ±0.143\n",
      "gemma-2b-it............................. 4.097 ±0.151\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 283.9974, p=0.0000\n",
      "Kruskal-Wallis: 1397.5808, p=0.0000\n",
      "Pearson r=0.9624\n",
      "Kendall τ=0.9088\n",
      "Std.Dev across models: 1.8273\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.316 ±0.141\n",
      "gpt-4o-2024-11-20....................... 7.692 ±0.146\n",
      "gemini-1.5-pro-002...................... 7.165 ±0.173\n",
      "claude-3-5-sonnet-20240620.............. 7.107 ±0.170\n",
      "gemini-1.5-pro-001...................... 6.881 ±0.176\n",
      "claude-3-opus-20240229.................. 6.019 ±0.248\n",
      "Llama-3-70b-chat-hf..................... 5.178 ±0.229\n",
      "Mistral-Large-Instruct-2411............. 4.879 ±0.321\n",
      "claude-3-haiku-20240307................. 4.801 ±0.254\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.476 ±0.275\n",
      "c4ai-command-r-08-2024.................. 4.158 ±0.229\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 3.933 ±0.237\n",
      "databricks/dbrx-instruct................ 3.517 ±0.262\n",
      "gpt-3.5-turbo-0125...................... 3.146 ±0.169\n",
      "Llama-2-13b-chat-hf..................... 3.063 ±0.201\n",
      "gemma-7b-it............................. 2.762 ±0.205\n",
      "gemma-2b-it............................. 2.135 ±0.196\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "Final Judgemark (raw)   = 0.676\n",
      "Final Judgemark (cal)  = 0.729\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 3.491\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 3.276\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "k_tau (8.35, 3.9, 2.81, 8.38, 5.28, 1.81, 1.84, 2.94, 0.41, 4.48, 2.38, 9.39, 4.17, 9.56, 9.99, 3.77, 3.67) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.7058823529411764\n",
      "k_tau (7.0, 5.0, 3.0, 7.05487804878049, 5.620224719101124, 1.75, 1.7875000000000003, 3.238532110091743, 0.0, 5.260674157303371, 2.4625, 8.902439024390246, 5.121348314606742, 9.213414634146343, 10.0, 4.761467889908257, 4.577981651376147) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.7058823529411764\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=0.8826\n",
      "Kendall τ=0.7059\n",
      "Std.Dev across models: 2.9711\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 9.990 ±0.000\n",
      "gpt-4o-2024-11-20....................... 9.560 ±0.000\n",
      "gemini-1.5-pro-002...................... 9.390 ±0.000\n",
      "gemini-1.5-pro-001...................... 8.380 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 8.350 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 5.280 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.480 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 4.170 ±0.000\n",
      "claude-3-haiku-20240307................. 3.900 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 3.770 ±0.000\n",
      "databricks/dbrx-instruct................ 3.670 ±0.000\n",
      "gemma-7b-it............................. 2.940 ±0.000\n",
      "claude-3-opus-20240229.................. 2.810 ±0.000\n",
      "c4ai-command-r-08-2024.................. 2.380 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 1.840 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 1.810 ±0.000\n",
      "gemma-2b-it............................. 0.410 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=0.8827\n",
      "Kendall τ=0.7059\n",
      "Std.Dev across models: 2.7260\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 10.000 ±0.000\n",
      "gpt-4o-2024-11-20....................... 9.213 ±0.000\n",
      "gemini-1.5-pro-002...................... 8.902 ±0.000\n",
      "gemini-1.5-pro-001...................... 7.055 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 7.000 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 5.620 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.261 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 5.121 ±0.000\n",
      "claude-3-haiku-20240307................. 5.000 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 4.761 ±0.000\n",
      "databricks/dbrx-instruct................ 4.578 ±0.000\n",
      "gemma-7b-it............................. 3.239 ±0.000\n",
      "claude-3-opus-20240229.................. 3.000 ±0.000\n",
      "c4ai-command-r-08-2024.................. 2.462 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 1.788 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 1.750 ±0.000\n",
      "gemma-2b-it............................. 0.000 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 3.491\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 3.276\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n",
      "Score stability (RAW)\n",
      "Randomized average Kendall's tau (raw): 0.936\n",
      "Score stability (CALIBRATED)\n",
      "Randomized average Kendall's tau (calibrated): 1.000 \n",
      "(0.7058823529411764)\n",
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=0.8826\n",
      "Kendall τ=0.7059\n",
      "Std.Dev across models: 2.9711\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 9.990 ±0.000\n",
      "gpt-4o-2024-11-20....................... 9.560 ±0.000\n",
      "gemini-1.5-pro-002...................... 9.390 ±0.000\n",
      "gemini-1.5-pro-001...................... 8.380 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 8.350 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 5.280 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.480 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 4.170 ±0.000\n",
      "claude-3-haiku-20240307................. 3.900 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 3.770 ±0.000\n",
      "databricks/dbrx-instruct................ 3.670 ±0.000\n",
      "gemma-7b-it............................. 2.940 ±0.000\n",
      "claude-3-opus-20240229.................. 2.810 ±0.000\n",
      "c4ai-command-r-08-2024.................. 2.380 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 1.840 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 1.810 ±0.000\n",
      "gemma-2b-it............................. 0.410 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=0.8827\n",
      "Kendall τ=0.7059\n",
      "Std.Dev across models: 2.7260\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 10.000 ±0.000\n",
      "gpt-4o-2024-11-20....................... 9.213 ±0.000\n",
      "gemini-1.5-pro-002...................... 8.902 ±0.000\n",
      "gemini-1.5-pro-001...................... 7.055 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 7.000 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 5.620 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.261 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 5.121 ±0.000\n",
      "claude-3-haiku-20240307................. 5.000 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 4.761 ±0.000\n",
      "databricks/dbrx-instruct................ 4.578 ±0.000\n",
      "gemma-7b-it............................. 3.239 ±0.000\n",
      "claude-3-opus-20240229.................. 3.000 ±0.000\n",
      "c4ai-command-r-08-2024.................. 2.462 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 1.788 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 1.750 ±0.000\n",
      "gemma-2b-it............................. 0.000 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "Final Judgemark (raw)   = 0.798\n",
      "Final Judgemark (cal)  = 0.814\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.812\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "11.962\n",
      "CI99 Overlap pct: \n",
      "0.574\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.241\n",
      "Average EMD across all pairs: 1.986\n",
      "Avg. CI95 half-width: 0.277 (modulated: 0.453)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.812\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "12.282\n",
      "CI99 Overlap pct: \n",
      "0.562\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.245\n",
      "Average EMD across all pairs: 2.294\n",
      "Avg. CI95 half-width: 0.294 (modulated: 0.411)\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "k_tau (7.90625, 6.1025, 7.537083333333333, 8.257083333333334, 6.047083333333333, 4.809583333333333, 4.222083333333333, 3.8558333333333334, 3.8521739130434782, 6.319583333333333, 5.580416666666666, 8.144166666666667, 6.039583333333334, 8.364166666666666, 9.458333333333334, 4.4575, 4.972083333333333) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.8382352941176471\n",
      "k_tau (7.01956474591423, 4.8085319401425295, 6.468393670309654, 7.4536043906810034, 4.604115568314405, 3.458484445955361, 2.9152925721672416, 2.6918678493784256, 2.626204041591805, 4.939243025652003, 4.141164958491769, 7.31671106557377, 4.958440807752679, 7.705739562086071, 9.189930555555556, 3.2051324360168887, 3.5961390210940714) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.8823529411764705\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 138.0117, p=0.0000\n",
      "Kruskal-Wallis: 1124.9792, p=0.0000\n",
      "Pearson r=0.9539\n",
      "Kendall τ=0.8706\n",
      "Std.Dev across models: 1.6379\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 9.480 ±0.179\n",
      "gpt-4o-2024-11-20....................... 8.541 ±0.223\n",
      "gemini-1.5-pro-002...................... 8.118 ±0.246\n",
      "claude-3-5-sonnet-20240620.............. 8.062 ±0.226\n",
      "gemini-1.5-pro-001...................... 7.984 ±0.204\n",
      "claude-3-opus-20240229.................. 7.213 ±0.280\n",
      "Llama-3-70b-chat-hf..................... 6.189 ±0.253\n",
      "Mistral-Large-Instruct-2411............. 6.094 ±0.373\n",
      "claude-3-haiku-20240307................. 6.031 ±0.287\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.777 ±0.297\n",
      "c4ai-command-r-08-2024.................. 5.493 ±0.278\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.247 ±0.304\n",
      "databricks/dbrx-instruct................ 5.048 ±0.296\n",
      "gpt-3.5-turbo-0125...................... 4.617 ±0.300\n",
      "Llama-2-13b-chat-hf..................... 4.335 ±0.315\n",
      "gemma-7b-it............................. 4.186 ±0.328\n",
      "gemma-2b-it............................. 3.949 ±0.329\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 172.3690, p=0.0000\n",
      "Kruskal-Wallis: 1124.9792, p=0.0000\n",
      "Pearson r=0.9510\n",
      "Kendall τ=0.8794\n",
      "Std.Dev across models: 1.9227\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 9.256 ±0.232\n",
      "gpt-4o-2024-11-20....................... 7.887 ±0.296\n",
      "gemini-1.5-pro-002...................... 7.303 ±0.320\n",
      "claude-3-5-sonnet-20240620.............. 7.205 ±0.297\n",
      "gemini-1.5-pro-001...................... 7.076 ±0.276\n",
      "claude-3-opus-20240229.................. 6.106 ±0.352\n",
      "Mistral-Large-Instruct-2411............. 4.890 ±0.403\n",
      "Llama-3-70b-chat-hf..................... 4.818 ±0.281\n",
      "claude-3-haiku-20240307................. 4.686 ±0.302\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.410 ±0.297\n",
      "c4ai-command-r-08-2024.................. 4.091 ±0.272\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 3.864 ±0.283\n",
      "databricks/dbrx-instruct................ 3.665 ±0.279\n",
      "gpt-3.5-turbo-0125...................... 3.263 ±0.267\n",
      "Llama-2-13b-chat-hf..................... 3.028 ±0.275\n",
      "gemma-7b-it............................. 2.897 ±0.278\n",
      "gemma-2b-it............................. 2.708 ±0.280\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.812\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "11.962\n",
      "CI99 Overlap pct: \n",
      "0.574\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.241\n",
      "Average EMD across all pairs: 1.986\n",
      "Avg. CI95 half-width: 0.277 (modulated: 0.453)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.812\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "12.282\n",
      "CI99 Overlap pct: \n",
      "0.562\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.245\n",
      "Average EMD across all pairs: 2.294\n",
      "Avg. CI95 half-width: 0.294 (modulated: 0.411)\n",
      "\n",
      "Score stability (RAW)\n",
      "Randomized average Kendall's tau (raw): 0.936\n",
      "Score stability (CALIBRATED)\n",
      "Randomized average Kendall's tau (calibrated): 0.873 \n",
      "(0.8794117647058822)\n",
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 138.0117, p=0.0000\n",
      "Kruskal-Wallis: 1124.9792, p=0.0000\n",
      "Pearson r=0.9539\n",
      "Kendall τ=0.8706\n",
      "Std.Dev across models: 1.6379\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 9.480 ±0.179\n",
      "gpt-4o-2024-11-20....................... 8.541 ±0.223\n",
      "gemini-1.5-pro-002...................... 8.118 ±0.246\n",
      "claude-3-5-sonnet-20240620.............. 8.062 ±0.226\n",
      "gemini-1.5-pro-001...................... 7.984 ±0.204\n",
      "claude-3-opus-20240229.................. 7.213 ±0.280\n",
      "Llama-3-70b-chat-hf..................... 6.189 ±0.253\n",
      "Mistral-Large-Instruct-2411............. 6.094 ±0.373\n",
      "claude-3-haiku-20240307................. 6.031 ±0.287\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.777 ±0.297\n",
      "c4ai-command-r-08-2024.................. 5.493 ±0.278\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.247 ±0.304\n",
      "databricks/dbrx-instruct................ 5.048 ±0.296\n",
      "gpt-3.5-turbo-0125...................... 4.617 ±0.300\n",
      "Llama-2-13b-chat-hf..................... 4.335 ±0.315\n",
      "gemma-7b-it............................. 4.186 ±0.328\n",
      "gemma-2b-it............................. 3.949 ±0.329\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 172.3690, p=0.0000\n",
      "Kruskal-Wallis: 1124.9792, p=0.0000\n",
      "Pearson r=0.9510\n",
      "Kendall τ=0.8794\n",
      "Std.Dev across models: 1.9227\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 9.256 ±0.232\n",
      "gpt-4o-2024-11-20....................... 7.887 ±0.296\n",
      "gemini-1.5-pro-002...................... 7.303 ±0.320\n",
      "claude-3-5-sonnet-20240620.............. 7.205 ±0.297\n",
      "gemini-1.5-pro-001...................... 7.076 ±0.276\n",
      "claude-3-opus-20240229.................. 6.106 ±0.352\n",
      "Mistral-Large-Instruct-2411............. 4.890 ±0.403\n",
      "Llama-3-70b-chat-hf..................... 4.818 ±0.281\n",
      "claude-3-haiku-20240307................. 4.686 ±0.302\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.410 ±0.297\n",
      "c4ai-command-r-08-2024.................. 4.091 ±0.272\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 3.864 ±0.283\n",
      "databricks/dbrx-instruct................ 3.665 ±0.279\n",
      "gpt-3.5-turbo-0125...................... 3.263 ±0.267\n",
      "Llama-2-13b-chat-hf..................... 3.028 ±0.275\n",
      "gemma-7b-it............................. 2.897 ±0.278\n",
      "gemma-2b-it............................. 2.708 ±0.280\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "Final Judgemark (raw)   = 0.645\n",
      "Final Judgemark (cal)  = 0.658\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 3.491\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 3.276\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "k_tau (8.35, 3.9, 2.81, 8.38, 5.28, 1.81, 1.84, 2.94, 0.41, 4.48, 2.38, 9.39, 4.17, 9.56, 9.99, 3.77, 3.67) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.7058823529411764\n",
      "k_tau (7.0, 5.0, 3.0, 7.05487804878049, 5.620224719101124, 1.75, 1.7875000000000003, 3.238532110091743, 0.0, 5.260674157303371, 2.4625, 8.902439024390246, 5.121348314606742, 9.213414634146343, 10.0, 4.761467889908257, 4.577981651376147) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.7058823529411764\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=0.8826\n",
      "Kendall τ=0.7059\n",
      "Std.Dev across models: 2.9711\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 9.990 ±0.000\n",
      "gpt-4o-2024-11-20....................... 9.560 ±0.000\n",
      "gemini-1.5-pro-002...................... 9.390 ±0.000\n",
      "gemini-1.5-pro-001...................... 8.380 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 8.350 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 5.280 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.480 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 4.170 ±0.000\n",
      "claude-3-haiku-20240307................. 3.900 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 3.770 ±0.000\n",
      "databricks/dbrx-instruct................ 3.670 ±0.000\n",
      "gemma-7b-it............................. 2.940 ±0.000\n",
      "claude-3-opus-20240229.................. 2.810 ±0.000\n",
      "c4ai-command-r-08-2024.................. 2.380 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 1.840 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 1.810 ±0.000\n",
      "gemma-2b-it............................. 0.410 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=0.8827\n",
      "Kendall τ=0.7059\n",
      "Std.Dev across models: 2.7260\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 10.000 ±0.000\n",
      "gpt-4o-2024-11-20....................... 9.213 ±0.000\n",
      "gemini-1.5-pro-002...................... 8.902 ±0.000\n",
      "gemini-1.5-pro-001...................... 7.055 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 7.000 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 5.620 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.261 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 5.121 ±0.000\n",
      "claude-3-haiku-20240307................. 5.000 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 4.761 ±0.000\n",
      "databricks/dbrx-instruct................ 4.578 ±0.000\n",
      "gemma-7b-it............................. 3.239 ±0.000\n",
      "claude-3-opus-20240229.................. 3.000 ±0.000\n",
      "c4ai-command-r-08-2024.................. 2.462 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 1.788 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 1.750 ±0.000\n",
      "gemma-2b-it............................. 0.000 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 3.491\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 3.276\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n",
      "Score stability (RAW)\n",
      "Randomized average Kendall's tau (raw): 0.936\n",
      "Score stability (CALIBRATED)\n",
      "Randomized average Kendall's tau (calibrated): 1.000 \n",
      "(0.7058823529411764)\n",
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=0.8826\n",
      "Kendall τ=0.7059\n",
      "Std.Dev across models: 2.9711\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 9.990 ±0.000\n",
      "gpt-4o-2024-11-20....................... 9.560 ±0.000\n",
      "gemini-1.5-pro-002...................... 9.390 ±0.000\n",
      "gemini-1.5-pro-001...................... 8.380 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 8.350 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 5.280 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.480 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 4.170 ±0.000\n",
      "claude-3-haiku-20240307................. 3.900 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 3.770 ±0.000\n",
      "databricks/dbrx-instruct................ 3.670 ±0.000\n",
      "gemma-7b-it............................. 2.940 ±0.000\n",
      "claude-3-opus-20240229.................. 2.810 ±0.000\n",
      "c4ai-command-r-08-2024.................. 2.380 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 1.840 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 1.810 ±0.000\n",
      "gemma-2b-it............................. 0.410 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=0.8827\n",
      "Kendall τ=0.7059\n",
      "Std.Dev across models: 2.7260\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 10.000 ±0.000\n",
      "gpt-4o-2024-11-20....................... 9.213 ±0.000\n",
      "gemini-1.5-pro-002...................... 8.902 ±0.000\n",
      "gemini-1.5-pro-001...................... 7.055 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 7.000 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 5.620 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.261 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 5.121 ±0.000\n",
      "claude-3-haiku-20240307................. 5.000 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 4.761 ±0.000\n",
      "databricks/dbrx-instruct................ 4.578 ±0.000\n",
      "gemma-7b-it............................. 3.239 ±0.000\n",
      "claude-3-opus-20240229.................. 3.000 ±0.000\n",
      "c4ai-command-r-08-2024.................. 2.462 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 1.788 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 1.750 ±0.000\n",
      "gemma-2b-it............................. 0.000 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "Final Judgemark (raw)   = 0.798\n",
      "Final Judgemark (cal)  = 0.814\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.875\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "8.473\n",
      "CI99 Overlap pct: \n",
      "0.669\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.187\n",
      "Average EMD across all pairs: 1.175\n",
      "Avg. CI95 half-width: 0.178 (modulated: 0.696)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.875\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "12.447\n",
      "CI99 Overlap pct: \n",
      "0.660\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.190\n",
      "Average EMD across all pairs: 1.934\n",
      "Avg. CI95 half-width: 0.268 (modulated: 0.466)\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "k_tau (7.66, 5.894583333333333, 7.005416666666667, 7.742083333333333, 5.959583333333333, 5.341666666666667, 5.14875, 5.210833333333333, 5.216956521739131, 5.952083333333333, 5.8175, 7.75625, 6.177083333333333, 7.79125, 8.180833333333334, 5.399583333333333, 5.343333333333334) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.8529411764705882\n",
      "k_tau (7.219993057787967, 4.203642136690829, 6.219873222118955, 7.492575200193227, 4.288872162236988, 3.4679307086647206, 3.178556091437632, 3.369565804431357, 3.277747046516005, 4.225610680098326, 4.112337033121917, 7.519547869869759, 4.816917299883649, 7.5213677335123155, 8.080081377129487, 3.5936786494999327, 3.4201084767945233) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.8823529411764705\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 126.7344, p=0.0000\n",
      "Kruskal-Wallis: 1086.4958, p=0.0000\n",
      "Pearson r=0.9142\n",
      "Kendall τ=0.8265\n",
      "Std.Dev across models: 1.0003\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.086 ±0.141\n",
      "gpt-4o-2024-11-20....................... 7.784 ±0.154\n",
      "claude-3-5-sonnet-20240620.............. 7.732 ±0.175\n",
      "gemini-1.5-pro-002...................... 7.670 ±0.161\n",
      "gemini-1.5-pro-001...................... 7.582 ±0.152\n",
      "claude-3-opus-20240229.................. 6.819 ±0.168\n",
      "Mistral-Large-Instruct-2411............. 6.205 ±0.213\n",
      "Llama-3-70b-chat-hf..................... 5.953 ±0.162\n",
      "claude-3-haiku-20240307................. 5.941 ±0.150\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.809 ±0.176\n",
      "c4ai-command-r-08-2024.................. 5.756 ±0.179\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.593 ±0.195\n",
      "databricks/dbrx-instruct................ 5.575 ±0.189\n",
      "gpt-3.5-turbo-0125...................... 5.419 ±0.195\n",
      "gemma-7b-it............................. 5.332 ±0.211\n",
      "Llama-2-13b-chat-hf..................... 5.283 ±0.197\n",
      "gemma-2b-it............................. 5.188 ±0.211\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 156.6239, p=0.0000\n",
      "Kruskal-Wallis: 1086.4958, p=0.0000\n",
      "Pearson r=0.9073\n",
      "Kendall τ=0.8235\n",
      "Std.Dev across models: 1.6685\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 7.942 ±0.218\n",
      "gpt-4o-2024-11-20....................... 7.504 ±0.243\n",
      "claude-3-5-sonnet-20240620.............. 7.360 ±0.283\n",
      "gemini-1.5-pro-002...................... 7.309 ±0.264\n",
      "gemini-1.5-pro-001...................... 7.134 ±0.260\n",
      "claude-3-opus-20240229.................. 5.858 ±0.299\n",
      "Mistral-Large-Instruct-2411............. 4.821 ±0.338\n",
      "Llama-3-70b-chat-hf..................... 4.278 ±0.244\n",
      "claude-3-haiku-20240307................. 4.238 ±0.223\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.078 ±0.259\n",
      "c4ai-command-r-08-2024.................. 4.017 ±0.262\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 3.788 ±0.272\n",
      "databricks/dbrx-instruct................ 3.770 ±0.272\n",
      "gpt-3.5-turbo-0125...................... 3.550 ±0.273\n",
      "gemma-7b-it............................. 3.452 ±0.291\n",
      "Llama-2-13b-chat-hf..................... 3.361 ±0.274\n",
      "gemma-2b-it............................. 3.235 ±0.281\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.875\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "8.473\n",
      "CI99 Overlap pct: \n",
      "0.669\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.187\n",
      "Average EMD across all pairs: 1.175\n",
      "Avg. CI95 half-width: 0.178 (modulated: 0.696)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.875\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "12.447\n",
      "CI99 Overlap pct: \n",
      "0.660\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.190\n",
      "Average EMD across all pairs: 1.934\n",
      "Avg. CI95 half-width: 0.268 (modulated: 0.466)\n",
      "\n",
      "Score stability (RAW)\n",
      "Randomized average Kendall's tau (raw): 0.937\n",
      "Score stability (CALIBRATED)\n",
      "Randomized average Kendall's tau (calibrated): 0.784 \n",
      "(0.8235294117647057)\n",
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 126.7344, p=0.0000\n",
      "Kruskal-Wallis: 1086.4958, p=0.0000\n",
      "Pearson r=0.9142\n",
      "Kendall τ=0.8265\n",
      "Std.Dev across models: 1.0003\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.086 ±0.141\n",
      "gpt-4o-2024-11-20....................... 7.784 ±0.154\n",
      "claude-3-5-sonnet-20240620.............. 7.732 ±0.175\n",
      "gemini-1.5-pro-002...................... 7.670 ±0.161\n",
      "gemini-1.5-pro-001...................... 7.582 ±0.152\n",
      "claude-3-opus-20240229.................. 6.819 ±0.168\n",
      "Mistral-Large-Instruct-2411............. 6.205 ±0.213\n",
      "Llama-3-70b-chat-hf..................... 5.953 ±0.162\n",
      "claude-3-haiku-20240307................. 5.941 ±0.150\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.809 ±0.176\n",
      "c4ai-command-r-08-2024.................. 5.756 ±0.179\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.593 ±0.195\n",
      "databricks/dbrx-instruct................ 5.575 ±0.189\n",
      "gpt-3.5-turbo-0125...................... 5.419 ±0.195\n",
      "gemma-7b-it............................. 5.332 ±0.211\n",
      "Llama-2-13b-chat-hf..................... 5.283 ±0.197\n",
      "gemma-2b-it............................. 5.188 ±0.211\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 156.6239, p=0.0000\n",
      "Kruskal-Wallis: 1086.4958, p=0.0000\n",
      "Pearson r=0.9073\n",
      "Kendall τ=0.8235\n",
      "Std.Dev across models: 1.6685\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 7.942 ±0.218\n",
      "gpt-4o-2024-11-20....................... 7.504 ±0.243\n",
      "claude-3-5-sonnet-20240620.............. 7.360 ±0.283\n",
      "gemini-1.5-pro-002...................... 7.309 ±0.264\n",
      "gemini-1.5-pro-001...................... 7.134 ±0.260\n",
      "claude-3-opus-20240229.................. 5.858 ±0.299\n",
      "Mistral-Large-Instruct-2411............. 4.821 ±0.338\n",
      "Llama-3-70b-chat-hf..................... 4.278 ±0.244\n",
      "claude-3-haiku-20240307................. 4.238 ±0.223\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.078 ±0.259\n",
      "c4ai-command-r-08-2024.................. 4.017 ±0.262\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 3.788 ±0.272\n",
      "databricks/dbrx-instruct................ 3.770 ±0.272\n",
      "gpt-3.5-turbo-0125...................... 3.550 ±0.273\n",
      "gemma-7b-it............................. 3.452 ±0.291\n",
      "Llama-2-13b-chat-hf..................... 3.361 ±0.274\n",
      "gemma-2b-it............................. 3.235 ±0.281\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "Final Judgemark (raw)   = 0.572\n",
      "Final Judgemark (cal)  = 0.575\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saving results to ../outputs/my_judgemark_runs_20250728_184300_3c750455-b8f0-485b-9ed8-9dd8b689e219__deepseek_deepseek-r1.parquet\n",
      "| name          | judgemark_score | judgemark_score_calib | stability | stability_calib | separability | separability_calib | human_correlation | human_correlation_calib |\n",
      "|---------------|-----------------|-----------------------|-----------|-----------------|--------------|--------------------|-------------------|-------------------------|\n",
      "| weighted      | 0.666           | 0.714                 | 0.896     | 0.871           | 0.55         | 0.627              | 0.902             | 0.902                   |\n",
      "| raw           | 0.676           | 0.729                 | 0.895     | 0.892           | 0.565        | 0.646              | 0.902             | 0.899                   |\n",
      "| ranked        | 0.798           | 0.814                 | 0.893     | 1.0             | 0.805        | 0.803              | 0.673             | 0.673                   |\n",
      "| ranked_norm   | 0.645           | 0.658                 | 0.893     | 0.789           | 0.531        | 0.574              | 0.856             | 0.866                   |\n",
      "| ranked_scaled | 0.798           | 0.814                 | 0.893     | 1.0             | 0.805        | 0.803              | 0.673             | 0.673                   |\n",
      "| weighted_norm | 0.572           | 0.575                 | 0.894     | 0.64            | 0.433        | 0.501              | 0.807             | 0.804                   |\n",
      "| name          | judgemark_score | judgemark_score_calib | stability | stability_calib | separability | separability_calib | human_correlation | human_correlation_calib |\n",
      "|---------------|-----------------|-----------------------|-----------|-----------------|--------------|--------------------|-------------------|-------------------------|\n",
      "| weighted      | 0.666           | 0.714                 | 0.896     | 0.871           | 0.55         | 0.627              | 0.902             | 0.902                   |\n",
      "| raw           | 0.676           | 0.729                 | 0.895     | 0.892           | 0.565        | 0.646              | 0.902             | 0.899                   |\n",
      "| ranked        | 0.798           | 0.814                 | 0.893     | 1.0             | 0.805        | 0.803              | 0.673             | 0.673                   |\n",
      "| ranked_norm   | 0.645           | 0.658                 | 0.893     | 0.789           | 0.531        | 0.574              | 0.856             | 0.866                   |\n",
      "| ranked_scaled | 0.798           | 0.814                 | 0.893     | 1.0             | 0.805        | 0.803              | 0.673             | 0.673                   |\n",
      "| weighted_norm | 0.572           | 0.575                 | 0.894     | 0.64            | 0.433        | 0.501              | 0.807             | 0.804                   |\n",
      "\n",
      "Run ID: 3c750455-b8f0-485b-9ed8-9dd8b689e219__deepseek_deepseek-r1\n",
      "\n",
      "\n",
      "Results saved to ../outputs/my_judgemark_runs_20250728_184300_3c750455-b8f0-485b-9ed8-9dd8b689e219__deepseek_deepseek-r1.parquet and ../outputs/my_judgemark_runs_20250728_184300.md\n",
      "\n",
      "\n",
      "Processing ../outputs/my_judgemark_runs_20250729_050856.json...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1ff94c5e97e94c04abad17ddcb188fea",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Recomputing scores with choice norm:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Processing run 8b031c01-ac20-4f6d-8838-2d8481c61a55__deepseek_deepseek-r1...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.750\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "4.699\n",
      "CI99 Overlap pct: \n",
      "0.418\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.339\n",
      "Average EMD across all pairs: 1.345\n",
      "Avg. CI95 half-width: 0.136 (modulated: 0.844)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.750\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "7.945\n",
      "CI99 Overlap pct: \n",
      "0.418\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.331\n",
      "Average EMD across all pairs: 2.173\n",
      "Avg. CI95 half-width: 0.217 (modulated: 0.625)\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "k_tau (7.056521739130434, 5.692083333333334, 6.568333333333333, 6.972083333333333, 6.02625, 5.292916666666667, 4.985, 4.421666666666667, 3.9104166666666664, 5.91625, 5.632083333333333, 7.324166666666667, 5.82875, 7.610833333333333, 8.12625, 4.810416666666667, 5.170833333333333) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.8823529411764705\n",
      "k_tau (6.913741010558691, 4.745233729099284, 6.136213642220673, 6.867702086872284, 5.3350524958555905, 3.9637780431712675, 3.4473454755860238, 2.6185410443378423, 1.9714928350431529, 5.109234456943669, 4.540345540824161, 7.3509579707211765, 4.885496283271738, 7.700297215002195, 8.38722761924037, 3.1579257830331477, 3.8035150809926783) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.8823529411764705\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 262.4392, p=0.0000\n",
      "Kruskal-Wallis: 1360.1823, p=0.0000\n",
      "Pearson r=0.9637\n",
      "Kendall τ=0.8933\n",
      "Std.Dev across models: 1.1095\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.102 ±0.095\n",
      "gpt-4o-2024-11-20....................... 7.590 ±0.115\n",
      "gemini-1.5-pro-002...................... 7.144 ±0.111\n",
      "claude-3-5-sonnet-20240620.............. 7.083 ±0.151\n",
      "gemini-1.5-pro-001...................... 7.024 ±0.117\n",
      "claude-3-opus-20240229.................. 6.419 ±0.181\n",
      "Llama-3-70b-chat-hf..................... 6.008 ±0.133\n",
      "claude-3-haiku-20240307................. 5.779 ±0.144\n",
      "Mistral-Large-Instruct-2411............. 5.728 ±0.191\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.624 ±0.150\n",
      "c4ai-command-r-08-2024.................. 5.399 ±0.130\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.318 ±0.140\n",
      "databricks/dbrx-instruct................ 5.055 ±0.153\n",
      "gpt-3.5-turbo-0125...................... 4.885 ±0.113\n",
      "Llama-2-13b-chat-hf..................... 4.809 ±0.119\n",
      "gemma-7b-it............................. 4.552 ±0.127\n",
      "gemma-2b-it............................. 4.091 ±0.146\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 261.7218, p=0.0000\n",
      "Kruskal-Wallis: 1360.1823, p=0.0000\n",
      "Pearson r=0.9600\n",
      "Kendall τ=0.8882\n",
      "Std.Dev across models: 1.7867\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.360 ±0.117\n",
      "gpt-4o-2024-11-20....................... 7.688 ±0.167\n",
      "gemini-1.5-pro-002...................... 7.098 ±0.163\n",
      "claude-3-5-sonnet-20240620.............. 6.977 ±0.228\n",
      "gemini-1.5-pro-001...................... 6.922 ±0.179\n",
      "claude-3-opus-20240229.................. 5.914 ±0.294\n",
      "Llama-3-70b-chat-hf..................... 5.277 ±0.231\n",
      "claude-3-haiku-20240307................. 4.867 ±0.254\n",
      "Mistral-Large-Instruct-2411............. 4.742 ±0.324\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.585 ±0.265\n",
      "c4ai-command-r-08-2024.................. 4.157 ±0.235\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 4.029 ±0.247\n",
      "databricks/dbrx-instruct................ 3.581 ±0.248\n",
      "gpt-3.5-turbo-0125...................... 3.286 ±0.179\n",
      "Llama-2-13b-chat-hf..................... 3.187 ±0.191\n",
      "gemma-7b-it............................. 2.780 ±0.186\n",
      "gemma-2b-it............................. 2.209 ±0.186\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.750\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "4.699\n",
      "CI99 Overlap pct: \n",
      "0.418\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.339\n",
      "Average EMD across all pairs: 1.345\n",
      "Avg. CI95 half-width: 0.136 (modulated: 0.844)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.750\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "7.945\n",
      "CI99 Overlap pct: \n",
      "0.418\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.331\n",
      "Average EMD across all pairs: 2.173\n",
      "Avg. CI95 half-width: 0.217 (modulated: 0.625)\n",
      "\n",
      "Score stability (RAW)\n",
      "Randomized average Kendall's tau (raw): 0.937\n",
      "Score stability (CALIBRATED)\n",
      "Randomized average Kendall's tau (calibrated): 0.927 \n",
      "(0.888235294117647)\n",
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 262.4392, p=0.0000\n",
      "Kruskal-Wallis: 1360.1823, p=0.0000\n",
      "Pearson r=0.9637\n",
      "Kendall τ=0.8933\n",
      "Std.Dev across models: 1.1095\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.102 ±0.095\n",
      "gpt-4o-2024-11-20....................... 7.590 ±0.115\n",
      "gemini-1.5-pro-002...................... 7.144 ±0.111\n",
      "claude-3-5-sonnet-20240620.............. 7.083 ±0.151\n",
      "gemini-1.5-pro-001...................... 7.024 ±0.117\n",
      "claude-3-opus-20240229.................. 6.419 ±0.181\n",
      "Llama-3-70b-chat-hf..................... 6.008 ±0.133\n",
      "claude-3-haiku-20240307................. 5.779 ±0.144\n",
      "Mistral-Large-Instruct-2411............. 5.728 ±0.191\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.624 ±0.150\n",
      "c4ai-command-r-08-2024.................. 5.399 ±0.130\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.318 ±0.140\n",
      "databricks/dbrx-instruct................ 5.055 ±0.153\n",
      "gpt-3.5-turbo-0125...................... 4.885 ±0.113\n",
      "Llama-2-13b-chat-hf..................... 4.809 ±0.119\n",
      "gemma-7b-it............................. 4.552 ±0.127\n",
      "gemma-2b-it............................. 4.091 ±0.146\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 261.7218, p=0.0000\n",
      "Kruskal-Wallis: 1360.1823, p=0.0000\n",
      "Pearson r=0.9600\n",
      "Kendall τ=0.8882\n",
      "Std.Dev across models: 1.7867\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.360 ±0.117\n",
      "gpt-4o-2024-11-20....................... 7.688 ±0.167\n",
      "gemini-1.5-pro-002...................... 7.098 ±0.163\n",
      "claude-3-5-sonnet-20240620.............. 6.977 ±0.228\n",
      "gemini-1.5-pro-001...................... 6.922 ±0.179\n",
      "claude-3-opus-20240229.................. 5.914 ±0.294\n",
      "Llama-3-70b-chat-hf..................... 5.277 ±0.231\n",
      "claude-3-haiku-20240307................. 4.867 ±0.254\n",
      "Mistral-Large-Instruct-2411............. 4.742 ±0.324\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.585 ±0.265\n",
      "c4ai-command-r-08-2024.................. 4.157 ±0.235\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 4.029 ±0.247\n",
      "databricks/dbrx-instruct................ 3.581 ±0.248\n",
      "gpt-3.5-turbo-0125...................... 3.286 ±0.179\n",
      "Llama-2-13b-chat-hf..................... 3.187 ±0.191\n",
      "gemma-7b-it............................. 2.780 ±0.186\n",
      "gemma-2b-it............................. 2.209 ±0.186\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "Final Judgemark (raw)   = 0.668\n",
      "Final Judgemark (cal)  = 0.716\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.688\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "4.375\n",
      "CI99 Overlap pct: \n",
      "0.387\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.349\n",
      "Average EMD across all pairs: 1.373\n",
      "Avg. CI95 half-width: 0.132 (modulated: 0.856)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.750\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "7.390\n",
      "CI99 Overlap pct: \n",
      "0.393\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.350\n",
      "Average EMD across all pairs: 2.218\n",
      "Avg. CI95 half-width: 0.209 (modulated: 0.650)\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "k_tau (7.225217391304348, 5.815416666666667, 6.632916666666667, 6.968333333333334, 6.0408333333333335, 5.284166666666667, 4.99, 4.42125, 3.9050000000000002, 5.92625, 5.6375, 7.366666666666667, 5.835, 7.787083333333333, 8.139166666666666, 4.799166666666666, 5.17875) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.8823529411764705\n",
      "k_tau (7.132286761482418, 4.862278813139331, 6.157394561436324, 6.808124622841381, 5.283508319518634, 3.887296959851807, 3.3930343055223524, 2.5893801217261125, 1.939651565861822, 5.055792381388108, 4.47494564943531, 7.360107003223815, 4.844207048311629, 7.928798843084802, 8.376186066716967, 3.0941657064961094, 3.755333962941543) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.8676470588235293\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 288.6419, p=0.0000\n",
      "Kruskal-Wallis: 1412.4994, p=0.0000\n",
      "Pearson r=0.9643\n",
      "Kendall τ=0.8941\n",
      "Std.Dev across models: 1.1312\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.108 ±0.095\n",
      "gpt-4o-2024-11-20....................... 7.675 ±0.096\n",
      "claude-3-5-sonnet-20240620.............. 7.202 ±0.125\n",
      "gemini-1.5-pro-002...................... 7.202 ±0.105\n",
      "gemini-1.5-pro-001...................... 7.065 ±0.114\n",
      "claude-3-opus-20240229.................. 6.487 ±0.167\n",
      "Llama-3-70b-chat-hf..................... 6.068 ±0.129\n",
      "claude-3-haiku-20240307................. 5.830 ±0.137\n",
      "Mistral-Large-Instruct-2411............. 5.797 ±0.193\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.645 ±0.149\n",
      "c4ai-command-r-08-2024.................. 5.437 ±0.130\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.315 ±0.142\n",
      "databricks/dbrx-instruct................ 5.053 ±0.154\n",
      "gpt-3.5-turbo-0125...................... 4.885 ±0.115\n",
      "Llama-2-13b-chat-hf..................... 4.817 ±0.120\n",
      "gemma-7b-it............................. 4.543 ±0.129\n",
      "gemma-2b-it............................. 4.095 ±0.150\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 292.0344, p=0.0000\n",
      "Kruskal-Wallis: 1412.4994, p=0.0000\n",
      "Pearson r=0.9603\n",
      "Kendall τ=0.8941\n",
      "Std.Dev across models: 1.8233\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.339 ±0.120\n",
      "gpt-4o-2024-11-20....................... 7.782 ±0.127\n",
      "gemini-1.5-pro-002...................... 7.134 ±0.152\n",
      "claude-3-5-sonnet-20240620.............. 7.117 ±0.180\n",
      "gemini-1.5-pro-001...................... 6.933 ±0.172\n",
      "claude-3-opus-20240229.................. 5.954 ±0.278\n",
      "Llama-3-70b-chat-hf..................... 5.312 ±0.225\n",
      "claude-3-haiku-20240307................. 4.880 ±0.246\n",
      "Mistral-Large-Instruct-2411............. 4.803 ±0.326\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.551 ±0.262\n",
      "c4ai-command-r-08-2024.................. 4.151 ±0.235\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 3.961 ±0.247\n",
      "databricks/dbrx-instruct................ 3.523 ±0.246\n",
      "gpt-3.5-turbo-0125...................... 3.229 ±0.177\n",
      "Llama-2-13b-chat-hf..................... 3.141 ±0.187\n",
      "gemma-7b-it............................. 2.730 ±0.185\n",
      "gemma-2b-it............................. 2.189 ±0.188\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.688\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "4.375\n",
      "CI99 Overlap pct: \n",
      "0.387\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.349\n",
      "Average EMD across all pairs: 1.373\n",
      "Avg. CI95 half-width: 0.132 (modulated: 0.856)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.750\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "7.390\n",
      "CI99 Overlap pct: \n",
      "0.393\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.350\n",
      "Average EMD across all pairs: 2.218\n",
      "Avg. CI95 half-width: 0.209 (modulated: 0.650)\n",
      "\n",
      "Score stability (RAW)\n",
      "Randomized average Kendall's tau (raw): 0.939\n",
      "Score stability (CALIBRATED)\n",
      "Randomized average Kendall's tau (calibrated): 0.937 \n",
      "(0.8941176470588236)\n",
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 288.6419, p=0.0000\n",
      "Kruskal-Wallis: 1412.4994, p=0.0000\n",
      "Pearson r=0.9643\n",
      "Kendall τ=0.8941\n",
      "Std.Dev across models: 1.1312\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.108 ±0.095\n",
      "gpt-4o-2024-11-20....................... 7.675 ±0.096\n",
      "claude-3-5-sonnet-20240620.............. 7.202 ±0.125\n",
      "gemini-1.5-pro-002...................... 7.202 ±0.105\n",
      "gemini-1.5-pro-001...................... 7.065 ±0.114\n",
      "claude-3-opus-20240229.................. 6.487 ±0.167\n",
      "Llama-3-70b-chat-hf..................... 6.068 ±0.129\n",
      "claude-3-haiku-20240307................. 5.830 ±0.137\n",
      "Mistral-Large-Instruct-2411............. 5.797 ±0.193\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.645 ±0.149\n",
      "c4ai-command-r-08-2024.................. 5.437 ±0.130\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.315 ±0.142\n",
      "databricks/dbrx-instruct................ 5.053 ±0.154\n",
      "gpt-3.5-turbo-0125...................... 4.885 ±0.115\n",
      "Llama-2-13b-chat-hf..................... 4.817 ±0.120\n",
      "gemma-7b-it............................. 4.543 ±0.129\n",
      "gemma-2b-it............................. 4.095 ±0.150\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 292.0344, p=0.0000\n",
      "Kruskal-Wallis: 1412.4994, p=0.0000\n",
      "Pearson r=0.9603\n",
      "Kendall τ=0.8941\n",
      "Std.Dev across models: 1.8233\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.339 ±0.120\n",
      "gpt-4o-2024-11-20....................... 7.782 ±0.127\n",
      "gemini-1.5-pro-002...................... 7.134 ±0.152\n",
      "claude-3-5-sonnet-20240620.............. 7.117 ±0.180\n",
      "gemini-1.5-pro-001...................... 6.933 ±0.172\n",
      "claude-3-opus-20240229.................. 5.954 ±0.278\n",
      "Llama-3-70b-chat-hf..................... 5.312 ±0.225\n",
      "claude-3-haiku-20240307................. 4.880 ±0.246\n",
      "Mistral-Large-Instruct-2411............. 4.803 ±0.326\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.551 ±0.262\n",
      "c4ai-command-r-08-2024.................. 4.151 ±0.235\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 3.961 ±0.247\n",
      "databricks/dbrx-instruct................ 3.523 ±0.246\n",
      "gpt-3.5-turbo-0125...................... 3.229 ±0.177\n",
      "Llama-2-13b-chat-hf..................... 3.141 ±0.187\n",
      "gemma-7b-it............................. 2.730 ±0.185\n",
      "gemma-2b-it............................. 2.189 ±0.188\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "Final Judgemark (raw)   = 0.678\n",
      "Final Judgemark (cal)  = 0.731\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 2.924\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 3.290\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "k_tau (6.78, 4.12, 3.78, 6.94, 2.99, 2.19, 2.01, 3.74, 0.91, 4.72, 3.41, 7.67, 6.08, 9.31, 9.56, 3.39, 3.08) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.7205882352941176\n",
      "k_tau (7.0, 5.226666666666667, 5.0, 7.172661870503597, 2.8755760368663594, 1.7695852534562209, 1.5207373271889395, 4.885714285714287, 0.0, 5.626666666666667, 3.9428571428571435, 7.960431654676259, 6.533333333333333, 9.73021582733813, 10.0, 3.885714285714286, 3.0) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.7205882352941176\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=0.9137\n",
      "Kendall τ=0.7206\n",
      "Std.Dev across models: 2.4715\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 9.560 ±0.000\n",
      "gpt-4o-2024-11-20....................... 9.310 ±0.000\n",
      "gemini-1.5-pro-002...................... 7.670 ±0.000\n",
      "gemini-1.5-pro-001...................... 6.940 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 6.780 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 6.080 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.720 ±0.000\n",
      "claude-3-haiku-20240307................. 4.120 ±0.000\n",
      "claude-3-opus-20240229.................. 3.780 ±0.000\n",
      "gemma-7b-it............................. 3.740 ±0.000\n",
      "c4ai-command-r-08-2024.................. 3.410 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 3.390 ±0.000\n",
      "databricks/dbrx-instruct................ 3.080 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 2.990 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 2.190 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 2.010 ±0.000\n",
      "gemma-2b-it............................. 0.910 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=0.8873\n",
      "Kendall τ=0.7206\n",
      "Std.Dev across models: 2.7225\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 10.000 ±0.000\n",
      "gpt-4o-2024-11-20....................... 9.730 ±0.000\n",
      "gemini-1.5-pro-002...................... 7.960 ±0.000\n",
      "gemini-1.5-pro-001...................... 7.173 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 7.000 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 6.533 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.627 ±0.000\n",
      "claude-3-haiku-20240307................. 5.227 ±0.000\n",
      "claude-3-opus-20240229.................. 5.000 ±0.000\n",
      "gemma-7b-it............................. 4.886 ±0.000\n",
      "c4ai-command-r-08-2024.................. 3.943 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 3.886 ±0.000\n",
      "databricks/dbrx-instruct................ 3.000 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 2.876 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 1.770 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 1.521 ±0.000\n",
      "gemma-2b-it............................. 0.000 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 2.924\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 3.290\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n",
      "Score stability (RAW)\n",
      "Randomized average Kendall's tau (raw): 0.938\n",
      "Score stability (CALIBRATED)\n",
      "Randomized average Kendall's tau (calibrated): 1.000 \n",
      "(0.7205882352941176)\n",
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=0.9137\n",
      "Kendall τ=0.7206\n",
      "Std.Dev across models: 2.4715\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 9.560 ±0.000\n",
      "gpt-4o-2024-11-20....................... 9.310 ±0.000\n",
      "gemini-1.5-pro-002...................... 7.670 ±0.000\n",
      "gemini-1.5-pro-001...................... 6.940 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 6.780 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 6.080 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.720 ±0.000\n",
      "claude-3-haiku-20240307................. 4.120 ±0.000\n",
      "claude-3-opus-20240229.................. 3.780 ±0.000\n",
      "gemma-7b-it............................. 3.740 ±0.000\n",
      "c4ai-command-r-08-2024.................. 3.410 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 3.390 ±0.000\n",
      "databricks/dbrx-instruct................ 3.080 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 2.990 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 2.190 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 2.010 ±0.000\n",
      "gemma-2b-it............................. 0.910 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=0.8873\n",
      "Kendall τ=0.7206\n",
      "Std.Dev across models: 2.7225\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 10.000 ±0.000\n",
      "gpt-4o-2024-11-20....................... 9.730 ±0.000\n",
      "gemini-1.5-pro-002...................... 7.960 ±0.000\n",
      "gemini-1.5-pro-001...................... 7.173 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 7.000 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 6.533 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.627 ±0.000\n",
      "claude-3-haiku-20240307................. 5.227 ±0.000\n",
      "claude-3-opus-20240229.................. 5.000 ±0.000\n",
      "gemma-7b-it............................. 4.886 ±0.000\n",
      "c4ai-command-r-08-2024.................. 3.943 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 3.886 ±0.000\n",
      "databricks/dbrx-instruct................ 3.000 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 2.876 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 1.770 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 1.521 ±0.000\n",
      "gemma-2b-it............................. 0.000 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "Final Judgemark (raw)   = 0.770\n",
      "Final Judgemark (cal)  = 0.817\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.812\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "12.065\n",
      "CI99 Overlap pct: \n",
      "0.545\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.236\n",
      "Average EMD across all pairs: 1.986\n",
      "Avg. CI95 half-width: 0.276 (modulated: 0.455)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.750\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "12.176\n",
      "CI99 Overlap pct: \n",
      "0.541\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.245\n",
      "Average EMD across all pairs: 2.290\n",
      "Avg. CI95 half-width: 0.291 (modulated: 0.417)\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "k_tau (8.220869565217392, 5.80125, 7.27375, 8.11125, 6.39, 5.24375, 4.78125, 3.944583333333333, 3.736666666666667, 6.517916666666666, 5.543333333333333, 8.309583333333332, 5.9470833333333335, 8.8825, 9.39625, 4.848333333333334, 5.3950000000000005) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.8676470588235293\n",
      "k_tau (7.446300834558465, 4.5436269798063895, 6.228373102431448, 7.239020465175945, 5.013594241179416, 3.8764050190536556, 3.5040094460962896, 2.721210204036068, 2.520913984157169, 5.163433697095243, 4.118145852929785, 7.478034568536447, 4.878805960570984, 8.399918697566477, 9.08175425230716, 3.4782581517193267, 3.9846941198188115) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.8529411764705882\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 139.7619, p=0.0000\n",
      "Kruskal-Wallis: 1126.6904, p=0.0000\n",
      "Pearson r=0.9509\n",
      "Kendall τ=0.8618\n",
      "Std.Dev across models: 1.6457\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 9.435 ±0.128\n",
      "gpt-4o-2024-11-20....................... 8.832 ±0.208\n",
      "gemini-1.5-pro-002...................... 8.224 ±0.213\n",
      "claude-3-5-sonnet-20240620.............. 8.049 ±0.263\n",
      "gemini-1.5-pro-001...................... 7.963 ±0.235\n",
      "claude-3-opus-20240229.................. 7.010 ±0.314\n",
      "Llama-3-70b-chat-hf..................... 6.308 ±0.262\n",
      "Mistral-Large-Instruct-2411............. 5.936 ±0.370\n",
      "claude-3-haiku-20240307................. 5.877 ±0.288\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.783 ±0.297\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.319 ±0.305\n",
      "c4ai-command-r-08-2024.................. 5.193 ±0.296\n",
      "databricks/dbrx-instruct................ 5.135 ±0.277\n",
      "gpt-3.5-turbo-0125...................... 4.703 ±0.286\n",
      "Llama-2-13b-chat-hf..................... 4.520 ±0.327\n",
      "gemma-7b-it............................. 4.174 ±0.309\n",
      "gemma-2b-it............................. 3.969 ±0.314\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 174.5486, p=0.0000\n",
      "Kruskal-Wallis: 1126.6904, p=0.0000\n",
      "Pearson r=0.9483\n",
      "Kendall τ=0.8588\n",
      "Std.Dev across models: 1.9269\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 9.138 ±0.190\n",
      "gpt-4o-2024-11-20....................... 8.269 ±0.276\n",
      "gemini-1.5-pro-002...................... 7.387 ±0.281\n",
      "claude-3-5-sonnet-20240620.............. 7.189 ±0.332\n",
      "gemini-1.5-pro-001...................... 7.038 ±0.307\n",
      "claude-3-opus-20240229.................. 5.884 ±0.375\n",
      "Llama-3-70b-chat-hf..................... 4.977 ±0.284\n",
      "Mistral-Large-Instruct-2411............. 4.702 ±0.399\n",
      "claude-3-haiku-20240307................. 4.524 ±0.292\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.436 ±0.295\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 3.965 ±0.289\n",
      "c4ai-command-r-08-2024.................. 3.827 ±0.282\n",
      "databricks/dbrx-instruct................ 3.745 ±0.260\n",
      "gpt-3.5-turbo-0125...................... 3.346 ±0.258\n",
      "Llama-2-13b-chat-hf..................... 3.222 ±0.293\n",
      "gemma-7b-it............................. 2.891 ±0.270\n",
      "gemma-2b-it............................. 2.723 ±0.271\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.812\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "12.065\n",
      "CI99 Overlap pct: \n",
      "0.545\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.236\n",
      "Average EMD across all pairs: 1.986\n",
      "Avg. CI95 half-width: 0.276 (modulated: 0.455)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.750\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "12.176\n",
      "CI99 Overlap pct: \n",
      "0.541\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.245\n",
      "Average EMD across all pairs: 2.290\n",
      "Avg. CI95 half-width: 0.291 (modulated: 0.417)\n",
      "\n",
      "Score stability (RAW)\n",
      "Randomized average Kendall's tau (raw): 0.937\n",
      "Score stability (CALIBRATED)\n",
      "Randomized average Kendall's tau (calibrated): 0.880 \n",
      "(0.8588235294117647)\n",
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 139.7619, p=0.0000\n",
      "Kruskal-Wallis: 1126.6904, p=0.0000\n",
      "Pearson r=0.9509\n",
      "Kendall τ=0.8618\n",
      "Std.Dev across models: 1.6457\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 9.435 ±0.128\n",
      "gpt-4o-2024-11-20....................... 8.832 ±0.208\n",
      "gemini-1.5-pro-002...................... 8.224 ±0.213\n",
      "claude-3-5-sonnet-20240620.............. 8.049 ±0.263\n",
      "gemini-1.5-pro-001...................... 7.963 ±0.235\n",
      "claude-3-opus-20240229.................. 7.010 ±0.314\n",
      "Llama-3-70b-chat-hf..................... 6.308 ±0.262\n",
      "Mistral-Large-Instruct-2411............. 5.936 ±0.370\n",
      "claude-3-haiku-20240307................. 5.877 ±0.288\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.783 ±0.297\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.319 ±0.305\n",
      "c4ai-command-r-08-2024.................. 5.193 ±0.296\n",
      "databricks/dbrx-instruct................ 5.135 ±0.277\n",
      "gpt-3.5-turbo-0125...................... 4.703 ±0.286\n",
      "Llama-2-13b-chat-hf..................... 4.520 ±0.327\n",
      "gemma-7b-it............................. 4.174 ±0.309\n",
      "gemma-2b-it............................. 3.969 ±0.314\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 174.5486, p=0.0000\n",
      "Kruskal-Wallis: 1126.6904, p=0.0000\n",
      "Pearson r=0.9483\n",
      "Kendall τ=0.8588\n",
      "Std.Dev across models: 1.9269\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 9.138 ±0.190\n",
      "gpt-4o-2024-11-20....................... 8.269 ±0.276\n",
      "gemini-1.5-pro-002...................... 7.387 ±0.281\n",
      "claude-3-5-sonnet-20240620.............. 7.189 ±0.332\n",
      "gemini-1.5-pro-001...................... 7.038 ±0.307\n",
      "claude-3-opus-20240229.................. 5.884 ±0.375\n",
      "Llama-3-70b-chat-hf..................... 4.977 ±0.284\n",
      "Mistral-Large-Instruct-2411............. 4.702 ±0.399\n",
      "claude-3-haiku-20240307................. 4.524 ±0.292\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.436 ±0.295\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 3.965 ±0.289\n",
      "c4ai-command-r-08-2024.................. 3.827 ±0.282\n",
      "databricks/dbrx-instruct................ 3.745 ±0.260\n",
      "gpt-3.5-turbo-0125...................... 3.346 ±0.258\n",
      "Llama-2-13b-chat-hf..................... 3.222 ±0.293\n",
      "gemma-7b-it............................. 2.891 ±0.270\n",
      "gemma-2b-it............................. 2.723 ±0.271\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "Final Judgemark (raw)   = 0.647\n",
      "Final Judgemark (cal)  = 0.658\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 2.924\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 3.290\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "k_tau (6.78, 4.12, 3.78, 6.94, 2.99, 2.19, 2.01, 3.74, 0.91, 4.72, 3.41, 7.67, 6.08, 9.31, 9.56, 3.39, 3.08) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.7205882352941176\n",
      "k_tau (7.0, 5.226666666666667, 5.0, 7.172661870503597, 2.8755760368663594, 1.7695852534562209, 1.5207373271889395, 4.885714285714287, 0.0, 5.626666666666667, 3.9428571428571435, 7.960431654676259, 6.533333333333333, 9.73021582733813, 10.0, 3.885714285714286, 3.0) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.7205882352941176\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=0.9137\n",
      "Kendall τ=0.7206\n",
      "Std.Dev across models: 2.4715\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 9.560 ±0.000\n",
      "gpt-4o-2024-11-20....................... 9.310 ±0.000\n",
      "gemini-1.5-pro-002...................... 7.670 ±0.000\n",
      "gemini-1.5-pro-001...................... 6.940 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 6.780 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 6.080 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.720 ±0.000\n",
      "claude-3-haiku-20240307................. 4.120 ±0.000\n",
      "claude-3-opus-20240229.................. 3.780 ±0.000\n",
      "gemma-7b-it............................. 3.740 ±0.000\n",
      "c4ai-command-r-08-2024.................. 3.410 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 3.390 ±0.000\n",
      "databricks/dbrx-instruct................ 3.080 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 2.990 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 2.190 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 2.010 ±0.000\n",
      "gemma-2b-it............................. 0.910 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=0.8873\n",
      "Kendall τ=0.7206\n",
      "Std.Dev across models: 2.7225\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 10.000 ±0.000\n",
      "gpt-4o-2024-11-20....................... 9.730 ±0.000\n",
      "gemini-1.5-pro-002...................... 7.960 ±0.000\n",
      "gemini-1.5-pro-001...................... 7.173 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 7.000 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 6.533 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.627 ±0.000\n",
      "claude-3-haiku-20240307................. 5.227 ±0.000\n",
      "claude-3-opus-20240229.................. 5.000 ±0.000\n",
      "gemma-7b-it............................. 4.886 ±0.000\n",
      "c4ai-command-r-08-2024.................. 3.943 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 3.886 ±0.000\n",
      "databricks/dbrx-instruct................ 3.000 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 2.876 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 1.770 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 1.521 ±0.000\n",
      "gemma-2b-it............................. 0.000 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 2.924\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.000\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "0.000\n",
      "CI99 Overlap pct: \n",
      "0.000\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.000\n",
      "Average EMD across all pairs: 3.290\n",
      "Avg. CI95 half-width: 0.000 (modulated: 0.000)\n",
      "\n",
      "Score stability (RAW)\n",
      "Randomized average Kendall's tau (raw): 0.938\n",
      "Score stability (CALIBRATED)\n",
      "Randomized average Kendall's tau (calibrated): 1.000 \n",
      "(0.7205882352941176)\n",
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=0.9137\n",
      "Kendall τ=0.7206\n",
      "Std.Dev across models: 2.4715\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 9.560 ±0.000\n",
      "gpt-4o-2024-11-20....................... 9.310 ±0.000\n",
      "gemini-1.5-pro-002...................... 7.670 ±0.000\n",
      "gemini-1.5-pro-001...................... 6.940 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 6.780 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 6.080 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.720 ±0.000\n",
      "claude-3-haiku-20240307................. 4.120 ±0.000\n",
      "claude-3-opus-20240229.................. 3.780 ±0.000\n",
      "gemma-7b-it............................. 3.740 ±0.000\n",
      "c4ai-command-r-08-2024.................. 3.410 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 3.390 ±0.000\n",
      "databricks/dbrx-instruct................ 3.080 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 2.990 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 2.190 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 2.010 ±0.000\n",
      "gemma-2b-it............................. 0.910 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: inf, p=0.0000\n",
      "Kruskal-Wallis: 2039.0000, p=0.0000\n",
      "Pearson r=0.8873\n",
      "Kendall τ=0.7206\n",
      "Std.Dev across models: 2.7225\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 10.000 ±0.000\n",
      "gpt-4o-2024-11-20....................... 9.730 ±0.000\n",
      "gemini-1.5-pro-002...................... 7.960 ±0.000\n",
      "gemini-1.5-pro-001...................... 7.173 ±0.000\n",
      "claude-3-5-sonnet-20240620.............. 7.000 ±0.000\n",
      "Mistral-Large-Instruct-2411............. 6.533 ±0.000\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.627 ±0.000\n",
      "claude-3-haiku-20240307................. 5.227 ±0.000\n",
      "claude-3-opus-20240229.................. 5.000 ±0.000\n",
      "gemma-7b-it............................. 4.886 ±0.000\n",
      "c4ai-command-r-08-2024.................. 3.943 ±0.000\n",
      "gpt-3.5-turbo-0125...................... 3.886 ±0.000\n",
      "databricks/dbrx-instruct................ 3.000 ±0.000\n",
      "Llama-3-70b-chat-hf..................... 2.876 ±0.000\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 1.770 ±0.000\n",
      "Llama-2-13b-chat-hf..................... 1.521 ±0.000\n",
      "gemma-2b-it............................. 0.000 ±0.000\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "Final Judgemark (raw)   = 0.770\n",
      "Final Judgemark (cal)  = 0.817\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.875\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "8.662\n",
      "CI99 Overlap pct: \n",
      "0.721\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.188\n",
      "Average EMD across all pairs: 1.135\n",
      "Avg. CI95 half-width: 0.173 (modulated: 0.709)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.875\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "13.586\n",
      "CI99 Overlap pct: \n",
      "0.719\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.197\n",
      "Average EMD across all pairs: 1.951\n",
      "Avg. CI95 half-width: 0.271 (modulated: 0.460)\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "k_tau (7.514782608695652, 5.555, 6.242916666666667, 7.737916666666667, 6.082916666666667, 5.50375, 5.425, 5.32875, 5.334583333333334, 6.009166666666666, 5.797083333333333, 7.94, 6.3, 7.763333333333334, 8.193333333333333, 5.657916666666667, 5.797083333333333) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.848714265309258\n",
      "k_tau (7.114819971272161, 3.7866400647121776, 4.858965293368371, 7.591021334315976, 4.415268142990601, 3.599178958688356, 3.656883195032287, 3.4064084962858456, 3.363512206057602, 4.395906550294891, 4.062547933406487, 7.853191520408759, 4.985519118857515, 7.622259181521651, 8.2437820745949, 3.9086342919171844, 4.100223274860898) (1243, 1163, 1240, 1276, 1214, 1114, 1050, 1029, 989, 1147, 1159, 1333, 1246, 1402, 1430, 1099, 1102) 0.8382352941176471\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 131.4401, p=0.0000\n",
      "Kruskal-Wallis: 1091.1649, p=0.0000\n",
      "Pearson r=0.8928\n",
      "Kendall τ=0.7992\n",
      "Std.Dev across models: 0.9941\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.162 ±0.102\n",
      "gpt-4o-2024-11-20....................... 7.774 ±0.134\n",
      "gemini-1.5-pro-002...................... 7.769 ±0.145\n",
      "gemini-1.5-pro-001...................... 7.662 ±0.153\n",
      "claude-3-5-sonnet-20240620.............. 7.628 ±0.197\n",
      "claude-3-opus-20240229.................. 6.298 ±0.178\n",
      "Mistral-Large-Instruct-2411............. 6.092 ±0.193\n",
      "Llama-3-70b-chat-hf..................... 6.016 ±0.139\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.824 ±0.183\n",
      "claude-3-haiku-20240307................. 5.687 ±0.178\n",
      "databricks/dbrx-instruct................ 5.659 ±0.182\n",
      "c4ai-command-r-08-2024.................. 5.637 ±0.183\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.600 ±0.189\n",
      "gpt-3.5-turbo-0125...................... 5.510 ±0.189\n",
      "Llama-2-13b-chat-hf..................... 5.389 ±0.208\n",
      "gemma-7b-it............................. 5.352 ±0.199\n",
      "gemma-2b-it............................. 5.315 ±0.192\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 161.9000, p=0.0000\n",
      "Kruskal-Wallis: 1091.1649, p=0.0000\n",
      "Pearson r=0.8851\n",
      "Kendall τ=0.7971\n",
      "Std.Dev across models: 1.7305\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.231 ±0.143\n",
      "gpt-4o-2024-11-20....................... 7.631 ±0.221\n",
      "gemini-1.5-pro-002...................... 7.581 ±0.240\n",
      "gemini-1.5-pro-001...................... 7.430 ±0.254\n",
      "claude-3-5-sonnet-20240620.............. 7.340 ±0.319\n",
      "claude-3-opus-20240229.................. 4.962 ±0.310\n",
      "Mistral-Large-Instruct-2411............. 4.596 ±0.325\n",
      "Llama-3-70b-chat-hf..................... 4.364 ±0.218\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.162 ±0.286\n",
      "claude-3-haiku-20240307................. 3.913 ±0.277\n",
      "databricks/dbrx-instruct................ 3.881 ±0.280\n",
      "c4ai-command-r-08-2024.................. 3.856 ±0.282\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 3.793 ±0.284\n",
      "gpt-3.5-turbo-0125...................... 3.665 ±0.285\n",
      "Llama-2-13b-chat-hf..................... 3.525 ±0.314\n",
      "gemma-7b-it............................. 3.443 ±0.296\n",
      "gemma-2b-it............................. 3.363 ±0.278\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- RAW SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.875\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "8.662\n",
      "CI99 Overlap pct: \n",
      "0.721\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.188\n",
      "Average EMD across all pairs: 1.135\n",
      "Avg. CI95 half-width: 0.173 (modulated: 0.709)\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "--- CALIBRATED SEPARABILITY METRICS ---\n",
      "Adjacent 99% CI Overlap fraction: 0.875\n",
      "Sum of adjacent 99% CI Overlap magnitude (scale=1.5): \n",
      "13.586\n",
      "CI99 Overlap pct: \n",
      "0.719\n",
      "Avg. |Cohen's d| for adjacent pairs: 0.197\n",
      "Average EMD across all pairs: 1.951\n",
      "Avg. CI95 half-width: 0.271 (modulated: 0.460)\n",
      "\n",
      "Score stability (RAW)\n",
      "Randomized average Kendall's tau (raw): 0.938\n",
      "Score stability (CALIBRATED)\n",
      "Randomized average Kendall's tau (calibrated): 0.723 \n",
      "(0.7970588235294117)\n",
      "\n",
      "\n",
      "\n",
      "------- RAW SCORES Summary -------\n",
      "ANOVA F-value: 131.4401, p=0.0000\n",
      "Kruskal-Wallis: 1091.1649, p=0.0000\n",
      "Pearson r=0.8928\n",
      "Kendall τ=0.7992\n",
      "Std.Dev across models: 0.9941\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.162 ±0.102\n",
      "gpt-4o-2024-11-20....................... 7.774 ±0.134\n",
      "gemini-1.5-pro-002...................... 7.769 ±0.145\n",
      "gemini-1.5-pro-001...................... 7.662 ±0.153\n",
      "claude-3-5-sonnet-20240620.............. 7.628 ±0.197\n",
      "claude-3-opus-20240229.................. 6.298 ±0.178\n",
      "Mistral-Large-Instruct-2411............. 6.092 ±0.193\n",
      "Llama-3-70b-chat-hf..................... 6.016 ±0.139\n",
      "Mixtral-8x22B-Instruct-v0.1............. 5.824 ±0.183\n",
      "claude-3-haiku-20240307................. 5.687 ±0.178\n",
      "databricks/dbrx-instruct................ 5.659 ±0.182\n",
      "c4ai-command-r-08-2024.................. 5.637 ±0.183\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 5.600 ±0.189\n",
      "gpt-3.5-turbo-0125...................... 5.510 ±0.189\n",
      "Llama-2-13b-chat-hf..................... 5.389 ±0.208\n",
      "gemma-7b-it............................. 5.352 ±0.199\n",
      "gemma-2b-it............................. 5.315 ±0.192\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "------- CALIBRATED SCORES Summary -------\n",
      "ANOVA F-value: 161.9000, p=0.0000\n",
      "Kruskal-Wallis: 1091.1649, p=0.0000\n",
      "Pearson r=0.8851\n",
      "Kendall τ=0.7971\n",
      "Std.Dev across models: 1.7305\n",
      "\n",
      "Model Scores:\n",
      "DeepSeek-R1............................. 8.231 ±0.143\n",
      "gpt-4o-2024-11-20....................... 7.631 ±0.221\n",
      "gemini-1.5-pro-002...................... 7.581 ±0.240\n",
      "gemini-1.5-pro-001...................... 7.430 ±0.254\n",
      "claude-3-5-sonnet-20240620.............. 7.340 ±0.319\n",
      "claude-3-opus-20240229.................. 4.962 ±0.310\n",
      "Mistral-Large-Instruct-2411............. 4.596 ±0.325\n",
      "Llama-3-70b-chat-hf..................... 4.364 ±0.218\n",
      "Mixtral-8x22B-Instruct-v0.1............. 4.162 ±0.286\n",
      "claude-3-haiku-20240307................. 3.913 ±0.277\n",
      "databricks/dbrx-instruct................ 3.881 ±0.280\n",
      "c4ai-command-r-08-2024.................. 3.856 ±0.282\n",
      "Mixtral-8x7B-Instruct-v0.1.............. 3.793 ±0.284\n",
      "gpt-3.5-turbo-0125...................... 3.665 ±0.285\n",
      "Llama-2-13b-chat-hf..................... 3.525 ±0.314\n",
      "gemma-7b-it............................. 3.443 ±0.296\n",
      "gemma-2b-it............................. 3.363 ±0.278\n",
      "\n",
      "------------------------------------\n",
      "\n",
      "Final Judgemark (raw)   = 0.562\n",
      "Final Judgemark (cal)  = 0.551\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saving results to ../outputs/my_judgemark_runs_20250729_050856_8b031c01-ac20-4f6d-8838-2d8481c61a55__deepseek_deepseek-r1.parquet\n",
      "| name          | judgemark_score | judgemark_score_calib | stability | stability_calib | separability | separability_calib | human_correlation | human_correlation_calib |\n",
      "|---------------|-----------------|-----------------------|-----------|-----------------|--------------|--------------------|-------------------|-------------------------|\n",
      "| weighted      | 0.668           | 0.716                 | 0.895     | 0.879           | 0.558        | 0.635              | 0.881             | 0.876                   |\n",
      "| raw           | 0.678           | 0.731                 | 0.898     | 0.895           | 0.572        | 0.652              | 0.882             | 0.882                   |\n",
      "| ranked        | 0.77            | 0.817                 | 0.897     | 1.0             | 0.758        | 0.804              | 0.69              | 0.69                    |\n",
      "| ranked_norm   | 0.647           | 0.658                 | 0.895     | 0.8             | 0.535        | 0.576              | 0.846             | 0.843                   |\n",
      "| ranked_scaled | 0.77            | 0.817                 | 0.897     | 1.0             | 0.758        | 0.804              | 0.69              | 0.69                    |\n",
      "| weighted_norm | 0.562           | 0.551                 | 0.897     | 0.538           | 0.424        | 0.498              | 0.777             | 0.775                   |\n",
      "| name          | judgemark_score | judgemark_score_calib | stability | stability_calib | separability | separability_calib | human_correlation | human_correlation_calib |\n",
      "|---------------|-----------------|-----------------------|-----------|-----------------|--------------|--------------------|-------------------|-------------------------|\n",
      "| weighted      | 0.668           | 0.716                 | 0.895     | 0.879           | 0.558        | 0.635              | 0.881             | 0.876                   |\n",
      "| raw           | 0.678           | 0.731                 | 0.898     | 0.895           | 0.572        | 0.652              | 0.882             | 0.882                   |\n",
      "| ranked        | 0.77            | 0.817                 | 0.897     | 1.0             | 0.758        | 0.804              | 0.69              | 0.69                    |\n",
      "| ranked_norm   | 0.647           | 0.658                 | 0.895     | 0.8             | 0.535        | 0.576              | 0.846             | 0.843                   |\n",
      "| ranked_scaled | 0.77            | 0.817                 | 0.897     | 1.0             | 0.758        | 0.804              | 0.69              | 0.69                    |\n",
      "| weighted_norm | 0.562           | 0.551                 | 0.897     | 0.538           | 0.424        | 0.498              | 0.777             | 0.775                   |\n",
      "\n",
      "Run ID: 8b031c01-ac20-4f6d-8838-2d8481c61a55__deepseek_deepseek-r1\n",
      "\n",
      "\n",
      "Results saved to ../outputs/my_judgemark_runs_20250729_050856_8b031c01-ac20-4f6d-8838-2d8481c61a55__deepseek_deepseek-r1.parquet and ../outputs/my_judgemark_runs_20250729_050856.md\n",
      "\n",
      "\n",
      "Processing ../outputs/my_judgemark_runs_20250730_185640.json...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "364bd0ec8f7940008ad06a292f8566a1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Recomputing scores with choice norm:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Error processing run 4ca69750-c359-417d-a1da-c6c4d1c3f8c7__qwen_qwen3-235b-a22b: 'logp'\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Processing run 4ca69750-c359-417d-a1da-c6c4d1c3f8c7__qwen_qwen3-235b-a22b...\n"
     ]
    }
   ],
   "source": [
    "# Recompute every scoring variant for each run in each saved runs file, then\n",
    "# write one parquet per run and append a markdown table to a per-file report.\n",
    "\n",
    "# Maps the row name used in the summary table to the aggregated score key.\n",
    "# NOTE(review): 'ranked_scaled' reuses the same key as 'ranked', so both rows\n",
    "# come out identical in the saved tables - confirm which key was intended.\n",
    "score_keys_by_name = {\n",
    "    \"weighted\": \"aggregated_score_weighted\",\n",
    "    \"raw\": \"aggregated_score_raw\",\n",
    "    \"ranked\": \"aggregated_score_ranked\",\n",
    "    \"ranked_norm\": \"aggregated_score_ranked_norm\",\n",
    "    \"ranked_scaled\": \"aggregated_score_ranked\",\n",
    "    \"weighted_norm\": \"aggregated_score_weighted_norm\",\n",
    "}\n",
    "\n",
    "for f in fs:\n",
    "    print(f\"\\n\\nProcessing {f}...\")\n",
    "    try:\n",
    "        runs = load_json_file(f)\n",
    "    except Exception as e:\n",
    "        print(f\"Error loading {f}: {e}\")\n",
    "        continue\n",
    "\n",
    "    for run_id in tqdm(runs.keys(), desc=\"Recomputing scores with choice norm\"):\n",
    "        print(f\"\\nProcessing run {run_id}...\")\n",
    "        run = runs[run_id]\n",
    "        if (run is None) or len(run.get('results', {})) == 0:\n",
    "            # Report only the file and run id; interpolating `runs` here would\n",
    "            # dump the entire loaded JSON payload into the cell output.\n",
    "            print(f\"Skipping run `{run_id}` in `{f}` as it is None or has no results.\")\n",
    "            continue\n",
    "        results = {}\n",
    "\n",
    "        try:\n",
    "            do_plot = 0\n",
    "            runs[run_id] = recompute_scores_with_choice_norm(run)\n",
    "            for name, score_key in score_keys_by_name.items():\n",
    "                results[name] = finalize_scores_and_compute_judgemark(\n",
    "                    runs, run_id, samples_data, score_key=score_key, do_plot=do_plot\n",
    "                )\n",
    "\n",
    "            # One row per scoring variant, with the variant name as a column.\n",
    "            data = [{\"name\": k, **v} for k, v in results.items()]\n",
    "            df = pl.DataFrame(data)\n",
    "            outfile = f.parent / (f.stem + f\"_{run_id}.parquet\")\n",
    "            print(f\"Saving results to {outfile}\")\n",
    "            s = df2md(df)\n",
    "            s += f\"\\n\\nRun ID: {run_id}\\n\\n\"\n",
    "            print(s)\n",
    "            df.write_parquet(outfile)\n",
    "            f_md = Path(f).with_suffix('.md')\n",
    "            # Context manager so the report handle is flushed and closed per run.\n",
    "            with f_md.open('a', encoding='utf-8') as fh:\n",
    "                fh.write(s + \"\\n\\n\")\n",
    "            print(f\"Results saved to {outfile} and {f_md}\")\n",
    "        except Exception as e:\n",
    "            # Best effort: log and move on to the next run. Values are passed as\n",
    "            # format arguments so braces inside the exception text are not\n",
    "            # re-interpreted by loguru's str.format-style message formatting.\n",
    "            logger.error(\"Error processing run {}: {}\", run_id, e)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "b8cc04c8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa24faea",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "396407ca",
   "metadata": {},
   "source": [
    "# Experiment: Power-law and Softmax Sharpness\n",
    "Below we systematically re-run the ranking method with different power-law transforms and the weighting method with varying softmax scales, to compare how well each variant's scores correlate with human judgments.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e0fff4cb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4ca69750-c359-417d-a1da-c6c4d1c3f8c7__qwen_qwen3-235b-a22b\n"
     ]
    },
    {
     "ename": "KeyError",
     "evalue": "'logp'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[11], line 9\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m run_id \u001b[38;5;129;01min\u001b[39;00m runs:\n\u001b[1;32m      7\u001b[0m     \u001b[38;5;66;03m# Normalize choices and recompute scores\u001b[39;00m\n\u001b[1;32m      8\u001b[0m     \u001b[38;5;28mprint\u001b[39m(run_id)\n\u001b[0;32m----> 9\u001b[0m     runs[run_id] \u001b[38;5;241m=\u001b[39m \u001b[43mrecompute_scores_with_choice_norm\u001b[49m\u001b[43m(\u001b[49m\u001b[43mruns\u001b[49m\u001b[43m[\u001b[49m\u001b[43mrun_id\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     11\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m p \u001b[38;5;129;01min\u001b[39;00m ranking_powers:\n\u001b[1;32m     12\u001b[0m         res \u001b[38;5;241m=\u001b[39m finalize_scores_and_compute_judgemark(runs, run_id, samples_data,\n\u001b[1;32m     13\u001b[0m             score_key\u001b[38;5;241m=\u001b[39m\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maggregated_ranked_norm_pow\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mp\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, do_plot\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n",
      "Cell \u001b[0;32mIn[6], line 8\u001b[0m, in \u001b[0;36mrecompute_scores_with_choice_norm\u001b[0;34m(run)\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m item_id \u001b[38;5;129;01min\u001b[39;00m (run[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mresults\u001b[39m\u001b[38;5;124m'\u001b[39m][model_name][iteration_key]\u001b[38;5;241m.\u001b[39mkeys()):\n\u001b[1;32m      7\u001b[0m     storage_dict \u001b[38;5;241m=\u001b[39m run[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mresults\u001b[39m\u001b[38;5;124m'\u001b[39m][model_name][iteration_key][item_id]\n\u001b[0;32m----> 8\u001b[0m     logp \u001b[38;5;241m=\u001b[39m \u001b[43mstorage_dict\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mlogp\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m      9\u001b[0m     lpv \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(logp\u001b[38;5;241m.\u001b[39mvalues())\n\u001b[1;32m     10\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(lpv) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
      "\u001b[0;31mKeyError\u001b[0m: 'logp'"
     ]
    }
   ],
   "source": [
    "# # Experiment: Power-law ranking and softmax weighting effects on human correlation\n",
    "# ranking_powers = [1, 2, 3]\n",
    "# weighting_scales = [1.0, 2.0, 5.0]\n",
    "\n",
    "# exp_results = []\n",
    "# for run_id in runs:\n",
    "#     # Normalize choices and recompute scores\n",
    "#     print(run_id)\n",
    "#     runs[run_id] = recompute_scores_with_choice_norm(runs[run_id])\n",
    "\n",
    "#     for p in ranking_powers:\n",
    "#         res = finalize_scores_and_compute_judgemark(runs, run_id, samples_data,\n",
    "#             score_key=f\"aggregated_ranked_norm_pow{p}\", do_plot=0)\n",
    "#         exp_results.append({\n",
    "#             'run_id': run_id,\n",
    "#             'method': f'ranked_pow{p}',\n",
    "#             'human_corr_calib': res['human_correlation_calib']\n",
    "#         })\n",
    "#     for s in weighting_scales:\n",
    "#         res = finalize_scores_and_compute_judgemark(runs, run_id, samples_data,\n",
    "#             score_key=f\"aggregated_weighted_norm_scale{s}\", do_plot=0)\n",
    "#         exp_results.append({\n",
    "#             'run_id': run_id,\n",
    "#             'method': f'weighted_scale{s}',\n",
    "#             'human_corr_calib': res['human_correlation_calib']\n",
    "#         })\n",
    "\n",
    "# # Display results in a DataFrame\n",
    "# import pandas as pd\n",
    "# from IPython.display import display\n",
    "\n",
    "# df_exp = pd.DataFrame(exp_results)\n",
    "# display(df_exp)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}