mirror of
https://github.com/wassname/Judgemark-v2lp.git
synced 2026-06-27 16:10:14 +08:00
427 lines
13 KiB
Plaintext
427 lines
13 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "426cbec8",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"The autoreload extension is already loaded. To reload it, use:\n",
|
|
" %reload_ext autoreload\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%load_ext autoreload\n",
|
|
"%autoreload 2"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "bdc690c6",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\n",
|
|
"from judgemark_v2lp.utils.file_io import load_json_file, save_json_file\n",
|
|
"from judgemark_v2lp.benchmark import sanitize_model_name, finalize_scores_and_compute_judgemark\n",
|
|
"import uuid\n",
|
|
"from tqdm import tqdm\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "14a6f25e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "edd6567d",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"File ../data/judgemark_v2.1_samples.json not found, returning empty dict.\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"files []\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from pathlib import Path\n",
|
|
"run_id=None\n",
|
|
"\n",
|
|
"samples_file = \"../data/judgemark_v2.1_samples.json\"\n",
|
|
"samples_data = load_json_file(samples_file)\n",
|
|
"fs = sorted(Path(\"../outputs\").glob(\"my_judgemark_runs*.json\"))\n",
|
|
"print('files', fs)\n",
|
|
"for f in fs:\n",
|
|
"\n",
|
|
" print(f\"Loading {f}\")\n",
|
|
" try:\n",
|
|
" runs = load_json_file(f)\n",
|
|
" print(f\"Run id's {runs.keys()} from {f}\")\n",
|
|
" # print({k:{kk: len(vv) for kk,vv in v['results'].items()} for k,v in runs.items()})\n",
|
|
" print({k: len(v['results']) for k,v in runs.items()})\n",
|
|
" print('\\n\\n')\n",
|
|
" except Exception as e:\n",
|
|
" print(f\"Error loading {f}: {e} \\n\\n\")\n",
|
|
"\n",
|
|
"# run_id = list(runs.keys())[-2]\n",
|
|
"\n",
|
|
"# run_id= '5605a8be-4de3-4596-b4ed-f64dc91dedbb__deepseek_deepseek-r1'\n",
|
|
"\n",
|
|
"# _, judge_model = run_id.split(\"__\")\n",
|
|
"# judge_model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "43a61229",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"File ../outputs/my_judgemark_runs_20250728_184232.json not found, returning empty dict.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"run_id = 'db38b659-de7d-4bda-a749-86f9bea79dcf__qwen_qwen3-235b-a22b'\n",
|
|
"f = \"../outputs/my_judgemark_runs_20250728_184232.json\"\n",
|
|
"runs = load_json_file(f)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "bfedfcdc",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from loguru import logger\n",
|
|
"import sys\n",
|
|
"logger.remove()\n",
|
|
"logger.add(sys.stderr, level=\"INFO\", format=\"{message}\")\n",
|
|
"\n",
|
|
"import os\n",
|
|
"os.chdir(\"../\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "337ad718",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"ename": "KeyError",
|
|
"evalue": "'db38b659-de7d-4bda-a749-86f9bea79dcf__qwen_qwen3-235b-a22b'",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
|
|
"Cell \u001b[0;32mIn[15], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m run \u001b[38;5;241m=\u001b[39m \u001b[43mruns\u001b[49m\u001b[43m[\u001b[49m\u001b[43mrun_id\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 2\u001b[0m judge_model \u001b[38;5;241m=\u001b[39m run[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mjudge_model\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 3\u001b[0m run\u001b[38;5;241m.\u001b[39mkeys()\n",
|
|
"\u001b[0;31mKeyError\u001b[0m: 'db38b659-de7d-4bda-a749-86f9bea79dcf__qwen_qwen3-235b-a22b'"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"run = runs[run_id]\n",
|
|
"judge_model = run['judge_model']\n",
|
|
"run.keys()\n",
|
|
"\n",
|
|
"judge_model, run_id"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "a5b71c5b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from judgemark_v2lp.scoring import compute_ranked_score, compute_raw_score, compute_weighted_score\n",
|
|
"import numpy as np\n",
|
|
"from tqdm.auto import tqdm\n",
|
|
"\n",
|
|
"\n",
|
|
"\n",
|
|
"results = {\n",
|
|
" \"published\": {\n",
|
|
" \"judgemark_score_calib\": 0.761,\n",
|
|
" \"stability_calib\": 0.894,\n",
|
|
" \"separability_calib\": 0.691,\n",
|
|
" \"human_correlation_calib\": 0.908,\n",
|
|
" }\n",
|
|
"}"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "4608bf44",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "3f9d2c71",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Normal ranked"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "27907750",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"results['weighted'] = finalize_scores_and_compute_judgemark(runs, run_id, samples_data, score_key=\"aggregated_score_weighted\", do_plot=1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "0946ca00",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# sanitized_jm = sanitize_model_name(judge_model)\n",
|
|
"# base_id = run_id if run_id else str(uuid.uuid4())\n",
|
|
"# run_key = f\"{base_id}__{sanitized_jm}\"\n",
|
|
"# Compute final stats\n",
|
|
"results['raw'] = finalize_scores_and_compute_judgemark(runs, run_id, samples_data, score_key=\"aggregated_score_raw\", do_plot=1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "eb7a8590",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"results['ranked'] = finalize_scores_and_compute_judgemark(runs, run_id, samples_data, score_key=\"aggregated_score_ranked\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "c9a4c055",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Norm logprob"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b5b5b0d2",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"for model_name in (run['results'].keys()):\n",
|
|
" logprobs = []\n",
|
|
" for iteration_key in (run['results'][model_name].keys()):\n",
|
|
" for item_id in (run['results'][model_name][iteration_key].keys()):\n",
|
|
" storage_dict = run['results'][model_name][iteration_key][item_id]\n",
|
|
" logp = storage_dict['logp']\n",
|
|
" lpv = list(logp.values())\n",
|
|
" if len(lpv) == 0:\n",
|
|
" continue\n",
|
|
" logprobs.append(np.stack(lpv))\n",
|
|
" \n",
|
|
" logprobs2 = np.concatenate(logprobs)\n",
|
|
" log_prob_mean = logprobs2.mean(0)\n",
|
|
"\n",
|
|
" for iteration_key in (run['results'][model_name].keys()):\n",
|
|
" for item_id in (run['results'][model_name][iteration_key].keys()):\n",
|
|
" storage_dict = run['results'][model_name][iteration_key][item_id]\n",
|
|
" logp_norm = {k: v - log_prob_mean for i, (k, v) in enumerate(storage_dict['logp'].items())}\n",
|
|
"\n",
|
|
" def store_or_delete(storage_dict, key, value):\n",
|
|
" if value is not None:\n",
|
|
" assert np.isfinite(value), f\"Score for {model_name} {iteration_key} {item_id} is not finite: {value}\"\n",
|
|
" storage_dict[key] = value\n",
|
|
" elif key in storage_dict:\n",
|
|
" del storage_dict[key]\n",
|
|
"\n",
|
|
" extracted_rscores_norm = compute_ranked_score(logp_norm)\n",
|
|
" ranked_score_norm = compute_raw_score(extracted_rscores_norm)\n",
|
|
" store_or_delete(storage_dict, \"aggregated_score_ranked_norm\", ranked_score_norm)\n",
|
|
"\n",
|
|
" extracted_rscores = compute_ranked_score(logp)\n",
|
|
" ranked_score = compute_raw_score(extracted_rscores)\n",
|
|
" store_or_delete(storage_dict, \"aggregated_score_ranked\", ranked_score)\n",
|
|
"\n",
|
|
" extracted_wscore = compute_weighted_score(logp_norm)\n",
|
|
" weighted_score_norm = compute_raw_score(extracted_wscore)\n",
|
|
" store_or_delete(storage_dict, \"aggregated_score_weighted_norm\", weighted_score_norm)\n",
|
|
"\n",
|
|
" # extracted_scores, logp = parse_scores('', logp_norm)\n",
|
|
" # extracted_raw_score_norm = compute_raw_score(extracted_scores)\n",
|
|
" # raw_score_norm = extracted_raw_score_norm\n",
|
|
" # store_or_delete(storage_dict, \"aggregated_score_raw_norm\", raw_score_norm)\n",
|
|
"\n",
|
|
" run['results'][model_name][iteration_key][item_id] = storage_dict\n",
|
|
"\n",
|
|
"runs[run_id] = run\n",
|
|
"\n",
|
|
"print('This one uses normalised logprobs to compute the ranked score')\n",
|
|
"results['ranked_norm'] = finalize_scores_and_compute_judgemark(runs, run_id, samples_data, score_key=\"aggregated_score_ranked_norm\", do_plot=1)\n",
|
|
"\n",
|
|
"results['ranked_scaled'] = finalize_scores_and_compute_judgemark(runs, run_id, samples_data, score_key=\"aggregated_score_ranked\", do_plot=1)\n",
|
|
"\n",
|
|
"# results['raw_norm'] = finalize_scores_and_compute_judgemark(runs, run_id, samples_data, score_key=\"aggregated_score_raw_norm\", do_plot=1)\n",
|
|
"\n",
|
|
"results['weighted_norm'] = finalize_scores_and_compute_judgemark(runs, run_id, samples_data, score_key=\"aggregated_score_weighted_norm\", do_plot=1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "47ca7843",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from matplotlib import pyplot as plt\n",
|
|
"plt.bar(range(11), log_prob_mean)\n",
|
|
"plt.title('choice bias, for normalised logprobs')\n",
|
|
"plt.xlabel('choice index')\n",
|
|
"plt.ylabel('log probability')\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b5542d50",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import polars as pl\n",
|
|
"results"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "cc80b9de",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# More efficient Polars approach - create DataFrame directly from data\n",
|
|
"data = [\n",
|
|
" {\"name\": k, **v}\n",
|
|
" for k, v in results.items()\n",
|
|
"]\n",
|
|
"df = pl.DataFrame(data)\n",
|
|
"df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "05173e4d",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from IPython.display import display, Markdown\n",
|
|
"\n",
|
|
"# Round numeric columns and configure display for markdown\n",
|
|
"df_display = df.select([\n",
|
|
" pl.col(\"name\"),\n",
|
|
" *[pl.col(c).round(3) for c in df.columns if c != \"name\" and df[c].dtype in [pl.Float64, pl.Float32]]\n",
|
|
"])\n",
|
|
"\n",
|
|
"with pl.Config(\n",
|
|
" tbl_formatting=\"MARKDOWN\",\n",
|
|
" tbl_hide_column_data_types=True,\n",
|
|
" tbl_hide_dataframe_shape=True,\n",
|
|
" tbl_width_chars=240, # Allow wider table\n",
|
|
" tbl_cols=-1, # Show all columns\n",
|
|
") as cfg:\n",
|
|
" print(df_display)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "49b521cc",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df_display.style.fmt_markdown(\"md\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "7d5d75b4",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"s = str(df_display)+\"\\n\\n\"\n",
|
|
"open(\"output.txt\", \"w\").write(s)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "dde8b132",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# import polars as pl\n",
|
|
"# # pd.DataFrame([r1, r2, r3]).rename(columns={0: \"raw\", 1: \"weighted\", 2: \"ranked\"}).round(3)\n",
|
|
"# df = pl.DataFrame([r1, r2, r3, r3_n]).rename({\"final_judgemark_score_raw\": \"raw\", \"final_judgemark_score_calibrated\": \"weighted\"})\n",
|
|
"# df = df.with_columns(names=pl.lit(pl.Series([\"raw\", \"weighted\", \"ranked\", \"ranked_normalized\"])))\n",
|
|
"# # df.style\n",
|
|
"# print(df)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.16"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|