mirror of
https://github.com/wassname/Judgemark-v2lp.git
synced 2026-06-27 16:10:14 +08:00
1155 lines
57 KiB
JSON
1155 lines
57 KiB
JSON
{
|
|
"judge_model": "liquid/lfm-7b",
|
|
"start_time": "2025-01-31T07:33:36.255903",
|
|
"status": "completed",
|
|
"samples_file": "data/judgemark_v2.1_samples.json",
|
|
"prompts_file": "data/judge_prompts.json",
|
|
"errors": [
|
|
{
|
|
"model": "Llama-2-13b-chat-hf",
|
|
"iteration": "4",
|
|
"item_id": "43",
|
|
"error": "'choices'"
|
|
},
|
|
{
|
|
"model": "Llama-2-13b-chat-hf",
|
|
"iteration": "4",
|
|
"item_id": "44",
|
|
"error": "'choices'"
|
|
},
|
|
{
|
|
"model": "Llama-2-13b-chat-hf",
|
|
"iteration": "5",
|
|
"item_id": "2",
|
|
"error": "'choices'"
|
|
},
|
|
{
|
|
"model": "Llama-2-13b-chat-hf",
|
|
"iteration": "5",
|
|
"item_id": "6",
|
|
"error": "'choices'"
|
|
},
|
|
{
|
|
"model": "Llama-2-13b-chat-hf",
|
|
"iteration": "5",
|
|
"item_id": "9",
|
|
"error": "'choices'"
|
|
},
|
|
{
|
|
"model": "Llama-2-13b-chat-hf",
|
|
"iteration": "5",
|
|
"item_id": "10",
|
|
"error": "'choices'"
|
|
},
|
|
{
|
|
"model": "Llama-2-13b-chat-hf",
|
|
"iteration": "5",
|
|
"item_id": "19",
|
|
"error": "'choices'"
|
|
},
|
|
{
|
|
"model": "Llama-2-13b-chat-hf",
|
|
"iteration": "5",
|
|
"item_id": "20",
|
|
"error": "'choices'"
|
|
},
|
|
{
|
|
"model": "Llama-2-13b-chat-hf",
|
|
"iteration": "5",
|
|
"item_id": "22",
|
|
"error": "'choices'"
|
|
}
|
|
],
|
|
"end_time": "2025-01-31T12:00:50.136221",
|
|
"raw_score_distribution": {
|
|
"count": 1251,
|
|
"min": 5.1,
|
|
"max": 9.75,
|
|
"mean": 7.708,
|
|
"median": 7.69,
|
|
"stdev": 0.932,
|
|
"p10": 6.41,
|
|
"p25": 7.06,
|
|
"p75": 8.43,
|
|
"p90": 9.0
|
|
},
|
|
"calibration_config": {
|
|
"method": "piecewise_landmark",
|
|
"in_landmarks": [
|
|
5.1,
|
|
7.06,
|
|
7.69,
|
|
8.43,
|
|
9.75
|
|
],
|
|
"out_landmarks": [
|
|
0,
|
|
3,
|
|
5,
|
|
7,
|
|
10
|
|
]
|
|
},
|
|
"calibrated_score_distribution": {
|
|
"count": 1251,
|
|
"min": 0.0,
|
|
"max": 10.0,
|
|
"mean": 5.049,
|
|
"median": 5.0,
|
|
"stdev": 2.309,
|
|
"p10": 2.005,
|
|
"p25": 3.0,
|
|
"p75": 7.0,
|
|
"p90": 8.295
|
|
},
|
|
"raw_model_stats": {
|
|
"claude-3-5-sonnet-20240620": {
|
|
"count": 73,
|
|
"mean": 7.701095890410959,
|
|
"median": 7.5,
|
|
"stdev": 0.8527220493543597,
|
|
"ci95": 0.1956149911159047,
|
|
"min": 6.27,
|
|
"max": 9.39,
|
|
"length_correlation": 0.13083887621106557
|
|
},
|
|
"claude-3-haiku-20240307": {
|
|
"count": 73,
|
|
"mean": 7.565616438356164,
|
|
"median": 7.73,
|
|
"stdev": 1.0190872636026507,
|
|
"ci95": 0.23377927915303873,
|
|
"min": 5.73,
|
|
"max": 9.7,
|
|
"length_correlation": -0.005175969589265889
|
|
},
|
|
"claude-3-opus-20240229": {
|
|
"count": 76,
|
|
"mean": 7.649078947368421,
|
|
"median": 7.6,
|
|
"stdev": 0.9302704662359635,
|
|
"ci95": 0.20915030807544055,
|
|
"min": 5.1,
|
|
"max": 9.34,
|
|
"length_correlation": 0.21939945987546378
|
|
},
|
|
"gemini-1.5-pro-001": {
|
|
"count": 67,
|
|
"mean": 7.805671641791045,
|
|
"median": 7.88,
|
|
"stdev": 0.9684104812754937,
|
|
"ci95": 0.23188793399624838,
|
|
"min": 5.8,
|
|
"max": 9.35,
|
|
"length_correlation": 0.02451169545086653
|
|
},
|
|
"Llama-3-70b-chat-hf": {
|
|
"count": 65,
|
|
"mean": 7.858461538461539,
|
|
"median": 7.74,
|
|
"stdev": 0.8802081266120224,
|
|
"ci95": 0.21398570748046974,
|
|
"min": 6.12,
|
|
"max": 9.75,
|
|
"length_correlation": -0.012388130733235983
|
|
},
|
|
"Mixtral-8x7B-Instruct-v0.1": {
|
|
"count": 81,
|
|
"mean": 7.588765432098765,
|
|
"median": 7.56,
|
|
"stdev": 0.9614681257275893,
|
|
"ci95": 0.20938639182511945,
|
|
"min": 5.59,
|
|
"max": 9.44,
|
|
"length_correlation": -0.19945199926634238
|
|
},
|
|
"Llama-2-13b-chat-hf": {
|
|
"count": 84,
|
|
"mean": 7.682738095238095,
|
|
"median": 7.720000000000001,
|
|
"stdev": 0.8365928479434085,
|
|
"ci95": 0.17890833573943637,
|
|
"min": 5.79,
|
|
"max": 9.43,
|
|
"length_correlation": -0.035596689866570594
|
|
},
|
|
"gemma-7b-it": {
|
|
"count": 65,
|
|
"mean": 7.352461538461538,
|
|
"median": 7.32,
|
|
"stdev": 0.8968957415184031,
|
|
"ci95": 0.21804260149671584,
|
|
"min": 5.35,
|
|
"max": 9.58,
|
|
"length_correlation": -0.040844614323725756
|
|
},
|
|
"gemma-2b-it": {
|
|
"count": 73,
|
|
"mean": 7.685616438356164,
|
|
"median": 7.66,
|
|
"stdev": 0.969820983339723,
|
|
"ci95": 0.2224775625113228,
|
|
"min": 5.92,
|
|
"max": 9.61,
|
|
"length_correlation": -0.08352316213835849
|
|
},
|
|
"Mixtral-8x22B-Instruct-v0.1": {
|
|
"count": 80,
|
|
"mean": 7.627875,
|
|
"median": 7.49,
|
|
"stdev": 0.9621754014547229,
|
|
"ci95": 0.2108459811852292,
|
|
"min": 5.42,
|
|
"max": 9.38,
|
|
"length_correlation": -0.09389264217317053
|
|
},
|
|
"c4ai-command-r-08-2024": {
|
|
"count": 70,
|
|
"mean": 7.727285714285714,
|
|
"median": 7.734999999999999,
|
|
"stdev": 0.9510267852010122,
|
|
"ci95": 0.22279210669545071,
|
|
"min": 6.05,
|
|
"max": 9.68,
|
|
"length_correlation": -0.04179559502296377
|
|
},
|
|
"gemini-1.5-pro-002": {
|
|
"count": 71,
|
|
"mean": 8.037605633802817,
|
|
"median": 8.09,
|
|
"stdev": 0.8306872624326239,
|
|
"ci95": 0.1932255037230592,
|
|
"min": 5.99,
|
|
"max": 9.73,
|
|
"length_correlation": 0.026141853212790223
|
|
},
|
|
"Mistral-Large-Instruct-2411": {
|
|
"count": 73,
|
|
"mean": 7.81,
|
|
"median": 7.79,
|
|
"stdev": 0.9005677221743084,
|
|
"ci95": 0.20659081948893102,
|
|
"min": 5.67,
|
|
"max": 9.44,
|
|
"length_correlation": 0.07062193791441498
|
|
},
|
|
"gpt-4o-2024-11-20": {
|
|
"count": 81,
|
|
"mean": 7.942716049382716,
|
|
"median": 7.86,
|
|
"stdev": 0.969372235451479,
|
|
"ci95": 0.21110773127609989,
|
|
"min": 5.84,
|
|
"max": 9.62,
|
|
"length_correlation": 0.1724541073846693
|
|
},
|
|
"DeepSeek-R1": {
|
|
"count": 72,
|
|
"mean": 7.956388888888889,
|
|
"median": 7.945,
|
|
"stdev": 0.7805222259659984,
|
|
"ci95": 0.18029143589014293,
|
|
"min": 6.36,
|
|
"max": 9.69,
|
|
"length_correlation": 0.15244903351019207
|
|
},
|
|
"gpt-3.5-turbo-0125": {
|
|
"count": 71,
|
|
"mean": 7.509154929577464,
|
|
"median": 7.34,
|
|
"stdev": 0.9291635969729108,
|
|
"ci95": 0.21613200561237975,
|
|
"min": 6.15,
|
|
"max": 9.5,
|
|
"length_correlation": 0.18239264706506514
|
|
},
|
|
"databricks/dbrx-instruct": {
|
|
"count": 76,
|
|
"mean": 7.543421052631579,
|
|
"median": 7.485,
|
|
"stdev": 0.9968049660544821,
|
|
"ci95": 0.22410908795694523,
|
|
"min": 5.56,
|
|
"max": 9.5,
|
|
"length_correlation": 0.12110662342189459
|
|
}
|
|
},
|
|
"calibrated_model_stats": {
|
|
"claude-3-5-sonnet-20240620": {
|
|
"count": 73,
|
|
"mean": 4.969965390709031,
|
|
"median": 4.396825396825396,
|
|
"stdev": 2.1927579525253407,
|
|
"ci95": 0.5030200963225284,
|
|
"min": 1.790816326530612,
|
|
"max": 9.181818181818183,
|
|
"length_correlation": 0.15051743384350802
|
|
},
|
|
"claude-3-haiku-20240307": {
|
|
"count": 73,
|
|
"mean": 4.765808429213518,
|
|
"median": 5.108108108108108,
|
|
"stdev": 2.433358696673129,
|
|
"ci95": 0.5582140630606762,
|
|
"min": 0.9642857142857155,
|
|
"max": 9.886363636363635,
|
|
"length_correlation": -0.0024198700694023183
|
|
},
|
|
"claude-3-opus-20240229": {
|
|
"count": 76,
|
|
"mean": 4.914445112283458,
|
|
"median": 4.7142857142857135,
|
|
"stdev": 2.2533670239473635,
|
|
"ci95": 0.5066186924890359,
|
|
"min": 0.0,
|
|
"max": 9.068181818181818,
|
|
"length_correlation": 0.2310780454466
|
|
},
|
|
"gemini-1.5-pro-001": {
|
|
"count": 67,
|
|
"mean": 5.264100655358651,
|
|
"median": 5.513513513513512,
|
|
"stdev": 2.440801873924221,
|
|
"ci95": 0.5844551610934552,
|
|
"min": 1.0714285714285718,
|
|
"max": 9.09090909090909,
|
|
"length_correlation": 0.026660907269132846
|
|
},
|
|
"Llama-3-70b-chat-hf": {
|
|
"count": 65,
|
|
"mean": 5.390621583478726,
|
|
"median": 5.135135135135135,
|
|
"stdev": 2.2445595997017906,
|
|
"ci95": 0.5456705742685962,
|
|
"min": 1.561224489795919,
|
|
"max": 10.0,
|
|
"length_correlation": -0.006891966479903672
|
|
},
|
|
"Mixtral-8x7B-Instruct-v0.1": {
|
|
"count": 81,
|
|
"mean": 4.777783263232999,
|
|
"median": 4.587301587301585,
|
|
"stdev": 2.348520603256119,
|
|
"ci95": 0.5114555980424437,
|
|
"min": 0.7500000000000003,
|
|
"max": 9.295454545454545,
|
|
"length_correlation": -0.2133298043406683
|
|
},
|
|
"Llama-2-13b-chat-hf": {
|
|
"count": 84,
|
|
"mean": 4.98007423007423,
|
|
"median": 5.081081081081081,
|
|
"stdev": 2.0959612335789277,
|
|
"ci95": 0.4482287136398612,
|
|
"min": 1.0561224489795924,
|
|
"max": 9.272727272727273,
|
|
"length_correlation": -0.03392947781096937
|
|
},
|
|
"gemma-7b-it": {
|
|
"count": 65,
|
|
"mean": 4.1646281217709795,
|
|
"median": 3.8253968253968265,
|
|
"stdev": 2.163709576506363,
|
|
"ci95": 0.5260152803781869,
|
|
"min": 0.38265306122448983,
|
|
"max": 9.613636363636363,
|
|
"length_correlation": -0.02170204375736526
|
|
},
|
|
"gemma-2b-it": {
|
|
"count": 73,
|
|
"mean": 4.995045888392268,
|
|
"median": 4.904761904761904,
|
|
"stdev": 2.384133765638286,
|
|
"ci95": 0.5469218319586979,
|
|
"min": 1.255102040816327,
|
|
"max": 9.68181818181818,
|
|
"length_correlation": -0.09684115058812237
|
|
},
|
|
"Mixtral-8x22B-Instruct-v0.1": {
|
|
"count": 80,
|
|
"mean": 4.831268176803891,
|
|
"median": 4.365079365079364,
|
|
"stdev": 2.382432001468829,
|
|
"ci95": 0.5220734308914077,
|
|
"min": 0.48979591836734737,
|
|
"max": 9.15909090909091,
|
|
"length_correlation": -0.09430544331902674
|
|
},
|
|
"c4ai-command-r-08-2024": {
|
|
"count": 70,
|
|
"mean": 5.084562710072914,
|
|
"median": 5.121621621621621,
|
|
"stdev": 2.373800070115178,
|
|
"ci95": 0.5560978163017634,
|
|
"min": 1.4540816326530615,
|
|
"max": 9.84090909090909,
|
|
"length_correlation": -0.047559106544906644
|
|
},
|
|
"gemini-1.5-pro-002": {
|
|
"count": 71,
|
|
"mean": 5.866860066055238,
|
|
"median": 6.081081081081081,
|
|
"stdev": 2.1011736426174057,
|
|
"ci95": 0.4887523305887859,
|
|
"min": 1.3622448979591846,
|
|
"max": 9.954545454545455,
|
|
"length_correlation": 0.03683370825762501
|
|
},
|
|
"Mistral-Large-Instruct-2411": {
|
|
"count": 73,
|
|
"mean": 5.297697333900857,
|
|
"median": 5.270270270270269,
|
|
"stdev": 2.247302960317985,
|
|
"ci95": 0.5155327564828395,
|
|
"min": 0.8724489795918372,
|
|
"max": 9.295454545454545,
|
|
"length_correlation": 0.07299341186039904
|
|
},
|
|
"gpt-4o-2024-11-20": {
|
|
"count": 81,
|
|
"mean": 5.655529799709694,
|
|
"median": 5.45945945945946,
|
|
"stdev": 2.3945863456718093,
|
|
"ci95": 0.5214876930574163,
|
|
"min": 1.1326530612244903,
|
|
"max": 9.704545454545453,
|
|
"length_correlation": 0.17912779980236543
|
|
},
|
|
"DeepSeek-R1": {
|
|
"count": 72,
|
|
"mean": 5.644360985134795,
|
|
"median": 5.689189189189189,
|
|
"stdev": 2.0388969120335774,
|
|
"ci95": 0.47096115866215643,
|
|
"min": 1.9285714285714297,
|
|
"max": 9.863636363636363,
|
|
"length_correlation": 0.16557332270222566
|
|
},
|
|
"gpt-3.5-turbo-0125": {
|
|
"count": 71,
|
|
"mean": 4.539662353042635,
|
|
"median": 3.8888888888888884,
|
|
"stdev": 2.3149681706526994,
|
|
"ci95": 0.5384829057897075,
|
|
"min": 1.607142857142858,
|
|
"max": 9.431818181818182,
|
|
"length_correlation": 0.1891181384859882
|
|
},
|
|
"databricks/dbrx-instruct": {
|
|
"count": 76,
|
|
"mean": 4.678235346468429,
|
|
"median": 4.34920634920635,
|
|
"stdev": 2.38470999070152,
|
|
"ci95": 0.5361481927335447,
|
|
"min": 0.7040816326530612,
|
|
"max": 9.431818181818182,
|
|
"length_correlation": 0.14905320053925192
|
|
}
|
|
},
|
|
"raw_cross_model_stats": {
|
|
"anova_f": 2.6505816884208975,
|
|
"anova_p": 0.0004038585700993472,
|
|
"kw_stat": 41.090479506975676,
|
|
"kw_p": 0.0005384578092164032,
|
|
"std_dev_across_models": 0.17269164814688245,
|
|
"pearson_r": 0.5373216525063995,
|
|
"kendall_tau": 0.3705882352941176,
|
|
"normalized_components": {
|
|
"pearson_r": 0.0,
|
|
"kendall_tau": 0.30065359477124176,
|
|
"anova_f": 0.007573090538345421,
|
|
"kw_stat": 0.02739365300465045,
|
|
"std_dev": 0.07849620370312838,
|
|
"ci99_overlap_magnitude_sum_norm": 0.5261289553841165,
|
|
"raw_score_range_norm": 0.08564301191765988,
|
|
"kendall_tau_bootstrapped": 0.12987581699346404
|
|
}
|
|
},
|
|
"calibrated_cross_model_stats": {
|
|
"anova_f": 2.5813753327730753,
|
|
"anova_p": 0.0005832247254779029,
|
|
"kw_stat": 41.090479506975676,
|
|
"kw_p": 0.0005384578092164032,
|
|
"std_dev_across_models": 0.4221436329281502,
|
|
"pearson_r": 0.5285851796518878,
|
|
"kendall_tau": 0.3676470588235293,
|
|
"normalized_components": {
|
|
"pearson_r": 0.0,
|
|
"kendall_tau": 0.2973856209150326,
|
|
"anova_f": 0.007375358093637358,
|
|
"kw_stat": 0.02739365300465045,
|
|
"std_dev": 0.19188346951279553,
|
|
"ci99_overlap_magnitude_sum_norm": 0.0,
|
|
"calibrated_score_range_norm": 0.21277899303553227,
|
|
"kendall_tau_bootstrapped": 0.1385849673202614
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"raw": {
|
|
"ci99_overlap_adjacent": {
|
|
"gemini-1.5-pro-002__DeepSeek-R1": true,
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": true,
|
|
"gpt-4o-2024-11-20__Llama-3-70b-chat-hf": true,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": true,
|
|
"gemma-2b-it__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__claude-3-opus-20240229": true,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__gemma-7b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 1.0,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 0.6550964273520838,
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.7108162710378636,
|
|
"gpt-4o-2024-11-20__Llama-3-70b-chat-hf": 0.7537313076723517,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.7806201012470888,
|
|
"Mistral-Large-Instruct-2411__gemini-1.5-pro-001": 0.8145041122710763,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 0.8179238984276935,
|
|
"c4ai-command-r-08-2024__claude-3-5-sonnet-20240620": 0.7712308566272537,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 0.7712308566272537,
|
|
"gemma-2b-it__Llama-2-13b-chat-hf": 0.7053632660920472,
|
|
"Llama-2-13b-chat-hf__claude-3-opus-20240229": 0.7053632660920472,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 0.8067338709423701,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mixtral-8x7B-Instruct-v0.1": 0.7892936418181105,
|
|
"Mixtral-8x7B-Instruct-v0.1__claude-3-haiku-20240307": 0.8255259242257855,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 0.8804390074560677,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.8335802362129368,
|
|
"gpt-3.5-turbo-0125__gemma-7b-it": 0.6991941159109398
|
|
},
|
|
"ci99_overlap_magnitude_sum": 12.320647160012971,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.047219736223681474,
|
|
"emd": {
|
|
"average": 0.24452920750049187,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 0.217945205479452,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.131007570295602,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.17201390308730308,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 0.16782297154899872,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 0.1466446812108911,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 0.12227168949771695,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 0.35451422550052697,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 0.13219178082191774,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 0.1302106164383562,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 0.12031115459882571,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.3481670846999807,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 0.1521917808219178,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.2941890749196685,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 0.27750380517503803,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 0.20822882500482343,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 0.1772710886806057,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 0.2016636625811103,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 0.25802289920261695,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.2971696522655425,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 0.11776424826653142,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 0.19465264187866932,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 0.26882191780821924,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 0.14958904109589036,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.16981678082191787,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.16765362035225043,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 0.4719891954466524,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.25753424657534246,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 0.37961610011838315,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 0.416103500761035,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 0.18807640362724287,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 0.11992790194664744,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 0.20228397486252944,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.21213562753036436,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 0.12731481481481477,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 0.11978696741854639,
|
|
"claude-3-opus-20240229__gemma-7b-it": 0.32236639676113354,
|
|
"claude-3-opus-20240229__gemma-2b-it": 0.11230894015861574,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 0.12911184210526325,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 0.13752255639097744,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 0.38869347664937,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.1652937995674118,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 0.29576835607537366,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 0.31771929824561396,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 0.22449406968124525,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 0.15776315789473688,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 0.13851664753157283,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 0.22261838953381236,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 0.18709843638948115,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 0.4603421354764638,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 0.16955428337763234,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 0.18998694029850738,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 0.13944349680170579,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.2552259827622452,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 0.12173175219791438,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.15152754744794553,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 0.25304311774461025,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 0.322461635484549,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 0.2748468185388845,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 0.2696961063627728,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 0.17775641025641017,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 0.506,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 0.19294625922023179,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.23845192307692298,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 0.1597692307692307,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 0.1896663055254605,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.0832349841938882,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 0.16941690408357088,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 0.16111111111111118,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 0.3600325027085589,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 0.3189595141700404,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.16028659611992946,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 0.24501424501424512,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 0.11091493319803813,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.10518364197530863,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.14404761904761887,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 0.44884020170405137,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.22147809910366983,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 0.3539506172839505,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 0.3751234567901234,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.1531803164667015,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.0948765432098766,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3372985347985348,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.13128995433789958,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.16922023809523823,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.13254761904761903,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 0.35586016096579476,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 0.1388698630136987,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 0.27268518518518525,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 0.28488095238095235,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.23306338028169016,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.19751253132832086,
|
|
"gemma-7b-it__gemma-2b-it": 0.33417492096944157,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 0.281625,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 0.3748241758241759,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 0.6853001083423618,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 0.462082191780822,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 0.5903760683760684,
|
|
"gemma-7b-it__DeepSeek-R1": 0.6052435897435897,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.16902708559046598,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.22120647773279353,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 0.11537499999999994,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 0.07388845401174166,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 0.3621609106694964,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 0.1638356164383562,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 0.2607018433959073,
|
|
"gemma-2b-it__DeepSeek-R1": 0.33100076103500764,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 0.20718888674512825,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 0.14388248017303537,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.14341071428571434,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 0.4113397887323944,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.19798116438356164,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 0.31597376543209876,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 0.3840416666666668,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.15997007042253525,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.12582236842105263,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 0.33147484909456737,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.1523424657534247,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 0.2354091710758377,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 0.30169841269841274,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.22264989939637814,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.1844887218045113,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 0.2278641713293459,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.15994957398713266,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 0.13181142410015653,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 0.5349295774647886,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 0.494184581171238,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 0.16397260273972603,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 0.19687214611872147,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 0.336974725062705,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 0.27244051910598427,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.1829012345679013,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 0.44647017909928705,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 0.39929499675113717,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 0.4704381846635367,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 0.43884502923976615,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.14751297257227572
|
|
}
|
|
},
|
|
"average_ci95": 0.21048986960129026,
|
|
"modulated_ci95": 0.09424670983605522
|
|
},
|
|
"calibrated": {
|
|
"ci99_overlap_adjacent": {
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": true,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": true,
|
|
"DeepSeek-R1__Llama-3-70b-chat-hf": true,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__gemma-2b-it": true,
|
|
"gemma-2b-it__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": true,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__gemma-7b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 1.0,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 1.7801538075262586,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 1.8568095203805992,
|
|
"DeepSeek-R1__Llama-3-70b-chat-hf": 1.7503446355383687,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 1.9990236864220625,
|
|
"Mistral-Large-Instruct-2411__gemini-1.5-pro-001": 2.032537317991588,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.068831785396318,
|
|
"c4ai-command-r-08-2024__gemma-2b-it": 2.084863534732235,
|
|
"gemma-2b-it__Llama-2-13b-chat-hf": 1.767184676457525,
|
|
"Llama-2-13b-chat-hf__claude-3-5-sonnet-20240620": 1.767184676457525,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.9347785824574224,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.9446817245203993,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mixtral-8x7B-Instruct-v0.1": 1.9839087283650176,
|
|
"Mixtral-8x7B-Instruct-v0.1__claude-3-haiku-20240307": 2.0164627299518187,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 2.0697409737645778,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 1.979845073473037,
|
|
"gpt-3.5-turbo-0125__gemma-7b-it": 1.7234088464114428
|
|
},
|
|
"ci99_overlap_magnitude_sum": 30.759760299846192,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.047034319758825216,
|
|
"emd": {
|
|
"average": 0.5843970377243549,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 0.4375350647366302,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.28880990753351443,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.42190114210150975,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 0.43803861465309546,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 0.2892253282736718,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 0.3067611463963157,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 0.8187006179178393,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 0.2820450765656243,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 0.27076204091859674,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 0.27592722810782616,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.9149236819780628,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 0.395455587236409,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.768128941663043,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 0.7248747005555549,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 0.4690261748846132,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 0.34114048919899176,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 0.42478806904211175,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 0.5421224730013466,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.6348504040284858,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 0.2629197487318819,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 0.395570081480062,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 0.6866351656958312,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 0.3140115234048696,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.3646554046994362,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.33504696310119847,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 1.1010516368417198,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5605569098719784,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 0.8954406638867027,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 0.936123124621983,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 0.4433265106000145,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 0.280405188995157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 0.4917627656447653,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.48491626940875066,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 0.27805779777723516,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 0.2695020154069564,
|
|
"claude-3-opus-20240229__gemma-7b-it": 0.7950063875815754,
|
|
"claude-3-opus-20240229__gemma-2b-it": 0.2427087905188121,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 0.3034924663214137,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 0.3184456512464031,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 0.9526702449171476,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.38994520162122426,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 0.7443468109891114,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 0.7535735017721233,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 0.5241026361234448,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 0.33764029017788416,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 0.3352906782544307,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 0.5020948194564239,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 0.46365219431012905,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 1.1156816975580301,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 0.39796098519117457,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 0.4640175011843455,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 0.3251483507194747,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.6556957533112973,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 0.3071304867022945,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.4165135287104803,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 0.6128191267006119,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 0.7742808739310134,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 0.6186872155338097,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 0.612838320245727,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 0.4163534092105519,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 1.2259934617077473,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 0.4412601476202258,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.5772292808007091,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 0.37113093745746784,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 0.495389552834221,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.1860643200173531,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 0.40777824957719166,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 0.3973388577555246,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 0.874303893820996,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 0.7212931196013901,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.3604209583045562,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 0.6302704776249753,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 0.25342128645382944,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.2372183738916543,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.32037518027691836,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 1.089076802822238,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.5202868225083197,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 0.8777465364766948,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 0.8836231764472502,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.3651364909282787,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.21785824031020365,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.831405149262292,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.2943658734125606,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.40897312466190044,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.300331115637238,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 0.8887829765237067,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 0.3370311605914067,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 0.6949054130648992,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 0.6898098430836526,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.5606946435129819,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.4342091212391964,
|
|
"gemma-7b-it__gemma-2b-it": 0.8321041428673522,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 0.680757187900045,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 0.9199345883019355,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 1.7025865192867204,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 1.143395871771606,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 1.4911779448816485,
|
|
"gemma-7b-it__DeepSeek-R1": 1.4827243163552688,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.39579696953942456,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.5618955904670189,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 0.2680608431709212,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 0.16847910053109938,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 0.8949259849593356,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 0.3872322267625594,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 0.6672048968381505,
|
|
"gemma-2b-it__DeepSeek-R1": 0.7861976206113519,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 0.5033659682856229,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 0.32067085688725405,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.3413880402400812,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 1.0391079531673093,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.49907046393103127,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 0.8259953719609803,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 0.9392923032803985,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.3643436449397213,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.25667548162848913,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 0.8285132412755292,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.353303316932056,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 0.6075833330557439,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 0.7247874525510581,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.5530089342822885,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.40774568212806683,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 0.5697445901409407,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.3604435726985843,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 0.3184203492930857,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 1.3385776811330126,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 1.1886247195868078,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 0.40762589916391456,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 0.4613983267562851,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 0.8219808335226872,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 0.6327837413814343,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.39237659591495583,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 1.1362714160985266,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 0.9772944532412647,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 1.1574355078924161,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 1.0249374408992205,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3239361845207443
|
|
}
|
|
},
|
|
"average_ci95": 0.5223609585741825,
|
|
"modulated_ci95": 0.0
|
|
}
|
|
},
|
|
"calibrated_score_range": 1.7022319442842582,
|
|
"iteration_stability": {
|
|
"raw": {
|
|
"scoring_stability": {
|
|
"claude-3-5-sonnet-20240620": {
|
|
"mean_iter_score": 7.693373870573871,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.1958879471736889
|
|
},
|
|
"claude-3-haiku-20240307": {
|
|
"mean_iter_score": 7.603844162210339,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.28443306778079225
|
|
},
|
|
"claude-3-opus-20240229": {
|
|
"mean_iter_score": 7.6355114035087714,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.10317165872193247
|
|
},
|
|
"gemini-1.5-pro-001": {
|
|
"mean_iter_score": 7.807096153846154,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.31735096545125985
|
|
},
|
|
"Llama-3-70b-chat-hf": {
|
|
"mean_iter_score": 7.871538494838495,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.17870722846118417
|
|
},
|
|
"Mixtral-8x7B-Instruct-v0.1": {
|
|
"mean_iter_score": 7.6138087301587305,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.224725939588447
|
|
},
|
|
"Llama-2-13b-chat-hf": {
|
|
"mean_iter_score": 7.704659125188537,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.20396947363429022
|
|
},
|
|
"gemma-7b-it": {
|
|
"mean_iter_score": 7.345348917748917,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.09332915477076936
|
|
},
|
|
"gemma-2b-it": {
|
|
"mean_iter_score": 7.6823210784313725,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.10720346597508101
|
|
},
|
|
"Mixtral-8x22B-Instruct-v0.1": {
|
|
"mean_iter_score": 7.6574571637426905,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.20412213449322975
|
|
},
|
|
"c4ai-command-r-08-2024": {
|
|
"mean_iter_score": 7.728667582417582,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.09274205511840909
|
|
},
|
|
"gemini-1.5-pro-002": {
|
|
"mean_iter_score": 8.042915592903828,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.21766212660563314
|
|
},
|
|
"Mistral-Large-Instruct-2411": {
|
|
"mean_iter_score": 7.804625,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.10705199281968857
|
|
},
|
|
"gpt-4o-2024-11-20": {
|
|
"mean_iter_score": 7.947030701754386,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.22061713416870427
|
|
},
|
|
"DeepSeek-R1": {
|
|
"mean_iter_score": 7.9532261422787744,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.19624957127352102
|
|
},
|
|
"gpt-3.5-turbo-0125": {
|
|
"mean_iter_score": 7.510950183150183,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.1253691700541422
|
|
},
|
|
"databricks/dbrx-instruct": {
|
|
"mean_iter_score": 7.5479095238095235,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.22749089399557745
|
|
}
|
|
},
|
|
"ranking_stability": {
|
|
"pairwise_correlation": {
|
|
"1__vs__2": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.014705882352941175,
|
|
"p_value": 0.9676638168395322
|
|
},
|
|
"1__vs__3": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.2794117647058823,
|
|
"p_value": 0.12878898834482916
|
|
},
|
|
"1__vs__4": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.14705882352941174,
|
|
"p_value": 0.43968652710486045
|
|
},
|
|
"1__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.2794117647058823,
|
|
"p_value": 0.12878898834482916
|
|
},
|
|
"2__vs__3": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.3529411764705882,
|
|
"p_value": 0.05175324945907756
|
|
},
|
|
"2__vs__4": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.2794117647058823,
|
|
"p_value": 0.12878898834482916
|
|
},
|
|
"2__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.2647058823529411,
|
|
"p_value": 0.15133363966308636
|
|
},
|
|
"3__vs__4": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": -0.10294117647058822,
|
|
"p_value": 0.5976439315443395
|
|
},
|
|
"3__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.2941176470588235,
|
|
"p_value": 0.10886459480774507
|
|
},
|
|
"4__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.16176470588235295,
|
|
"p_value": 0.3927276123262421
|
|
}
|
|
},
|
|
"average_kendall_tau": 0.19705882352941173
|
|
},
|
|
"randomized_average_kendall_tau_by_item": 0.21688823529411763
|
|
},
|
|
"calibrated": {
|
|
"scoring_stability": {
|
|
"claude-3-5-sonnet-20240620": {
|
|
"mean_iter_score": 4.953886904941327,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.48100202400499703
|
|
},
|
|
"claude-3-haiku-20240307": {
|
|
"mean_iter_score": 4.864163712635215,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.7236316662147649
|
|
},
|
|
"claude-3-opus-20240229": {
|
|
"mean_iter_score": 4.877138970413004,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.2622554604190239
|
|
},
|
|
"gemini-1.5-pro-001": {
|
|
"mean_iter_score": 5.276732133874991,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.7705129938542279
|
|
},
|
|
"Llama-3-70b-chat-hf": {
|
|
"mean_iter_score": 5.42342814215727,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.4657782068641192
|
|
},
|
|
"Mixtral-8x7B-Instruct-v0.1": {
|
|
"mean_iter_score": 4.838113431421255,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.529862176595558
|
|
},
|
|
"Llama-2-13b-chat-hf": {
|
|
"mean_iter_score": 5.037563133361453,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.5402285320946496
|
|
},
|
|
"gemma-7b-it": {
|
|
"mean_iter_score": 4.144649682669163,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.2748519391084112
|
|
},
|
|
"gemma-2b-it": {
|
|
"mean_iter_score": 4.987300697305949,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.2514208923342906
|
|
},
|
|
"Mixtral-8x22B-Instruct-v0.1": {
|
|
"mean_iter_score": 4.907973033309813,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.5251096797435508
|
|
},
|
|
"c4ai-command-r-08-2024": {
|
|
"mean_iter_score": 5.089754202254203,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.24244610788276505
|
|
},
|
|
"gemini-1.5-pro-002": {
|
|
"mean_iter_score": 5.876057630623457,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.5752660611084774
|
|
},
|
|
"Mistral-Large-Instruct-2411": {
|
|
"mean_iter_score": 5.288746081424653,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.25136810750352784
|
|
},
|
|
"gpt-4o-2024-11-20": {
|
|
"mean_iter_score": 5.668307803254545,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.5358769063662505
|
|
},
|
|
"DeepSeek-R1": {
|
|
"mean_iter_score": 5.635600896194345,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.5266271705214419
|
|
},
|
|
"gpt-3.5-turbo-0125": {
|
|
"mean_iter_score": 4.543997322364669,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.32272723516418067
|
|
},
|
|
"databricks/dbrx-instruct": {
|
|
"mean_iter_score": 4.690104537515252,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.5432838312148721
|
|
}
|
|
},
|
|
"ranking_stability": {
|
|
"pairwise_correlation": {
|
|
"1__vs__2": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.044117647058823525,
|
|
"p_value": 0.8393415533036079
|
|
},
|
|
"1__vs__3": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.20588235294117643,
|
|
"p_value": 0.27056053596407176
|
|
},
|
|
"1__vs__4": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.14705882352941174,
|
|
"p_value": 0.43968652710486045
|
|
},
|
|
"1__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.1764705882352941,
|
|
"p_value": 0.34884640290128766
|
|
},
|
|
"2__vs__3": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.39705882352941174,
|
|
"p_value": 0.027329794647271987
|
|
},
|
|
"2__vs__4": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.2205882352941176,
|
|
"p_value": 0.23617204064651248
|
|
},
|
|
"2__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.24999999999999997,
|
|
"p_value": 0.17665741934030035
|
|
},
|
|
"3__vs__4": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": -0.08823529411764705,
|
|
"p_value": 0.6553075802476113
|
|
},
|
|
"3__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.2647058823529411,
|
|
"p_value": 0.15133363966308636
|
|
},
|
|
"4__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.14705882352941174,
|
|
"p_value": 0.43968652710486045
|
|
}
|
|
},
|
|
"average_kendall_tau": 0.1764705882352941
|
|
},
|
|
"randomized_average_kendall_tau_by_item": 0.22472647058823528
|
|
}
|
|
},
|
|
"final_judgemark_score": 0.10239147747204691,
|
|
"raw_score_range": 0.6851440953412791,
|
|
"final_judgemark_score_raw": 0.14327380753059965
|
|
} |