Files
Judgemark-v2lp/results/stats/deepseek-ai__deepseek-r1-distill-llama-70b.json
T
2025-01-31 18:03:33 +11:00

1128 lines
56 KiB
JSON

{
"judge_model": "deepseek/deepseek-r1-distill-llama-70b",
"start_time": "2025-01-29T21:45:47.210194",
"status": "completed",
"samples_file": "data/judgemark_v2.1_samples.json",
"prompts_file": "data/judge_prompts.json",
"end_time": "2025-01-31T15:25:39.621045",
"raw_score_distribution": {
"count": 2040,
"min": 2.28,
"max": 8.93,
"mean": 6.459,
"median": 6.61,
"stdev": 1.138,
"p10": 4.86,
"p25": 5.535,
"p75": 7.36,
"p90": 7.89
},
"calibration_config": {
"method": "piecewise_landmark",
"in_landmarks": [
2.28,
5.535,
6.61,
7.36,
8.93
],
"out_landmarks": [
0,
3,
5,
7,
10
]
},
"calibrated_score_distribution": {
"count": 2040,
"min": 0.0,
"max": 10.0,
"mean": 5.095,
"median": 5.0,
"stdev": 2.134,
"p10": 2.378,
"p25": 3.004,
"p75": 7.0,
"p90": 8.013
},
"raw_model_stats": {
"claude-3-5-sonnet-20240620": {
"count": 120,
"mean": 7.2235,
"median": 7.27,
"stdev": 0.5600114794741775,
"ci95": 0.10019876712481712,
"min": 5.68,
"max": 8.43,
"length_correlation": 0.028816647486864848
},
"claude-3-haiku-20240307": {
"count": 120,
"mean": 6.310666666666667,
"median": 6.32,
"stdev": 0.7933523524778084,
"ci95": 0.14194874663728252,
"min": 4.43,
"max": 8.25,
"length_correlation": -0.060220417662502634
},
"claude-3-opus-20240229": {
"count": 120,
"mean": 6.9744166666666665,
"median": 6.93,
"stdev": 0.8047169718366233,
"ci95": 0.14398213504150822,
"min": 3.96,
"max": 8.39,
"length_correlation": 0.06473846527866355
},
"gemini-1.5-pro-001": {
"count": 120,
"mean": 7.253833333333334,
"median": 7.36,
"stdev": 0.7485506256865239,
"ci95": 0.1339327006202176,
"min": 3.86,
"max": 8.43,
"length_correlation": -0.15008782471233406
},
"Llama-3-70b-chat-hf": {
"count": 120,
"mean": 6.695416666666667,
"median": 6.785,
"stdev": 0.8730232372790152,
"ci95": 0.15620367662607454,
"min": 4.39,
"max": 8.14,
"length_correlation": -0.05334307855140533
},
"Mixtral-8x7B-Instruct-v0.1": {
"count": 120,
"mean": 6.10175,
"median": 6.02,
"stdev": 0.9998627321754108,
"ci95": 0.17889814178826485,
"min": 4.11,
"max": 8.04,
"length_correlation": -0.15627404083999585
},
"Llama-2-13b-chat-hf": {
"count": 120,
"mean": 5.712166666666667,
"median": 5.59,
"stdev": 0.9559466102082413,
"ci95": 0.17104055057933967,
"min": 3.75,
"max": 8.07,
"length_correlation": -0.08390825876511446
},
"gemma-7b-it": {
"count": 120,
"mean": 5.20875,
"median": 5.18,
"stdev": 0.7814138903725905,
"ci95": 0.13981268473827932,
"min": 3.36,
"max": 7.54,
"length_correlation": -0.0585946086002926
},
"gemma-2b-it": {
"count": 120,
"mean": 5.29975,
"median": 5.154999999999999,
"stdev": 1.0112231236885127,
"ci95": 0.18093077373491145,
"min": 3.07,
"max": 8.32,
"length_correlation": 0.1301649291729386
},
"Mixtral-8x22B-Instruct-v0.1": {
"count": 120,
"mean": 6.182916666666666,
"median": 6.23,
"stdev": 0.986667760853735,
"ci95": 0.17653726186501642,
"min": 4.04,
"max": 8.36,
"length_correlation": -0.06847523825181423
},
"c4ai-command-r-08-2024": {
"count": 120,
"mean": 6.011083333333334,
"median": 6.055,
"stdev": 0.9343993674393987,
"ci95": 0.16718525967993816,
"min": 3.93,
"max": 7.96,
"length_correlation": -0.024818196346216514
},
"gemini-1.5-pro-002": {
"count": 120,
"mean": 7.323916666666666,
"median": 7.39,
"stdev": 0.6624836079574535,
"ci95": 0.11853335724486405,
"min": 4.79,
"max": 8.68,
"length_correlation": -0.21786208804832544
},
"Mistral-Large-Instruct-2411": {
"count": 120,
"mean": 6.80825,
"median": 6.82,
"stdev": 0.9195675477037649,
"ci95": 0.16453151041550626,
"min": 4.25,
"max": 8.64,
"length_correlation": 0.08589359108439931
},
"gpt-4o-2024-11-20": {
"count": 120,
"mean": 7.503916666666667,
"median": 7.68,
"stdev": 0.6487185763615272,
"ci95": 0.11607048059697637,
"min": 5.36,
"max": 8.68,
"length_correlation": -0.010615212429474982
},
"DeepSeek-R1": {
"count": 120,
"mean": 7.617,
"median": 7.77,
"stdev": 0.6948811638926279,
"ci95": 0.12433001549481598,
"min": 5.5,
"max": 8.93,
"length_correlation": -0.019696338883851222
},
"gpt-3.5-turbo-0125": {
"count": 120,
"mean": 5.788083333333334,
"median": 5.61,
"stdev": 0.9529796666760729,
"ci95": 0.17050969702553154,
"min": 4.11,
"max": 7.64,
"length_correlation": 0.004944641490658726
},
"databricks/dbrx-instruct": {
"count": 120,
"mean": 5.7805,
"median": 5.68,
"stdev": 1.030556551928466,
"ci95": 0.1843899629568152,
"min": 2.28,
"max": 7.82,
"length_correlation": -0.14338900270530267
}
},
"calibrated_model_stats": {
"claude-3-5-sonnet-20240620": {
"count": 120,
"mean": 6.542123734755345,
"median": 6.76,
"stdev": 1.2721508405086994,
"ci95": 0.22761666588595203,
"min": 3.269767441860464,
"max": 9.044585987261147,
"length_correlation": 0.025261342199355
},
"claude-3-haiku-20240307": {
"count": 120,
"mean": 4.636368130704195,
"median": 4.46046511627907,
"stdev": 1.5733029857716891,
"ci95": 0.2814996214651453,
"min": 1.9815668202764976,
"max": 8.700636942675159,
"length_correlation": -0.07436072469851716
},
"claude-3-opus-20240229": {
"count": 120,
"mean": 6.006738886116737,
"median": 5.853333333333332,
"stdev": 1.6926222446128587,
"ci95": 0.30284854567176667,
"min": 1.5483870967741935,
"max": 8.96815286624204,
"length_correlation": 0.05675896848058143
},
"gemini-1.5-pro-001": {
"count": 120,
"mean": 6.633947026489713,
"median": 7.0,
"stdev": 1.5575968713638635,
"ci95": 0.27868944103551707,
"min": 1.456221198156682,
"max": 9.044585987261147,
"length_correlation": -0.1689457108215192
},
"Llama-3-70b-chat-hf": {
"count": 120,
"mean": 5.462678843201853,
"median": 5.466666666666666,
"stdev": 1.7652800033125053,
"ci95": 0.3158486681881708,
"min": 1.9447004608294929,
"max": 8.490445859872612,
"length_correlation": -0.06405314590066087
},
"Mixtral-8x7B-Instruct-v0.1": {
"count": 120,
"mean": 4.361470805523643,
"median": 3.902325581395348,
"stdev": 1.8497263154753996,
"ci95": 0.33095803054428585,
"min": 1.6866359447004613,
"max": 8.29936305732484,
"length_correlation": -0.13632102863406825
},
"Llama-2-13b-chat-hf": {
"count": 120,
"mean": 3.6894883315915417,
"median": 3.102325581395349,
"stdev": 1.6555232732602032,
"ci95": 0.2962106974715381,
"min": 1.3548387096774193,
"max": 8.356687898089172,
"length_correlation": -0.10942449528580632
},
"gemma-7b-it": {
"count": 120,
"mean": 2.8818536198350464,
"median": 2.6728110599078336,
"stdev": 1.0675286818161855,
"ci95": 0.19100511633939674,
"min": 0.9953917050691244,
"max": 7.343949044585987,
"length_correlation": -0.045518150437232285
},
"gemma-2b-it": {
"count": 120,
"mean": 3.111781036402564,
"median": 2.649769585253456,
"stdev": 1.5471640954984252,
"ci95": 0.2768227805870777,
"min": 0.728110599078341,
"max": 8.834394904458598,
"length_correlation": 0.07826837196790958
},
"Mixtral-8x22B-Instruct-v0.1": {
"count": 120,
"mean": 4.497523131692841,
"median": 4.293023255813953,
"stdev": 1.8389936114927405,
"ci95": 0.3290377061466721,
"min": 1.6221198156682028,
"max": 8.910828025477706,
"length_correlation": -0.05959238395483197
},
"c4ai-command-r-08-2024": {
"count": 120,
"mean": 4.176128426176911,
"median": 3.967441860465116,
"stdev": 1.6850868369445653,
"ci95": 0.30150029017019225,
"min": 1.5207373271889402,
"max": 8.146496815286625,
"length_correlation": -0.04255016676357964
},
"gemini-1.5-pro-002": {
"count": 120,
"mean": 6.768393029103499,
"median": 7.05732484076433,
"stdev": 1.4369046554185252,
"ci95": 0.25709486363392525,
"min": 2.313364055299539,
"max": 9.522292993630574,
"length_correlation": -0.2216318471818696
},
"Mistral-Large-Instruct-2411": {
"count": 120,
"mean": 5.693062916851445,
"median": 5.56,
"stdev": 1.8596116410643733,
"ci95": 0.3327267397099848,
"min": 1.8156682027649769,
"max": 9.445859872611466,
"length_correlation": 0.03794450806760933
},
"gpt-4o-2024-11-20": {
"count": 120,
"mean": 7.139224481203709,
"median": 7.611464968152865,
"stdev": 1.4091046099950555,
"ci95": 0.2521208043877451,
"min": 2.838709677419355,
"max": 9.522292993630574,
"length_correlation": -0.018834719742951377
},
"DeepSeek-R1": {
"count": 120,
"mean": 7.36723041747698,
"median": 7.7834394904458595,
"stdev": 1.4981224139291323,
"ci95": 0.26804811040427196,
"min": 2.967741935483871,
"max": 10.0,
"length_correlation": -0.034444445700050605
},
"gpt-3.5-turbo-0125": {
"count": 120,
"mean": 3.8213516990864336,
"median": 3.1395348837209305,
"stdev": 1.6490881827298234,
"ci95": 0.29505931368546634,
"min": 1.6866359447004613,
"max": 7.535031847133757,
"length_correlation": 0.013383240687352738
},
"databricks/dbrx-instruct": {
"count": 120,
"mean": 3.825369075359528,
"median": 3.269767441860464,
"stdev": 1.7053593878739786,
"ci95": 0.30512750976131486,
"min": 0.0,
"max": 7.8789808917197455,
"length_correlation": -0.13451908193809253
}
},
"raw_cross_model_stats": {
"anova_f": 98.62940740981419,
"anova_p": 3.3442089791963187e-239,
"kw_stat": 899.720012563311,
"kw_p": 3.1929218927670664e-181,
"std_dev_across_models": 0.7531250389851643,
"pearson_r": 0.9510799368864087,
"kendall_tau": 0.8852941176470588,
"normalized_components": {
"pearson_r": 0.8369331229546957,
"kendall_tau": 0.8725490196078431,
"anova_f": 0.2817983068851834,
"kw_stat": 0.599813341708874,
"std_dev": 0.3423295631750746,
"ci99_overlap_magnitude_sum_norm": 0.7252395935014663,
"raw_score_range_norm": 0.30103125,
"kendall_tau_bootstrapped": 0.7728774509803921
}
},
"calibrated_cross_model_stats": {
"anova_f": 98.74106983762412,
"anova_p": 2.033904993012262e-239,
"kw_stat": 899.720012563311,
"kw_p": 3.1929218927670664e-181,
"std_dev_across_models": 1.4125338266635763,
"pearson_r": 0.952652337461069,
"kendall_tau": 0.8647058823529411,
"normalized_components": {
"pearson_r": 0.8421744582035635,
"kendall_tau": 0.849673202614379,
"anova_f": 0.2821173423932118,
"kw_stat": 0.599813341708874,
"std_dev": 0.6420608303016255,
"ci99_overlap_magnitude_sum_norm": 0.4740037453347099,
"calibrated_score_range_norm": 0.5606720997052417,
"kendall_tau_bootstrapped": 0.7636225490196077
}
},
"separability_metrics": {
"raw": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": true,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": true,
"gemini-1.5-pro-002__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": true,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Llama-3-70b-chat-hf": true,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-2b-it": true,
"gemma-2b-it__gemma-7b-it": true
},
"adjacent_overlap_fraction": 1.0,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.3608176685194211,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.28247406140666964,
"gemini-1.5-pro-002__gemini-1.5-pro-001": 0.42760248554448665,
"gemini-1.5-pro-001__claude-3-5-sonnet-20240620": 0.3950432457238424,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.23226997521286297,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.4420056459707382,
"Mistral-Large-Instruct-2411__Llama-3-70b-chat-hf": 0.5194312790092237,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.20299725670857427,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5000808113294992,
"Mixtral-8x22B-Instruct-v0.1__Mixtral-8x7B-Instruct-v0.1": 0.6195024110748033,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.5915668284662479,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.44269737128262143,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.6722508277616459,
"databricks/dbrx-instruct__Llama-2-13b-chat-hf": 0.6323261045303354,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.2814236786222839,
"gemma-2b-it__gemma-7b-it": 0.5412809177986215
},
"ci99_overlap_magnitude_sum": 7.143770568961878,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.18057589252541995,
"emd": {
"average": 0.9235857843137256,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 0.9128333333333334,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.27475,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.14350000000000002,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 0.5280833333333332,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 1.12175,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 1.5113333333333334,
"claude-3-5-sonnet-20240620__gemma-7b-it": 2.0147500000000003,
"claude-3-5-sonnet-20240620__gemma-2b-it": 1.92375,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 1.0405833333333332,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 1.2124166666666667,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.13791666666666663,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 0.45025000000000004,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.29124999999999995,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 0.3965000000000001,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 1.4354166666666668,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 1.443,
"claude-3-haiku-20240307__claude-3-opus-20240229": 0.6715833333333334,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 0.9526666666666668,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.39625,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 0.2559166666666666,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 0.5996666666666668,
"claude-3-haiku-20240307__gemma-7b-it": 1.1019166666666664,
"claude-3-haiku-20240307__gemma-2b-it": 1.01325,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.19774999999999995,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.3005833333333333,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 1.01325,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.52525,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 1.19325,
"claude-3-haiku-20240307__DeepSeek-R1": 1.3063333333333331,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 0.5225833333333334,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 0.5313333333333333,
"claude-3-opus-20240229__gemini-1.5-pro-001": 0.28275,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.2861666666666667,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 0.8751666666666666,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 1.2622499999999999,
"claude-3-opus-20240229__gemma-7b-it": 1.7656666666666667,
"claude-3-opus-20240229__gemma-2b-it": 1.674666666666667,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 0.7928333333333335,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 0.9633333333333334,
"claude-3-opus-20240229__gemini-1.5-pro-002": 0.34950000000000003,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.18600000000000003,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 0.5295,
"claude-3-opus-20240229__DeepSeek-R1": 0.6425833333333333,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 1.1888333333333334,
"claude-3-opus-20240229__databricks/dbrx-instruct": 1.1939166666666667,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 0.56725,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 1.15625,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 1.5416666666666665,
"gemini-1.5-pro-001__gemma-7b-it": 2.0450833333333334,
"gemini-1.5-pro-001__gemma-2b-it": 1.9540833333333332,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 1.0739166666666669,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 1.2439166666666668,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.08108333333333334,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 0.4579166666666667,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.2500833333333333,
"gemini-1.5-pro-001__DeepSeek-R1": 0.3631666666666666,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 1.4699166666666668,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 1.4733333333333334,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 0.5936666666666667,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 0.9832500000000002,
"Llama-3-70b-chat-hf__gemma-7b-it": 1.4866666666666668,
"Llama-3-70b-chat-hf__gemma-2b-it": 1.3986666666666667,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.5161666666666667,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 0.6843333333333333,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 0.6285000000000001,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.13266666666666665,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 0.8085,
"Llama-3-70b-chat-hf__DeepSeek-R1": 0.9215833333333334,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 0.9073333333333333,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 0.9149166666666667,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.39008333333333334,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 0.893,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 0.8066666666666666,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.10349999999999993,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.133,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 1.2221666666666666,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.7073333333333334,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 1.4021666666666666,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 1.5152499999999998,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.31366666666666665,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.32125000000000004,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.5034166666666666,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.41774999999999995,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.4730833333333333,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.3107500000000001,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 1.61175,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.096083333333333,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 1.79175,
"Llama-2-13b-chat-hf__DeepSeek-R1": 1.9048333333333332,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.11325000000000002,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.15699999999999997,
"gemma-7b-it__gemma-2b-it": 0.18400000000000002,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 0.9741666666666666,
"gemma-7b-it__c4ai-command-r-08-2024": 0.8023333333333333,
"gemma-7b-it__gemini-1.5-pro-002": 2.1151666666666666,
"gemma-7b-it__Mistral-Large-Instruct-2411": 1.5995,
"gemma-7b-it__gpt-4o-2024-11-20": 2.295166666666667,
"gemma-7b-it__DeepSeek-R1": 2.40825,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.5793333333333333,
"gemma-7b-it__databricks/dbrx-instruct": 0.58975,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 0.8831666666666664,
"gemma-2b-it__c4ai-command-r-08-2024": 0.7185,
"gemma-2b-it__gemini-1.5-pro-002": 2.0241666666666664,
"gemma-2b-it__Mistral-Large-Instruct-2411": 1.5084999999999997,
"gemma-2b-it__gpt-4o-2024-11-20": 2.2041666666666666,
"gemma-2b-it__DeepSeek-R1": 2.3172499999999996,
"gemma-2b-it__gpt-3.5-turbo-0125": 0.5091666666666668,
"gemma-2b-it__databricks/dbrx-instruct": 0.5050833333333334,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.1836666666666666,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 1.141,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.6253333333333333,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 1.3210000000000002,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 1.4340833333333332,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.39783333333333326,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.4024166666666666,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 1.3128333333333335,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.7985,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 1.4928333333333335,
"c4ai-command-r-08-2024__DeepSeek-R1": 1.6059166666666664,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.22899999999999998,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.23958333333333337,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 0.5168333333333334,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.1818333333333333,
"gemini-1.5-pro-002__DeepSeek-R1": 0.29308333333333325,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 1.5358333333333334,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 1.5434166666666667,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 0.6956666666666667,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 0.8087499999999999,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.0201666666666667,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.02775,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.11974999999999997,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 1.7158333333333333,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 1.7234166666666666,
"DeepSeek-R1__gpt-3.5-turbo-0125": 1.8289166666666663,
"DeepSeek-R1__databricks/dbrx-instruct": 1.8365,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.10374999999999995
}
},
"average_ci95": 0.15111974836295056,
"modulated_ci95": 0.4497547349798526
},
"calibrated": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": true,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": true,
"gemini-1.5-pro-002__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": true,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Llama-3-70b-chat-hf": true,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": false,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-2b-it": true,
"gemma-2b-it__gemma-7b-it": true
},
"adjacent_overlap_fraction": 0.9375,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.7974019720301788,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.6329843434480908,
"gemini-1.5-pro-002__gemini-1.5-pro-001": 0.9217444915309505,
"gemini-1.5-pro-001__claude-3-5-sonnet-20240620": 0.8974005274976555,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.5103201282315251,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.9392322792279009,
"Mistral-Large-Instruct-2411__Llama-3-70b-chat-hf": 1.0481512895329983,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.3512407382017013,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 1.0647059749985601,
"Mixtral-8x22B-Instruct-v0.1__Mixtral-8x7B-Instruct-v0.1": 1.1649959075955332,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 1.0614214031300642,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.8450847793440786,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 1.163299632361304,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 1.0337059853478872,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.5519124175900498,
"gemma-2b-it__gemma-7b-it": 0.6923007512290651
},
"ci99_overlap_magnitude_sum": 13.675902621297542,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.17631262163150935,
"emd": {
"average": 1.7262718743344407,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 1.9057556040511492,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.584429434625869,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.2747329715352708,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.079444891553492,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 2.1806529292317007,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 2.8526354031638026,
"claude-3-5-sonnet-20240620__gemma-7b-it": 3.660270114920298,
"claude-3-5-sonnet-20240620__gemma-2b-it": 3.4303426983527805,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.044600603062504,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 2.3659953085784333,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.285992296527271,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 0.9159397987956194,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6133440929992887,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 0.8301404411612459,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 2.7207720356689107,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 2.716754659395816,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.377590417470913,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.0063346561541815,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.8387232674107259,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 0.38183858491162054,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 0.9491090984757107,
"claude-3-haiku-20240307__gemma-7b-it": 1.7545145108691487,
"claude-3-haiku-20240307__gemma-2b-it": 1.5290456930277456,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.29715945448297154,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.46252845187335645,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.1320248983993038,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 1.0821940180980942,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.5028563504995143,
"claude-3-haiku-20240307__DeepSeek-R1": 2.7308622867727848,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 0.8150164316177618,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 0.8132283547077246,
"claude-3-opus-20240229__gemini-1.5-pro-001": 0.6302803369935599,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.5506652656491391,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 1.647572228058531,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.3172505545251942,
"claude-3-opus-20240229__gemma-7b-it": 3.1248852662816904,
"claude-3-opus-20240229__gemma-2b-it": 2.8949578497141726,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.5104446330721293,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.8306104599398259,
"claude-3-opus-20240229__gemini-1.5-pro-002": 0.7616541429867627,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.3474229402816279,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.132485595086973,
"claude-3-opus-20240229__DeepSeek-R1": 1.360491531360244,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.1876913344957405,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.181369810757208,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.179409504332407,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 2.2763164667417994,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 2.9444586948981706,
"gemini-1.5-pro-001__gemma-7b-it": 3.7520934066546654,
"gemini-1.5-pro-001__gemma-2b-it": 3.5221659900871485,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.139188871755397,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.458893869130006,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15584303021463583,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 0.9580213898636928,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.505277454713997,
"gemini-1.5-pro-001__DeepSeek-R1": 0.7332833909872676,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 2.816435573179009,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 2.808577951130185,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1012080376782092,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.7731905116103106,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.580825223366806,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.3566302908757213,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.9721620809357632,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.2865504170249418,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.3057141859016468,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.25186282428963325,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 1.6765456380018573,
"Llama-3-70b-chat-hf__DeepSeek-R1": 1.9045515742751278,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.6413271441154191,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.6373097678423245,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.6729378879448402,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.4796171856885967,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.2586069665733084,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.1736413888883199,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.2364072884311464,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 2.4069222235798557,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 1.3323601604829474,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 2.7777536756800663,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 3.0057596119533363,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.54011910643721,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.5361017301641152,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.8076347117564953,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.5878983779915254,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.8101853377357076,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.5051300218897612,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.078904697511957,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 2.0035745852599027,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 3.4497361496121677,
"Llama-2-13b-chat-hf__DeepSeek-R1": 3.677742085885438,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19501537600050606,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.262152036340902,
"gemma-7b-it__gemma-2b-it": 0.3156417022818033,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.6156695118577944,
"gemma-7b-it__c4ai-command-r-08-2024": 1.2942748063418643,
"gemma-7b-it__gemini-1.5-pro-002": 3.886539409268452,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.8112092970163975,
"gemma-7b-it__gpt-4o-2024-11-20": 4.2573708613686625,
"gemma-7b-it__DeepSeek-R1": 4.485376797641933,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.9394980792513868,
"gemma-7b-it__databricks/dbrx-instruct": 0.9601053172756335,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 1.3857420952902766,
"gemma-2b-it__c4ai-command-r-08-2024": 1.0780416572902702,
"gemma-2b-it__gemini-1.5-pro-002": 3.6566119927009346,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5812818804488806,
"gemma-2b-it__gpt-4o-2024-11-20": 4.027443444801145,
"gemma-2b-it__DeepSeek-R1": 4.255449381074416,
"gemma-2b-it__gpt-3.5-turbo-0125": 0.7493795798813215,
"gemma-2b-it__databricks/dbrx-instruct": 0.7470607952261039,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.3323010035190021,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.270869897410658,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 1.195539785158604,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 2.641701349510869,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 2.8697072857841395,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6789364095649328,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.6721540563333127,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 2.592264602926588,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 1.5181633693227672,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 2.963096055026799,
"c4ai-command-r-08-2024__DeepSeek-R1": 3.1911019913000693,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.3603066810075281,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.369594453435925,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 1.077559411615112,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.3730687971507588,
"gemini-1.5-pro-002__DeepSeek-R1": 0.5988373883734809,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 2.9470413300170653,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 2.943023953743971,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 1.4461615643522647,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 1.6741675006255354,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.871711217765011,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.8676938414919166,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.24108087167378733,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 3.317872782117276,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 3.3138554058441816,
"DeepSeek-R1__gpt-3.5-turbo-0125": 3.5458787183905467,
"DeepSeek-R1__databricks/dbrx-instruct": 3.5418613421174516,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.16207163187521173
}
},
"average_ci95": 0.2848361708875543,
"modulated_ci95": 0.24267056424594924
}
},
"calibrated_score_range": 4.485376797641933,
"final_judgemark_score": 0.5967480197034433,
"iteration_stability": {
"raw": {
"scoring_stability": {
"claude-3-5-sonnet-20240620": {
"mean_iter_score": 7.2235,
"iteration_count": 5,
"stdev_across_iters": 0.03617722580114012
},
"claude-3-haiku-20240307": {
"mean_iter_score": 6.310666666666666,
"iteration_count": 5,
"stdev_across_iters": 0.22339032233091735
},
"claude-3-opus-20240229": {
"mean_iter_score": 6.9744166666666665,
"iteration_count": 5,
"stdev_across_iters": 0.10051692781704866
},
"gemini-1.5-pro-001": {
"mean_iter_score": 7.253833333333334,
"iteration_count": 5,
"stdev_across_iters": 0.15804829606449067
},
"Llama-3-70b-chat-hf": {
"mean_iter_score": 6.695416666666667,
"iteration_count": 5,
"stdev_across_iters": 0.13199642251044374
},
"Mixtral-8x7B-Instruct-v0.1": {
"mean_iter_score": 6.10175,
"iteration_count": 5,
"stdev_across_iters": 0.166703620903153
},
"Llama-2-13b-chat-hf": {
"mean_iter_score": 5.712166666666667,
"iteration_count": 5,
"stdev_across_iters": 0.11729544842926452
},
"gemma-7b-it": {
"mean_iter_score": 5.20875,
"iteration_count": 5,
"stdev_across_iters": 0.20389335447728557
},
"gemma-2b-it": {
"mean_iter_score": 5.29975,
"iteration_count": 5,
"stdev_across_iters": 0.163325807649754
},
"Mixtral-8x22B-Instruct-v0.1": {
"mean_iter_score": 6.182916666666666,
"iteration_count": 5,
"stdev_across_iters": 0.16633571307582892
},
"c4ai-command-r-08-2024": {
"mean_iter_score": 6.011083333333334,
"iteration_count": 5,
"stdev_across_iters": 0.12852626190782962
},
"gemini-1.5-pro-002": {
"mean_iter_score": 7.323916666666666,
"iteration_count": 5,
"stdev_across_iters": 0.1513684338882376
},
"Mistral-Large-Instruct-2411": {
"mean_iter_score": 6.80825,
"iteration_count": 5,
"stdev_across_iters": 0.09038743767188488
},
"gpt-4o-2024-11-20": {
"mean_iter_score": 7.503916666666667,
"iteration_count": 5,
"stdev_across_iters": 0.07038613736619805
},
"DeepSeek-R1": {
"mean_iter_score": 7.617,
"iteration_count": 5,
"stdev_across_iters": 0.04891702157736093
},
"gpt-3.5-turbo-0125": {
"mean_iter_score": 5.788083333333334,
"iteration_count": 5,
"stdev_across_iters": 0.09502397358327817
},
"databricks/dbrx-instruct": {
"mean_iter_score": 5.7805,
"iteration_count": 5,
"stdev_across_iters": 0.12437382048575249
}
},
"ranking_stability": {
"pairwise_correlation": {
"1__vs__2": {
"common_model_count": 17,
"kendall_tau": 0.9411764705882352,
"p_value": 2.628150241362193e-11
},
"1__vs__3": {
"common_model_count": 17,
"kendall_tau": 0.8823529411764705,
"p_value": 3.5743855407137387e-09
},
"1__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.8970588235294118,
"p_value": 1.2313901628307946e-09
},
"1__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.8529411764705882,
"p_value": 2.3940311991296275e-08
},
"2__vs__3": {
"common_model_count": 17,
"kendall_tau": 0.9117647058823529,
"p_value": 3.8599058936360526e-10
},
"2__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.8676470588235293,
"p_value": 9.575975226992579e-09
},
"2__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.8529411764705882,
"p_value": 2.3940311991296275e-08
},
"3__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.8676470588235293,
"p_value": 9.575975226992579e-09
},
"3__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.9117647058823529,
"p_value": 3.8599058936360526e-10
},
"4__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.8676470588235293,
"p_value": 9.575975226992579e-09
}
},
"average_kendall_tau": 0.8852941176470588
},
"randomized_average_kendall_tau_by_item": 0.8637264705882353
},
"calibrated": {
"scoring_stability": {
"claude-3-5-sonnet-20240620": {
"mean_iter_score": 6.542123734755345,
"iteration_count": 5,
"stdev_across_iters": 0.09236895222889248
},
"claude-3-haiku-20240307": {
"mean_iter_score": 4.636368130704195,
"iteration_count": 5,
"stdev_across_iters": 0.40876832801072965
},
"claude-3-opus-20240229": {
"mean_iter_score": 6.006738886116736,
"iteration_count": 5,
"stdev_across_iters": 0.1954030761915741
},
"gemini-1.5-pro-001": {
"mean_iter_score": 6.633947026489713,
"iteration_count": 5,
"stdev_across_iters": 0.3065079684952029
},
"Llama-3-70b-chat-hf": {
"mean_iter_score": 5.462678843201853,
"iteration_count": 5,
"stdev_across_iters": 0.2522382666581595
},
"Mixtral-8x7B-Instruct-v0.1": {
"mean_iter_score": 4.361470805523643,
"iteration_count": 5,
"stdev_across_iters": 0.29311810551982037
},
"Llama-2-13b-chat-hf": {
"mean_iter_score": 3.6894883315915417,
"iteration_count": 5,
"stdev_across_iters": 0.2158313173822275
},
"gemma-7b-it": {
"mean_iter_score": 2.8818536198350464,
"iteration_count": 5,
"stdev_across_iters": 0.2912302860275781
},
"gemma-2b-it": {
"mean_iter_score": 3.111781036402564,
"iteration_count": 5,
"stdev_across_iters": 0.27254835539472333
},
"Mixtral-8x22B-Instruct-v0.1": {
"mean_iter_score": 4.497523131692841,
"iteration_count": 5,
"stdev_across_iters": 0.31147925519246866
},
"c4ai-command-r-08-2024": {
"mean_iter_score": 4.176128426176911,
"iteration_count": 5,
"stdev_across_iters": 0.2828102404064592
},
"gemini-1.5-pro-002": {
"mean_iter_score": 6.768393029103499,
"iteration_count": 5,
"stdev_across_iters": 0.3171448114140151
},
"Mistral-Large-Instruct-2411": {
"mean_iter_score": 5.693062916851445,
"iteration_count": 5,
"stdev_across_iters": 0.18906079419021307
},
"gpt-4o-2024-11-20": {
"mean_iter_score": 7.13922448120371,
"iteration_count": 5,
"stdev_across_iters": 0.15669333981560607
},
"DeepSeek-R1": {
"mean_iter_score": 7.36723041747698,
"iteration_count": 5,
"stdev_across_iters": 0.1125275570527182
},
"gpt-3.5-turbo-0125": {
"mean_iter_score": 3.8213516990864336,
"iteration_count": 5,
"stdev_across_iters": 0.15197794230446607
},
"databricks/dbrx-instruct": {
"mean_iter_score": 3.8253690753595277,
"iteration_count": 5,
"stdev_across_iters": 0.21301063774688453
}
},
"ranking_stability": {
"pairwise_correlation": {
"1__vs__2": {
"common_model_count": 17,
"kendall_tau": 0.8823529411764705,
"p_value": 3.5743855407137387e-09
},
"1__vs__3": {
"common_model_count": 17,
"kendall_tau": 0.8970588235294118,
"p_value": 1.2313901628307946e-09
},
"1__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.8970588235294118,
"p_value": 1.2313901628307946e-09
},
"1__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.8676470588235293,
"p_value": 9.575975226992579e-09
},
"2__vs__3": {
"common_model_count": 17,
"kendall_tau": 0.8676470588235293,
"p_value": 9.575975226992579e-09
},
"2__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.8382352941176471,
"p_value": 5.634316092440314e-08
},
"2__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.8088235294117646,
"p_value": 2.674946328840178e-07
},
"3__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.8529411764705882,
"p_value": 2.3940311991296275e-08
},
"3__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.8823529411764705,
"p_value": 3.5743855407137387e-09
},
"4__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.9117647058823529,
"p_value": 3.8599058936360526e-10
}
},
"average_kendall_tau": 0.8705882352941176
},
"randomized_average_kendall_tau_by_item": 0.8581735294117646
}
},
"raw_score_range": 2.40825,
"final_judgemark_score_raw": 0.5685782928140057,
"final_judgemark_score_elements_raw": {
"norm_stability_between_iterations": 0.7728774509803921,
"norm_correlation_with_lmsys_arena": 0.8725490196078431,
"norm_std_dev_between_models": 0.3423295631750746,
"norm_kruskall_wallis": 0.599813341708874,
"norm_ci99_adjacent_overlap": 0.7252395935014663,
"norm_score_range": 0.30103125,
"norm_intra_model_ci95": 0.4497547349798526,
"norm_earth_movers_distance": 0.2308964460784314
},
"final_judgemark_score_elements_calibrated": {
"norm_stability_between_iterations": 0.7636225490196077,
"norm_correlation_with_lmsys_arena": 0.849673202614379,
"norm_std_dev_between_models": 0.6420608303016255,
"norm_kruskall_wallis": 0.599813341708874,
"norm_ci99_adjacent_overlap": 0.4740037453347099,
"norm_score_range": 0.5606720997052417,
"norm_intra_model_ci95": 0.24267056424594924,
"norm_earth_movers_distance": {
"pearson_r": 0.8421744582035635,
"kendall_tau": 0.849673202614379,
"anova_f": 0.2821173423932118,
"kw_stat": 0.599813341708874,
"std_dev": 0.6420608303016255,
"ci99_overlap_magnitude_sum_norm": 0.4740037453347099,
"calibrated_score_range_norm": 0.5606720997052417,
"kendall_tau_bootstrapped": 0.7636225490196077
}
}
}