Files
Judgemark-v2lp/results/stats/ministral__Ministral-3b-instruct.json
T
2025-01-31 18:03:33 +11:00

1128 lines
56 KiB
JSON

{
"judge_model": "mistralai/ministral-3b",
"start_time": "2025-01-29T15:16:13.210235",
"status": "completed",
"samples_file": "data/judgemark_v2.1_samples.json",
"prompts_file": "data/judge_prompts.json",
"end_time": "2025-01-31T15:22:46.181462",
"raw_score_distribution": {
"count": 2040,
"min": 6.21,
"max": 9.86,
"mean": 8.438,
"median": 8.54,
"stdev": 0.629,
"p10": 7.75,
"p25": 8.18,
"p75": 8.86,
"p90": 9.07
},
"calibration_config": {
"method": "piecewise_landmark",
"in_landmarks": [
6.21,
8.18,
8.54,
8.86,
9.86
],
"out_landmarks": [
0,
3,
5,
7,
10
]
},
"calibrated_score_distribution": {
"count": 2040,
"min": 0.0,
"max": 10.0,
"mean": 4.928,
"median": 5.0,
"stdev": 2.185,
"p10": 2.345,
"p25": 3.0,
"p75": 7.0,
"p90": 7.63
},
"raw_model_stats": {
"claude-3-5-sonnet-20240620": {
"count": 120,
"mean": 8.532416666666666,
"median": 8.5,
"stdev": 0.5648395263047643,
"ci95": 0.10106261430969966,
"min": 6.64,
"max": 9.71,
"length_correlation": 0.04985478354425061
},
"claude-3-haiku-20240307": {
"count": 120,
"mean": 8.39525,
"median": 8.5,
"stdev": 0.6176044293011806,
"ci95": 0.11050345333083132,
"min": 6.32,
"max": 9.32,
"length_correlation": -0.017062013112114513
},
"claude-3-opus-20240229": {
"count": 120,
"mean": 8.4215,
"median": 8.54,
"stdev": 0.6120136896365318,
"ci95": 0.1095031430831921,
"min": 6.43,
"max": 9.64,
"length_correlation": 0.18640189231874763
},
"gemini-1.5-pro-001": {
"count": 120,
"mean": 8.507833333333334,
"median": 8.61,
"stdev": 0.6078798854889614,
"ci95": 0.10876351167508073,
"min": 6.61,
"max": 9.61,
"length_correlation": 0.026274651455959155
},
"Llama-3-70b-chat-hf": {
"count": 120,
"mean": 8.4525,
"median": 8.5,
"stdev": 0.5488142297932238,
"ci95": 0.09819532495560701,
"min": 6.39,
"max": 9.61,
"length_correlation": -0.03695786897090497
},
"Mixtral-8x7B-Instruct-v0.1": {
"count": 120,
"mean": 8.420916666666667,
"median": 8.57,
"stdev": 0.6853631753113303,
"ci95": 0.12262703125911854,
"min": 6.29,
"max": 9.71,
"length_correlation": -0.14055861541447986
},
"Llama-2-13b-chat-hf": {
"count": 120,
"mean": 8.3295,
"median": 8.54,
"stdev": 0.7257414058983366,
"ci95": 0.1298516133825037,
"min": 6.21,
"max": 9.57,
"length_correlation": 0.04909190676026223
},
"gemma-2b-it": {
"count": 120,
"mean": 8.418416666666667,
"median": 8.57,
"stdev": 0.6841138484953225,
"ci95": 0.12240349832936953,
"min": 6.32,
"max": 9.32,
"length_correlation": -0.19698830804676526
},
"Mixtral-8x22B-Instruct-v0.1": {
"count": 120,
"mean": 8.418083333333334,
"median": 8.59,
"stdev": 0.6802598315168625,
"ci95": 0.12171392719757344,
"min": 6.21,
"max": 9.64,
"length_correlation": -0.09499038364263362
},
"c4ai-command-r-08-2024": {
"count": 120,
"mean": 8.512333333333332,
"median": 8.59,
"stdev": 0.636395750944904,
"ci95": 0.11386564737570143,
"min": 6.5,
"max": 9.73,
"length_correlation": 0.09411804998428688
},
"gemini-1.5-pro-002": {
"count": 120,
"mean": 8.502916666666666,
"median": 8.52,
"stdev": 0.5063685106989285,
"ci95": 0.09060082220189906,
"min": 6.57,
"max": 9.79,
"length_correlation": 0.09889509343072495
},
"gpt-4o-2024-11-20": {
"count": 120,
"mean": 8.480083333333333,
"median": 8.57,
"stdev": 0.580339651104642,
"ci95": 0.1038359385220655,
"min": 6.57,
"max": 9.86,
"length_correlation": 0.11063667194359553
},
"DeepSeek-R1": {
"count": 120,
"mean": 8.59725,
"median": 8.61,
"stdev": 0.45529003130426876,
"ci95": 0.08146172264850987,
"min": 6.79,
"max": 9.54,
"length_correlation": -0.04352091793549681
},
"gpt-3.5-turbo-0125": {
"count": 120,
"mean": 8.3685,
"median": 8.52,
"stdev": 0.6734866633529469,
"ci95": 0.12050205364778366,
"min": 6.5,
"max": 9.61,
"length_correlation": -0.030100336455022927
},
"gemma-7b-it": {
"count": 120,
"mean": 8.328416666666667,
"median": 8.46,
"stdev": 0.6447786354986633,
"ci95": 0.11536553573160625,
"min": 6.43,
"max": 9.32,
"length_correlation": -0.0030055408484553074
},
"Mistral-Large-Instruct-2411": {
"count": 120,
"mean": 8.410666666666666,
"median": 8.48,
"stdev": 0.6658143851841948,
"ci95": 0.11912930890642687,
"min": 6.5,
"max": 9.64,
"length_correlation": 0.007416016420082435
},
"databricks/dbrx-instruct": {
"count": 120,
"mean": 8.344833333333334,
"median": 8.48,
"stdev": 0.7053623936974363,
"ci95": 0.1262053454530157,
"min": 6.32,
"max": 9.43,
"length_correlation": -0.01595056522428744
}
},
"calibrated_model_stats": {
"claude-3-5-sonnet-20240620": {
"count": 120,
"mean": 5.137510963056968,
"median": 4.777777777777782,
"stdev": 2.2315985125681475,
"ci95": 0.3992836359119931,
"min": 0.6548223350253803,
"max": 9.550000000000004,
"length_correlation": 0.04184667380145494
},
"claude-3-haiku-20240307": {
"count": 120,
"mean": 4.783023289152099,
"median": 4.777777777777782,
"stdev": 2.0853402229638354,
"ci95": 0.3731147075287808,
"min": 0.16751269035533045,
"max": 8.380000000000003,
"length_correlation": 0.006413216205751215
},
"claude-3-opus-20240229": {
"count": 120,
"mean": 4.850987262643356,
"median": 5.0,
"stdev": 2.111533947611521,
"ci95": 0.37780135952128885,
"min": 0.33502538071065957,
"max": 9.340000000000003,
"length_correlation": 0.2125946486558844
},
"gemini-1.5-pro-001": {
"count": 120,
"mean": 5.185520586576426,
"median": 5.437500000000002,
"stdev": 2.151423641949216,
"ci95": 0.3849385314188644,
"min": 0.6091370558375641,
"max": 9.25,
"length_correlation": 0.0626416143021296
},
"Llama-3-70b-chat-hf": {
"count": 120,
"mean": 4.846935901955257,
"median": 4.777777777777782,
"stdev": 2.0032854395133577,
"ci95": 0.3584332439520849,
"min": 0.27411167512690315,
"max": 9.25,
"length_correlation": -0.04408206087010364
},
"Mixtral-8x7B-Instruct-v0.1": {
"count": 120,
"mean": 4.9412156537883085,
"median": 5.187500000000007,
"stdev": 2.367189129849507,
"ci95": 0.42354387553786954,
"min": 0.1218274111675128,
"max": 9.550000000000004,
"length_correlation": -0.15120146399822076
},
"Llama-2-13b-chat-hf": {
"count": 120,
"mean": 4.6702382144200065,
"median": 5.0,
"stdev": 2.2606403291153274,
"ci95": 0.40447987620304005,
"min": 0.0,
"max": 9.130000000000003,
"length_correlation": 0.005326757504722762
},
"gemma-2b-it": {
"count": 120,
"mean": 4.934818657172403,
"median": 5.187500000000007,
"stdev": 2.3340137934510907,
"ci95": 0.4176080547057795,
"min": 0.16751269035533045,
"max": 8.380000000000003,
"length_correlation": -0.23382644038427944
},
"Mixtral-8x22B-Instruct-v0.1": {
"count": 120,
"mean": 4.9373524628689625,
"median": 5.312500000000004,
"stdev": 2.281032322327936,
"ci95": 0.40812846673021913,
"min": 0.0,
"max": 9.340000000000003,
"length_correlation": -0.1101691285826916
},
"c4ai-command-r-08-2024": {
"count": 120,
"mean": 5.237148782665916,
"median": 5.312500000000004,
"stdev": 2.265525313633951,
"ci95": 0.4053539108329185,
"min": 0.4416243654822336,
"max": 9.610000000000003,
"length_correlation": 0.11635605537674598
},
"gemini-1.5-pro-002": {
"count": 120,
"mean": 5.008553334743375,
"median": 4.888888888888891,
"stdev": 1.9899521206085744,
"ci95": 0.3560476105054353,
"min": 0.5482233502538076,
"max": 9.79,
"length_correlation": 0.12318680325253185
},
"gpt-4o-2024-11-20": {
"count": 120,
"mean": 5.021675902425271,
"median": 5.187500000000007,
"stdev": 2.07676149839151,
"ci95": 0.3715797789475712,
"min": 0.5482233502538076,
"max": 10.0,
"length_correlation": 0.0823223592372879
},
"DeepSeek-R1": {
"count": 120,
"mean": 5.415038176818953,
"median": 5.437500000000002,
"stdev": 1.9274446696832293,
"ci95": 0.3448636084833422,
"min": 0.8832487309644672,
"max": 9.04,
"length_correlation": -0.05351117772594884
},
"gpt-3.5-turbo-0125": {
"count": 120,
"mean": 4.751278165538637,
"median": 4.888888888888891,
"stdev": 2.2709386479450844,
"ci95": 0.40632247923533654,
"min": 0.4416243654822336,
"max": 9.25,
"length_correlation": -0.010610384058446908
},
"gemma-7b-it": {
"count": 120,
"mean": 4.556976005828165,
"median": 4.555555555555564,
"stdev": 2.1587977553205104,
"ci95": 0.38625792770898354,
"min": 0.33502538071065957,
"max": 8.380000000000003,
"length_correlation": -0.014501787771161803
},
"Mistral-Large-Instruct-2411": {
"count": 120,
"mean": 4.814644059033656,
"median": 4.666666666666673,
"stdev": 2.250342485520787,
"ci95": 0.4026373582011127,
"min": 0.4416243654822336,
"max": 9.340000000000003,
"length_correlation": 0.03469307325357682
},
"databricks/dbrx-instruct": {
"count": 120,
"mean": 4.683821430250049,
"median": 4.666666666666673,
"stdev": 2.2731134630357923,
"ci95": 0.4067116030279737,
"min": 0.16751269035533045,
"max": 8.71,
"length_correlation": 0.0032255211230608626
}
},
"raw_cross_model_stats": {
"anova_f": 1.7552806220806676,
"anova_p": 0.03172712494638926,
"kw_stat": 18.766818164696122,
"kw_p": 0.28094051316106705,
"std_dev_across_models": 0.07359722755045996,
"pearson_r": 0.4774351405851384,
"kendall_tau": 0.3593563136578234,
"normalized_components": {
"pearson_r": 0.0,
"kendall_tau": 0.288173681842026,
"anova_f": 0.00501508749165905,
"kw_stat": 0.012511212109797415,
"std_dev": 0.03345328525020907,
"ci99_overlap_magnitude_sum_norm": 0.7416739635937193,
"raw_score_range_norm": 0.03360416666666666,
"kendall_tau_bootstrapped": 0.0
}
},
"calibrated_cross_model_stats": {
"anova_f": 1.259657517304955,
"anova_p": 0.214684476805095,
"kw_stat": 18.766818164696122,
"kw_p": 0.28094051316106705,
"std_dev_across_models": 0.21691518252040787,
"pearson_r": 0.3745056186008571,
"kendall_tau": 0.25588235294117645,
"normalized_components": {
"pearson_r": 0.0,
"kendall_tau": 0.17320261437908493,
"anova_f": 0.003599021478014157,
"kw_stat": 0.012511212109797415,
"std_dev": 0.09859781023654902,
"ci99_overlap_magnitude_sum_norm": 0.09409548926636191,
"calibrated_score_range_norm": 0.10725777137384851,
"kendall_tau_bootstrapped": 0.0
}
},
"separability_metrics": {
"raw": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__gemini-1.5-pro-002": true,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": true,
"gpt-4o-2024-11-20__Llama-3-70b-chat-hf": true,
"Llama-3-70b-chat-hf__claude-3-opus-20240229": true,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": true,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true
},
"adjacent_overlap_fraction": 1.0,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__claude-3-5-sonnet-20240620": 0.29497651545323755,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 0.39844904606966836,
"c4ai-command-r-08-2024__gemini-1.5-pro-001": 0.4288105722391151,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.35720242768358546,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.35720242768358546,
"gpt-4o-2024-11-20__Llama-3-70b-chat-hf": 0.37068047944636007,
"Llama-3-70b-chat-hf__claude-3-opus-20240229": 0.3784355614361381,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 0.43172663997611593,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 0.4805281773302035,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 0.47986882702887,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.46735710765850946,
"Mistral-Large-Instruct-2411__claude-3-haiku-20240307": 0.43567045902995716,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 0.4286306792015466,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.462667120072755,
"databricks/dbrx-instruct__Llama-2-13b-chat-hf": 0.48943122121728777,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.45483968503636163
},
"ci99_overlap_magnitude_sum": 6.716476946563297,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.02878652769724302,
"emd": {
"average": 0.11902696078431374,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 0.15149999999999997,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.14058333333333334,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.09391666666666668,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 0.09608333333333334,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 0.13999999999999993,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 0.2259166666666666,
"claude-3-5-sonnet-20240620__gemma-2b-it": 0.143,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 0.14966666666666661,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 0.09324999999999997,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.08133333333333334,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.11,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 0.11083333333333331,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 0.17708333333333326,
"claude-3-5-sonnet-20240620__gemma-7b-it": 0.20400000000000004,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 0.1279166666666666,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 0.20274999999999999,
"claude-3-haiku-20240307__claude-3-opus-20240229": 0.055916666666666684,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 0.12224999999999994,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.0862500000000001,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 0.11866666666666667,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 0.09708333333333338,
"claude-3-haiku-20240307__gemma-2b-it": 0.09933333333333319,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.07999999999999996,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.1240833333333332,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 0.11366666666666668,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 0.09450000000000014,
"claude-3-haiku-20240307__DeepSeek-R1": 0.20199999999999996,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 0.08674999999999991,
"claude-3-haiku-20240307__gemma-7b-it": 0.07366666666666673,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.07741666666666665,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 0.08324999999999998,
"claude-3-opus-20240229__gemini-1.5-pro-001": 0.09283333333333343,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.07150000000000006,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 0.09458333333333346,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 0.10250000000000005,
"claude-3-opus-20240229__gemma-2b-it": 0.08841666666666664,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 0.06225000000000007,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 0.09149999999999998,
"claude-3-opus-20240229__gemini-1.5-pro-002": 0.09725000000000009,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 0.06841666666666674,
"claude-3-opus-20240229__DeepSeek-R1": 0.17741666666666678,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 0.08483333333333326,
"claude-3-opus-20240229__gemma-7b-it": 0.09641666666666662,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.05966666666666667,
"claude-3-opus-20240229__databricks/dbrx-instruct": 0.08933333333333338,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 0.10083333333333327,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 0.10891666666666674,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 0.1783333333333333,
"gemini-1.5-pro-001__gemma-2b-it": 0.11325,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 0.09208333333333334,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 0.0556666666666667,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.0974166666666667,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.06558333333333338,
"gemini-1.5-pro-001__DeepSeek-R1": 0.09474999999999999,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 0.14483333333333337,
"gemini-1.5-pro-001__gemma-7b-it": 0.18608333333333338,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 0.10833333333333334,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 0.16299999999999995,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 0.1427500000000001,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 0.15516666666666679,
"Llama-3-70b-chat-hf__gemma-2b-it": 0.14375000000000002,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.1215833333333334,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 0.11966666666666662,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 0.05858333333333335,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 0.06474999999999989,
"Llama-3-70b-chat-hf__DeepSeek-R1": 0.1489166666666666,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 0.1338333333333334,
"Llama-3-70b-chat-hf__gemma-7b-it": 0.13458333333333344,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.09683333333333352,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 0.1413333333333334,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.12025000000000005,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 0.08566666666666661,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.07466666666666671,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.10024999999999992,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 0.15650000000000008,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 0.12133333333333338,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 0.19133333333333336,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.07958333333333333,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 0.11683333333333341,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.1012500000000001,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.10041666666666671,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.10524999999999986,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.08991666666666667,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.18283333333333324,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 0.1827500000000001,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 0.15125000000000002,
"Llama-2-13b-chat-hf__DeepSeek-R1": 0.26825,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.07366666666666657,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.08175000000000004,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 0.10466666666666664,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.05149999999999997,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 0.05166666666666661,
"gemma-2b-it__c4ai-command-r-08-2024": 0.09924999999999991,
"gemma-2b-it__gemini-1.5-pro-002": 0.16233333333333333,
"gemma-2b-it__gpt-4o-2024-11-20": 0.1343333333333333,
"gemma-2b-it__DeepSeek-R1": 0.19900000000000004,
"gemma-2b-it__gpt-3.5-turbo-0125": 0.07691666666666662,
"gemma-2b-it__gemma-7b-it": 0.11049999999999996,
"gemma-2b-it__Mistral-Large-Instruct-2411": 0.07624999999999992,
"gemma-2b-it__databricks/dbrx-instruct": 0.08874999999999987,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.09424999999999985,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 0.13416666666666677,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 0.10483333333333335,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 0.18383333333333335,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.07458333333333335,
"Mixtral-8x22B-Instruct-v0.1__gemma-7b-it": 0.10966666666666677,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.06908333333333337,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.07908333333333335,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 0.12808333333333335,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 0.08608333333333332,
"c4ai-command-r-08-2024__DeepSeek-R1": 0.1210833333333334,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.14383333333333326,
"c4ai-command-r-08-2024__gemma-7b-it": 0.18574999999999994,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.10766666666666658,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.16749999999999987,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.06633333333333341,
"gemini-1.5-pro-002__DeepSeek-R1": 0.11116666666666664,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 0.1520833333333333,
"gemini-1.5-pro-002__gemma-7b-it": 0.1745000000000001,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 0.12575000000000008,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 0.1675833333333334,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.1285,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 0.13075000000000003,
"gpt-4o-2024-11-20__gemma-7b-it": 0.15450000000000008,
"gpt-4o-2024-11-20__Mistral-Large-Instruct-2411": 0.09875000000000009,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 0.15175000000000008,
"DeepSeek-R1__gpt-3.5-turbo-0125": 0.2299166666666667,
"DeepSeek-R1__gemma-7b-it": 0.26883333333333337,
"DeepSeek-R1__Mistral-Large-Instruct-2411": 0.19191666666666668,
"DeepSeek-R1__databricks/dbrx-instruct": 0.2524166666666667,
"gpt-3.5-turbo-0125__gemma-7b-it": 0.06408333333333335,
"gpt-3.5-turbo-0125__Mistral-Large-Instruct-2411": 0.08066666666666664,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.055833333333333277,
"gemma-7b-it__Mistral-Large-Instruct-2411": 0.09791666666666674,
"gemma-7b-it__databricks/dbrx-instruct": 0.0670833333333334,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 0.07850000000000004
}
},
"average_ci95": 0.11150532305941084,
"modulated_ci95": 0.07196631924310755
},
"calibrated": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": true,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": true,
"gemini-1.5-pro-002__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__gemma-2b-it": true,
"gemma-2b-it__claude-3-opus-20240229": true,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": true,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true
},
"adjacent_overlap_fraction": 1.0,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__c4ai-command-r-08-2024": 1.3010128535183307,
"c4ai-command-r-08-2024__gemini-1.5-pro-001": 1.5062736644656383,
"gemini-1.5-pro-001__claude-3-5-sonnet-20240620": 1.4979259166107335,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 1.4037663983190969,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 1.4037518396915303,
"gemini-1.5-pro-002__Mixtral-8x7B-Instruct-v0.1": 1.4037518396915303,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 1.609085608494155,
"Mixtral-8x22B-Instruct-v0.1__gemma-2b-it": 1.609085608494155,
"gemma-2b-it__claude-3-opus-20240229": 1.4841575423215243,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 1.4131574282722532,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 1.4131574282722532,
"Mistral-Large-Instruct-2411__claude-3-haiku-20240307": 1.4710405059760863,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.4710405059760863,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 1.5352756732913773,
"databricks/dbrx-instruct__Llama-2-13b-chat-hf": 1.5855168731947722,
"Llama-2-13b-chat-hf__gemma-7b-it": 1.445517592485067
},
"ci99_overlap_magnitude_sum": 23.55351727907459,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.02475642366827687,
"emd": {
"average": 0.3626417701386816,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 0.43411730353449873,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.45168573745064866,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.33206651861252134,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 0.37635716300056393,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 0.3572906796390299,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 0.5965551560443687,
"claude-3-5-sonnet-20240620__gemma-2b-it": 0.37155804662530545,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 0.40351498166948685,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 0.30738069890956954,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.3040958826847151,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.3958423928369994,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 0.4350274252679078,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 0.4623902049257377,
"claude-3-5-sonnet-20240620__gemma-7b-it": 0.5805349572288026,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 0.3571261632825714,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 0.5379487920661778,
"claude-3-haiku-20240307__claude-3-opus-20240229": 0.18713541079150242,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 0.41721810960706873,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.2215283535438995,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 0.38719217663094585,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 0.2613591488061671,
"claude-3-haiku-20240307__gemma-2b-it": 0.36322608808046547,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.30254782383906736,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.4661296296296289,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 0.2523957863320174,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 0.2741526132731724,
"claude-3-haiku-20240307__DeepSeek-R1": 0.6320148876668548,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 0.2544789081594283,
"claude-3-haiku-20240307__gemma-7b-it": 0.24014626809550713,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.22304949238578686,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 0.22961523547659346,
"claude-3-opus-20240229__gemini-1.5-pro-001": 0.34517037977063386,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.2051155174844896,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 0.3124459132355712,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 0.24474904822335078,
"claude-3-opus-20240229__gemma-2b-it": 0.3421682999623988,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 0.24187770257567245,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 0.3871767484489565,
"claude-3-opus-20240229__gemini-1.5-pro-002": 0.2314734795074266,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 0.20405901015228467,
"claude-3-opus-20240229__DeepSeek-R1": 0.5690509141755975,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 0.2688759752773076,
"claude-3-opus-20240229__gemma-7b-it": 0.2995797847339726,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.1971247179921039,
"claude-3-opus-20240229__databricks/dbrx-instruct": 0.23472601757849254,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 0.4093511820830978,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 0.3119305673058851,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 0.5152823721564203,
"gemini-1.5-pro-001__gemma-2b-it": 0.3427854742432789,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 0.25570979037413066,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 0.17019663000564017,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.3274785321489002,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.2419865341229554,
"gemini-1.5-pro-001__DeepSeek-R1": 0.2477828186689228,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 0.4426180555555557,
"gemini-1.5-pro-001__gemma-7b-it": 0.6386968650122207,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 0.402653177288964,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 0.5016991563263771,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 0.44780955066741907,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 0.3695171319796961,
"Llama-3-70b-chat-hf__gemma-2b-it": 0.4758672447828536,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.4008936242714798,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 0.48939505781161813,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 0.19080261797330317,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 0.23626284310960668,
"Llama-3-70b-chat-hf__DeepSeek-R1": 0.5806022748636959,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 0.3655258507238205,
"Llama-3-70b-chat-hf__gemma-7b-it": 0.34593345788682134,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.26219370417371735,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 0.3397626198533561,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.3263120887384847,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 0.2229992244782851,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.19188373049445406,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.31627830654258293,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 0.39125434762173394,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 0.34766879347621776,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 0.5307391896973122,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.24908159428463977,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 0.42129548552359475,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.31212461223914334,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.30587608103026903,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.33369258789246026,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.27183318057905614,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.5669105682459105,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 0.3925975277307772,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 0.3550626880052646,
"Llama-2-13b-chat-hf__DeepSeek-R1": 0.7462999623989475,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.20679020962586914,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.24752849454784764,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 0.2797067705395754,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.16468520868584305,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 0.17030453327693146,
"gemma-2b-it__c4ai-command-r-08-2024": 0.3237467921601801,
"gemma-2b-it__gemini-1.5-pro-002": 0.44006801090430514,
"gemma-2b-it__gpt-4o-2024-11-20": 0.41914891191953363,
"gemma-2b-it__DeepSeek-R1": 0.5634695196465503,
"gemma-2b-it__gpt-3.5-turbo-0125": 0.2440671883812745,
"gemma-2b-it__gemma-7b-it": 0.4144378760105282,
"gemma-2b-it__Mistral-Large-Instruct-2411": 0.31339042818198876,
"gemma-2b-it__databricks/dbrx-instruct": 0.30816483361534064,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.29979631979695387,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 0.33942772372626484,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 0.30401788400075214,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 0.493310713949991,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.2307925714420005,
"Mixtral-8x22B-Instruct-v0.1__gemma-7b-it": 0.4108333098326757,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.26938381274675716,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.2647432905621358,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 0.42288140392931006,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 0.323571583004324,
"c4ai-command-r-08-2024__DeepSeek-R1": 0.31563939415303666,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.4858706171272793,
"c4ai-command-r-08-2024__gemma-7b-it": 0.6829646550103402,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.43582705865764193,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5533273524158673,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.1858397842639599,
"gemini-1.5-pro-002__DeepSeek-R1": 0.4436904258319233,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 0.3477288729084415,
"gemini-1.5-pro-002__gemma-7b-it": 0.4515773289152101,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 0.3014509423763868,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 0.36422264523406717,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.427362274393683,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 0.35389773688663334,
"gpt-4o-2024-11-20__gemma-7b-it": 0.4731998965971054,
"gpt-4o-2024-11-20__Mistral-Large-Instruct-2411": 0.31074017672494864,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 0.4046878055085549,
"DeepSeek-R1__gpt-3.5-turbo-0125": 0.6672600112803163,
"DeepSeek-R1__gemma-7b-it": 0.8580621709907883,
"DeepSeek-R1__Mistral-Large-Instruct-2411": 0.6163941177852981,
"DeepSeek-R1__databricks/dbrx-instruct": 0.7312167465689042,
"gpt-3.5-turbo-0125__gemma-7b-it": 0.2308503830607257,
"gpt-3.5-turbo-0125__Mistral-Large-Instruct-2411": 0.24521551278435805,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.20100625117503285,
"gemma-7b-it__Mistral-Large-Instruct-2411": 0.2815259212257948,
"gemma-7b-it__databricks/dbrx-instruct": 0.2066912601052831,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 0.19484135880804715
}
},
"average_ci95": 0.38982976637956435,
"modulated_ci95": 0.012413331631055858
}
},
"calibrated_score_range": 0.8580621709907881,
"final_judgemark_score": 0.0750377754134345,
"iteration_stability": {
"raw": {
"scoring_stability": {
"claude-3-5-sonnet-20240620": {
"mean_iter_score": 8.532416666666666,
"iteration_count": 5,
"stdev_across_iters": 0.0897114974670342
},
"claude-3-haiku-20240307": {
"mean_iter_score": 8.39525,
"iteration_count": 5,
"stdev_across_iters": 0.0628079082069554
},
"claude-3-opus-20240229": {
"mean_iter_score": 8.4215,
"iteration_count": 5,
"stdev_across_iters": 0.07418754387811079
},
"gemini-1.5-pro-001": {
"mean_iter_score": 8.507833333333334,
"iteration_count": 5,
"stdev_across_iters": 0.07634961616726652
},
"Llama-3-70b-chat-hf": {
"mean_iter_score": 8.4525,
"iteration_count": 5,
"stdev_across_iters": 0.06971370023173344
},
"Mixtral-8x7B-Instruct-v0.1": {
"mean_iter_score": 8.420916666666667,
"iteration_count": 5,
"stdev_across_iters": 0.09334315424520662
},
"Llama-2-13b-chat-hf": {
"mean_iter_score": 8.3295,
"iteration_count": 5,
"stdev_across_iters": 0.11244609819820338
},
"gemma-2b-it": {
"mean_iter_score": 8.418416666666667,
"iteration_count": 5,
"stdev_across_iters": 0.11442167384042447
},
"Mixtral-8x22B-Instruct-v0.1": {
"mean_iter_score": 8.418083333333334,
"iteration_count": 5,
"stdev_across_iters": 0.21919907871866418
},
"c4ai-command-r-08-2024": {
"mean_iter_score": 8.512333333333334,
"iteration_count": 5,
"stdev_across_iters": 0.11698931575148222
},
"gemini-1.5-pro-002": {
"mean_iter_score": 8.502916666666666,
"iteration_count": 5,
"stdev_across_iters": 0.03367429781500011
},
"gpt-4o-2024-11-20": {
"mean_iter_score": 8.480083333333333,
"iteration_count": 5,
"stdev_across_iters": 0.04818900865919065
},
"DeepSeek-R1": {
"mean_iter_score": 8.59725,
"iteration_count": 5,
"stdev_across_iters": 0.047086356835075165
},
"gpt-3.5-turbo-0125": {
"mean_iter_score": 8.3685,
"iteration_count": 5,
"stdev_across_iters": 0.1200348907609784
},
"gemma-7b-it": {
"mean_iter_score": 8.328416666666666,
"iteration_count": 5,
"stdev_across_iters": 0.07448219399441885
},
"Mistral-Large-Instruct-2411": {
"mean_iter_score": 8.410666666666668,
"iteration_count": 5,
"stdev_across_iters": 0.09290437138381694
},
"databricks/dbrx-instruct": {
"mean_iter_score": 8.344833333333334,
"iteration_count": 5,
"stdev_across_iters": 0.07452562944681813
}
},
"ranking_stability": {
"pairwise_correlation": {
"1__vs__2": {
"common_model_count": 17,
"kendall_tau": 0.2352941176470588,
"p_value": 0.20489900152579948
},
"1__vs__3": {
"common_model_count": 17,
"kendall_tau": -0.1176470588235294,
"p_value": 0.5423278567488096
},
"1__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.08823529411764705,
"p_value": 0.6553075802476113
},
"1__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.39705882352941174,
"p_value": 0.027329794647271987
},
"2__vs__3": {
"common_model_count": 17,
"kendall_tau": 0.0588235294117647,
"p_value": 0.7764940676883935
},
"2__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.2352941176470588,
"p_value": 0.20489900152579948
},
"2__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.16176470588235295,
"p_value": 0.3927276123262421
},
"3__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.49999999999999994,
"p_value": 0.004507537046975066
},
"3__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.16176470588235295,
"p_value": 0.3927276123262421
},
"4__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.13235294117647056,
"p_value": 0.4896069682169023
}
},
"average_kendall_tau": 0.1852941176470588
},
"randomized_average_kendall_tau_by_item": 0.11921470588235293
},
"calibrated": {
"scoring_stability": {
"claude-3-5-sonnet-20240620": {
"mean_iter_score": 5.137510963056967,
"iteration_count": 5,
"stdev_across_iters": 0.3049107761812058
},
"claude-3-haiku-20240307": {
"mean_iter_score": 4.783023289152099,
"iteration_count": 5,
"stdev_across_iters": 0.18941032492266185
},
"claude-3-opus-20240229": {
"mean_iter_score": 4.850987262643356,
"iteration_count": 5,
"stdev_across_iters": 0.27070744499420324
},
"gemini-1.5-pro-001": {
"mean_iter_score": 5.185520586576426,
"iteration_count": 5,
"stdev_across_iters": 0.23528886333424898
},
"Llama-3-70b-chat-hf": {
"mean_iter_score": 4.846935901955258,
"iteration_count": 5,
"stdev_across_iters": 0.2232487225893618
},
"Mixtral-8x7B-Instruct-v0.1": {
"mean_iter_score": 4.9412156537883085,
"iteration_count": 5,
"stdev_across_iters": 0.3710625536065223
},
"Llama-2-13b-chat-hf": {
"mean_iter_score": 4.6702382144200065,
"iteration_count": 5,
"stdev_across_iters": 0.3172256952979703
},
"gemma-2b-it": {
"mean_iter_score": 4.934818657172403,
"iteration_count": 5,
"stdev_across_iters": 0.3519141055787211
},
"Mixtral-8x22B-Instruct-v0.1": {
"mean_iter_score": 4.9373524628689625,
"iteration_count": 5,
"stdev_across_iters": 0.6471020486835078
},
"c4ai-command-r-08-2024": {
"mean_iter_score": 5.237148782665916,
"iteration_count": 5,
"stdev_across_iters": 0.33738781776303084
},
"gemini-1.5-pro-002": {
"mean_iter_score": 5.008553334743375,
"iteration_count": 5,
"stdev_across_iters": 0.25561531303926527
},
"gpt-4o-2024-11-20": {
"mean_iter_score": 5.021675902425271,
"iteration_count": 5,
"stdev_across_iters": 0.16587804337958673
},
"DeepSeek-R1": {
"mean_iter_score": 5.415038176818953,
"iteration_count": 5,
"stdev_across_iters": 0.2628639919083219
},
"gpt-3.5-turbo-0125": {
"mean_iter_score": 4.751278165538637,
"iteration_count": 5,
"stdev_across_iters": 0.3623034806308045
},
"gemma-7b-it": {
"mean_iter_score": 4.556976005828165,
"iteration_count": 5,
"stdev_across_iters": 0.23300509420449106
},
"Mistral-Large-Instruct-2411": {
"mean_iter_score": 4.814644059033656,
"iteration_count": 5,
"stdev_across_iters": 0.3005825383708583
},
"databricks/dbrx-instruct": {
"mean_iter_score": 4.683821430250049,
"iteration_count": 5,
"stdev_across_iters": 0.3011715649562168
}
},
"ranking_stability": {
"pairwise_correlation": {
"1__vs__2": {
"common_model_count": 17,
"kendall_tau": 0.1764705882352941,
"p_value": 0.34884640290128766
},
"1__vs__3": {
"common_model_count": 17,
"kendall_tau": -0.19117647058823525,
"p_value": 0.30811454449876985
},
"1__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.0,
"p_value": 1.0
},
"1__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.4411764705882352,
"p_value": 0.013367514323499561
},
"2__vs__3": {
"common_model_count": 17,
"kendall_tau": -0.044117647058823525,
"p_value": 0.8393415533036079
},
"2__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.08823529411764705,
"p_value": 0.6553075802476113
},
"2__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.20588235294117643,
"p_value": 0.27056053596407176
},
"3__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.338235294117647,
"p_value": 0.06302241248726353
},
"3__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.07352941176470587,
"p_value": 0.7150317752938318
},
"4__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.1764705882352941,
"p_value": 0.34884640290128766
}
},
"average_kendall_tau": 0.1264705882352941
},
"randomized_average_kendall_tau_by_item": 0.057597058823529404
}
},
"raw_score_range": 0.26883333333333326,
"final_judgemark_score_raw": 0.15058068998029084,
"final_judgemark_score_elements_raw": {
"norm_stability_between_iterations": 0.0,
"norm_correlation_with_lmsys_arena": 0.288173681842026,
"norm_std_dev_between_models": 0.03345328525020907,
"norm_kruskall_wallis": 0.012511212109797415,
"norm_ci99_adjacent_overlap": 0.7416739635937193,
"norm_score_range": 0.03360416666666666,
"norm_intra_model_ci95": 0.07196631924310755,
"norm_earth_movers_distance": 0.029756740196078435
},
"final_judgemark_score_elements_calibrated": {
"norm_stability_between_iterations": 0.0,
"norm_correlation_with_lmsys_arena": 0.17320261437908493,
"norm_std_dev_between_models": 0.09859781023654902,
"norm_kruskall_wallis": 0.012511212109797415,
"norm_ci99_adjacent_overlap": 0.09409548926636191,
"norm_score_range": 0.10725777137384851,
"norm_intra_model_ci95": 0.012413331631055858,
"norm_earth_movers_distance": {
"pearson_r": 0.0,
"kendall_tau": 0.17320261437908493,
"anova_f": 0.003599021478014157,
"kw_stat": 0.012511212109797415,
"std_dev": 0.09859781023654902,
"ci99_overlap_magnitude_sum_norm": 0.09409548926636191,
"calibrated_score_range_norm": 0.10725777137384851,
"kendall_tau_bootstrapped": 0.0
}
}
}