Files
Judgemark-v2lp/results/stats/deepseek-ai__deepseek-r1.json
T
2025-01-31 18:03:33 +11:00

22630 lines
1.6 MiB
Plaintext

{
"judge_model": "deepseek/deepseek-r1",
"start_time": "2025-01-29T15:56:08.738485",
"status": "completed",
"samples_file": "data/judgemark_v2.1_samples.json",
"prompts_file": "data/judge_prompts.json",
"end_time": "2025-01-31T15:23:15.576135",
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"calibration_config": {
"method": "piecewise_landmark",
"in_landmarks": [
2.26,
4.96,
5.875,
7.07,
9.43
],
"out_landmarks": [
0,
3,
5,
7,
10
]
},
"calibrated_score_distribution": {
"count": 2040,
"min": 0.0,
"max": 10.0,
"mean": 5.016,
"median": 4.996,
"stdev": 2.204,
"p10": 2.178,
"p25": 3.0,
"p75": 7.0,
"p90": 8.004
},
"raw_model_stats": {
"claude-3-5-sonnet-20240620": {
"count": 120,
"mean": 7.181666666666667,
"median": 7.29,
"stdev": 0.7121364864617877,
"ci95": 0.12741738443481387,
"min": 4.86,
"max": 9.21,
"length_correlation": -0.12476126162522726
},
"claude-3-haiku-20240307": {
"count": 120,
"mean": 5.9225,
"median": 5.96,
"stdev": 0.7522719090927721,
"ci95": 0.13459852270261174,
"min": 4.04,
"max": 7.39,
"length_correlation": -0.08517519820317922
},
"claude-3-opus-20240229": {
"count": 120,
"mean": 6.5328333333333335,
"median": 6.51,
"stdev": 0.9703358019919965,
"ci95": 0.17361510365458838,
"min": 3.96,
"max": 9.11,
"length_correlation": 0.024977861772695002
},
"gemini-1.5-pro-001": {
"count": 120,
"mean": 7.131083333333334,
"median": 7.18,
"stdev": 0.6812196129475616,
"ci95": 0.12188565388459714,
"min": 5.57,
"max": 9.04,
"length_correlation": -0.21650898706232274
},
"Llama-3-70b-chat-hf": {
"count": 120,
"mean": 6.006833333333334,
"median": 5.91,
"stdev": 0.7716913469866081,
"ci95": 0.13807310100419642,
"min": 4.07,
"max": 8.25,
"length_correlation": -0.28823478256326757
},
"Mixtral-8x7B-Instruct-v0.1": {
"count": 120,
"mean": 5.3620833333333335,
"median": 5.305,
"stdev": 0.795310727113035,
"ci95": 0.1422991443187773,
"min": 3.56,
"max": 7.75,
"length_correlation": -0.35908460420870325
},
"Llama-2-13b-chat-hf": {
"count": 120,
"mean": 4.867166666666667,
"median": 4.91,
"stdev": 0.7396696779596938,
"ci95": 0.13234369745556132,
"min": 3.0,
"max": 6.5,
"length_correlation": 0.21475595131056402
},
"gemma-7b-it": {
"count": 120,
"mean": 4.516166666666667,
"median": 4.645,
"stdev": 0.8357567730826364,
"ci95": 0.14953586015366155,
"min": 2.46,
"max": 6.18,
"length_correlation": -0.03934594071556713
},
"gemma-2b-it": {
"count": 120,
"mean": 4.2244166666666665,
"median": 4.21,
"stdev": 0.7734782249202432,
"ci95": 0.13839281403243808,
"min": 2.26,
"max": 6.36,
"length_correlation": 0.0025304787380251363
},
"Mixtral-8x22B-Instruct-v0.1": {
"count": 120,
"mean": 5.6025,
"median": 5.695,
"stdev": 0.8694712485763059,
"ci95": 0.15556814520949194,
"min": 3.36,
"max": 7.29,
"length_correlation": -0.05954676591781328
},
"c4ai-command-r-08-2024": {
"count": 120,
"mean": 5.40125,
"median": 5.32,
"stdev": 0.708126374445179,
"ci95": 0.12669988435700497,
"min": 2.71,
"max": 7.21,
"length_correlation": 0.06292781180795073
},
"gemini-1.5-pro-002": {
"count": 120,
"mean": 7.180833333333333,
"median": 7.275,
"stdev": 0.6776548579052445,
"ci95": 0.12124783828003574,
"min": 4.64,
"max": 8.39,
"length_correlation": -0.258803259514821
},
"Mistral-Large-Instruct-2411": {
"count": 120,
"mean": 5.8375,
"median": 5.75,
"stdev": 1.1288994045291274,
"ci95": 0.20198573187815255,
"min": 3.07,
"max": 8.82,
"length_correlation": -0.1768188568922991
},
"gpt-4o-2024-11-20": {
"count": 120,
"mean": 7.623833333333334,
"median": 7.66,
"stdev": 0.6280598153461667,
"ci95": 0.11237415925369076,
"min": 5.93,
"max": 9.11,
"length_correlation": 0.09866373846682355
},
"DeepSeek-R1": {
"count": 120,
"mean": 8.144583333333333,
"median": 8.14,
"stdev": 0.6445634257417353,
"ci95": 0.11532702982037446,
"min": 5.93,
"max": 9.43,
"length_correlation": 0.223446800463047
},
"gpt-3.5-turbo-0125": {
"count": 120,
"mean": 4.980166666666666,
"median": 5.04,
"stdev": 0.6795438204572011,
"ci95": 0.1215858165640322,
"min": 3.11,
"max": 7.61,
"length_correlation": -0.17926892838153338
},
"databricks/dbrx-instruct": {
"count": 120,
"mean": 5.103833333333333,
"median": 5.07,
"stdev": 0.9138579991317201,
"ci95": 0.16350994255712,
"min": 2.64,
"max": 8.0,
"length_correlation": -0.4113509255472687
}
},
"calibrated_model_stats": {
"claude-3-5-sonnet-20240620": {
"count": 120,
"mean": 7.038882061732457,
"median": 7.279661016949152,
"stdev": 1.0944594980631983,
"ci95": 0.19582365075256514,
"min": 2.8888888888888893,
"max": 9.720338983050848,
"length_correlation": -0.11799301330274381
},
"claude-3-haiku-20240307": {
"count": 120,
"mean": 4.970570479602721,
"median": 5.142259414225942,
"stdev": 1.3644509554360547,
"ci95": 0.2441312518545891,
"min": 1.9777777777777779,
"max": 7.406779661016948,
"length_correlation": -0.08214935451522755
},
"claude-3-opus-20240229": {
"count": 120,
"mean": 5.97477187604026,
"median": 6.06276150627615,
"stdev": 1.5843047749261256,
"ci95": 0.28346809130879347,
"min": 1.8888888888888888,
"max": 9.59322033898305,
"length_correlation": 0.03591482605222406
},
"gemini-1.5-pro-001": {
"count": 120,
"mean": 6.976388965586416,
"median": 7.139830508474575,
"stdev": 1.018702753773122,
"ci95": 0.18226904936049548,
"min": 4.333333333333334,
"max": 9.504237288135592,
"length_correlation": -0.2040023634235643
},
"Llama-3-70b-chat-hf": {
"count": 120,
"mean": 5.097079540223456,
"median": 5.05857740585774,
"stdev": 1.3579300333045548,
"ci95": 0.24296451084651774,
"min": 2.0111111111111115,
"max": 8.5,
"length_correlation": -0.26130862472962985
},
"Mixtral-8x7B-Instruct-v0.1": {
"count": 120,
"mean": 3.9545145951251075,
"median": 3.754098360655738,
"stdev": 1.3915146338781532,
"ci95": 0.24897355833071158,
"min": 1.4444444444444446,
"max": 7.864406779661017,
"length_correlation": -0.3292350373920184
},
"Llama-2-13b-chat-hf": {
"count": 120,
"mean": 3.16056157004113,
"median": 2.944444444444444,
"stdev": 1.1470044146364253,
"ci95": 0.20522512920843042,
"min": 0.8222222222222224,
"max": 6.04602510460251,
"length_correlation": 0.21187387552721762
},
"gemma-7b-it": {
"count": 120,
"mean": 2.664260472318124,
"median": 2.6500000000000004,
"stdev": 1.1663089835688045,
"ci95": 0.20867915484504174,
"min": 0.22222222222222243,
"max": 5.510460251046025,
"length_correlation": -0.024225342784098276
},
"gemma-2b-it": {
"count": 120,
"mean": 2.2565586790233545,
"median": 2.1666666666666665,
"stdev": 1.0168478224867272,
"ci95": 0.18193716004248986,
"min": 0.0,
"max": 5.811715481171548,
"length_correlation": 0.005129635119545636
},
"Mixtral-8x22B-Instruct-v0.1": {
"count": 120,
"mean": 4.433442385376863,
"median": 4.60655737704918,
"stdev": 1.5076105334040426,
"ci95": 0.26974575037874465,
"min": 1.2222222222222223,
"max": 7.279661016949152,
"length_correlation": -0.07285885938620999
},
"c4ai-command-r-08-2024": {
"count": 120,
"mean": 4.021473968783708,
"median": 3.78688524590164,
"stdev": 1.2656064204512008,
"ci95": 0.2264457205654668,
"min": 0.5000000000000001,
"max": 7.177966101694915,
"length_correlation": 0.0434394866959032
},
"gemini-1.5-pro-002": {
"count": 120,
"mean": 7.043768309502954,
"median": 7.260593220338983,
"stdev": 1.0372587099686337,
"ci95": 0.18558913118338627,
"min": 2.6444444444444444,
"max": 8.677966101694917,
"length_correlation": -0.22852472300266763
},
"Mistral-Large-Instruct-2411": {
"count": 120,
"mean": 4.809740521821776,
"median": 4.726775956284153,
"stdev": 1.882541347610382,
"ci95": 0.3368293847639776,
"min": 0.8999999999999999,
"max": 9.224576271186441,
"length_correlation": -0.1471370410607534
},
"gpt-4o-2024-11-20": {
"count": 120,
"mean": 7.675283520081318,
"median": 7.75,
"stdev": 0.8566572993642598,
"ci95": 0.1532754387916653,
"min": 5.092050209205021,
"max": 9.59322033898305,
"length_correlation": 0.07998702965293368
},
"DeepSeek-R1": {
"count": 120,
"mean": 8.35489474741744,
"median": 8.360169491525424,
"stdev": 0.8538147495851818,
"ci95": 0.15276684210428673,
"min": 5.092050209205021,
"max": 10.0,
"length_correlation": 0.21793652891136667
},
"gpt-3.5-turbo-0125": {
"count": 120,
"mean": 3.2934786570102488,
"median": 3.1748633879781423,
"stdev": 1.085699226625789,
"ci95": 0.19425623931569364,
"min": 0.9444444444444444,
"max": 7.686440677966102,
"length_correlation": -0.18820971899924424
},
"databricks/dbrx-instruct": {
"count": 120,
"mean": 3.5512556224401055,
"median": 3.240437158469946,
"stdev": 1.4656367020929375,
"ci95": 0.2622356790623018,
"min": 0.4222222222222226,
"max": 8.182203389830509,
"length_correlation": -0.3705867166259409
}
},
"raw_cross_model_stats": {
"anova_f": 251.66485889219211,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.1119235407166477,
"pearson_r": 0.962056318930382,
"kendall_tau": 0.8970588235294118,
"normalized_components": {
"pearson_r": 0.8735210631012734,
"kendall_tau": 0.8856209150326798,
"anova_f": 0.7190424539776917,
"kw_stat": 0.9047547420925153,
"std_dev": 0.5054197912348398,
"ci99_overlap_magnitude_sum_norm": 0.8085561238366368,
"raw_score_range_norm": 0.49002083333333335,
"kendall_tau_bootstrapped": 0.8641225490196077
}
},
"calibrated_cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9580374221913868,
"kendall_tau": 0.8882352941176469,
"normalized_components": {
"pearson_r": 0.860124740637956,
"kendall_tau": 0.8758169934640521,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079,
"ci99_overlap_magnitude_sum_norm": 0.6828356351013829,
"calibrated_score_range_norm": 0.7622920085492606,
"kendall_tau_bootstrapped": 0.8631715686274509
}
},
"separability_metrics": {
"raw": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__claude-3-5-sonnet-20240620": false,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": true,
"gemini-1.5-pro-002__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__claude-3-5-sonnet-20240620": 0.030533908570126833,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.4780312267862339,
"gemini-1.5-pro-002__gemini-1.5-pro-001": 0.42953855137130414,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.08843031464079942,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.4531836757918626,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5306675800894585,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.46984522005092444,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.3551843841931559,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.4911105634526738,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.3445908218876017,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.4383420149622479,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.38757072697520467,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.20466859022543638,
"gemma-7b-it__gemma-2b-it": 0.27584320125041195
},
"ci99_overlap_magnitude_sum": 4.977540780247442,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31534751220512697,
"emd": {
"average": 1.350427696078431,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 1.2591666666666668,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.6633333333333333,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.10225000000000005,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.1748333333333332,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 1.8195833333333333,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 2.3145000000000002,
"claude-3-5-sonnet-20240620__gemma-7b-it": 2.6654999999999998,
"claude-3-5-sonnet-20240620__gemma-2b-it": 2.95725,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 1.5791666666666666,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 1.7804166666666665,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.08166666666666669,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 1.3441666666666667,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.4438333333333334,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 0.9629166666666665,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 2.2015000000000002,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 2.0778333333333334,
"claude-3-haiku-20240307__claude-3-opus-20240229": 0.6116666666666666,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 1.2085833333333333,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.12133333333333338,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 0.5664166666666666,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.0553333333333332,
"claude-3-haiku-20240307__gemma-7b-it": 1.406333333333333,
"claude-3-haiku-20240307__gemma-2b-it": 1.6980833333333332,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.31999999999999995,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.52125,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 1.2583333333333333,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.3325000000000001,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 1.7013333333333334,
"claude-3-haiku-20240307__DeepSeek-R1": 2.2220833333333334,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 0.9460000000000001,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 0.8371666666666666,
"claude-3-opus-20240229__gemini-1.5-pro-001": 0.6310833333333333,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.5289999999999999,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 1.17075,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 1.6656666666666669,
"claude-3-opus-20240229__gemma-7b-it": 2.0166666666666666,
"claude-3-opus-20240229__gemma-2b-it": 2.308416666666667,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 0.9303333333333332,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.1315833333333334,
"claude-3-opus-20240229__gemini-1.5-pro-002": 0.6955,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.6953333333333334,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.0910000000000002,
"claude-3-opus-20240229__DeepSeek-R1": 1.6117499999999998,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 1.5526666666666669,
"claude-3-opus-20240229__databricks/dbrx-instruct": 1.4289999999999998,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.1247500000000001,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 1.769,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 2.263916666666667,
"gemini-1.5-pro-001__gemma-7b-it": 2.614916666666667,
"gemini-1.5-pro-001__gemma-2b-it": 2.9066666666666667,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 1.5285833333333332,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 1.7298333333333331,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.10824999999999996,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 1.2935833333333333,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.49275,
"gemini-1.5-pro-001__DeepSeek-R1": 1.0135,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 2.1509166666666664,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 2.02725,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 0.64475,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.1396666666666668,
"Llama-3-70b-chat-hf__gemma-7b-it": 1.4906666666666668,
"Llama-3-70b-chat-hf__gemma-2b-it": 1.7824166666666668,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.4043333333333333,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 0.6055833333333334,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.174,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.3283333333333333,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 1.617,
"Llama-3-70b-chat-hf__DeepSeek-R1": 2.1377500000000005,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.0266666666666666,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 0.903,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.49491666666666667,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 0.8459166666666667,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.1376666666666666,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.26641666666666663,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.1195,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 1.81875,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.5075833333333333,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 2.26175,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 2.7824999999999998,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.3825833333333334,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.27125,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.35099999999999987,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.6427500000000002,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.7353333333333334,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.5389166666666667,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 2.3136666666666668,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 0.9703333333333335,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 2.756666666666667,
"Llama-2-13b-chat-hf__DeepSeek-R1": 3.2774166666666664,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.1461666666666666,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.25950000000000006,
"gemma-7b-it__gemma-2b-it": 0.3039166666666667,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.0863333333333334,
"gemma-7b-it__c4ai-command-r-08-2024": 0.8850833333333333,
"gemma-7b-it__gemini-1.5-pro-002": 2.6646666666666667,
"gemma-7b-it__Mistral-Large-Instruct-2411": 1.3213333333333335,
"gemma-7b-it__gpt-4o-2024-11-20": 3.107666666666667,
"gemma-7b-it__DeepSeek-R1": 3.6284166666666664,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.46399999999999997,
"gemma-7b-it__databricks/dbrx-instruct": 0.5876666666666666,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 1.3780833333333333,
"gemma-2b-it__c4ai-command-r-08-2024": 1.1768333333333336,
"gemma-2b-it__gemini-1.5-pro-002": 2.9564166666666667,
"gemma-2b-it__Mistral-Large-Instruct-2411": 1.613083333333333,
"gemma-2b-it__gpt-4o-2024-11-20": 3.399416666666667,
"gemma-2b-it__DeepSeek-R1": 3.920166666666666,
"gemma-2b-it__gpt-3.5-turbo-0125": 0.75575,
"gemma-2b-it__databricks/dbrx-instruct": 0.8794166666666667,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.27641666666666664,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 1.5783333333333334,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.26766666666666666,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 2.021333333333333,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 2.5420833333333337,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6293333333333333,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.5205000000000001,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 1.7795833333333335,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.5175833333333334,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 2.222583333333333,
"c4ai-command-r-08-2024__DeepSeek-R1": 2.743333333333333,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.4344166666666666,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.3330833333333334,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 1.3504999999999998,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.443,
"gemini-1.5-pro-002__DeepSeek-R1": 0.9637499999999999,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 2.2006666666666668,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 2.077,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 1.7863333333333338,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 2.307083333333333,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 0.8615000000000002,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 0.7336666666666667,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.5207499999999998,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 2.6436666666666664,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 2.5199999999999996,
"DeepSeek-R1__gpt-3.5-turbo-0125": 3.1644166666666664,
"DeepSeek-R1__databricks/dbrx-instruct": 3.04075,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.19883333333333336
}
},
"average_ci95": 0.13979175468006755,
"modulated_ci95": 0.7883687805128173
},
"calibrated": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
"calibrated_score_range": 6.098336068394085,
"final_judgemark_score": 0.7697031560468162,
"iteration_stability": {
"raw": {
"scoring_stability": {
"claude-3-5-sonnet-20240620": {
"mean_iter_score": 7.181666666666667,
"iteration_count": 5,
"stdev_across_iters": 0.11439423790850059
},
"claude-3-haiku-20240307": {
"mean_iter_score": 5.9225,
"iteration_count": 5,
"stdev_across_iters": 0.1169000641573821
},
"claude-3-opus-20240229": {
"mean_iter_score": 6.5328333333333335,
"iteration_count": 5,
"stdev_across_iters": 0.18868595307782954
},
"gemini-1.5-pro-001": {
"mean_iter_score": 7.131083333333333,
"iteration_count": 5,
"stdev_across_iters": 0.08383871752093754
},
"Llama-3-70b-chat-hf": {
"mean_iter_score": 6.006833333333334,
"iteration_count": 5,
"stdev_across_iters": 0.07510760798865698
},
"Mixtral-8x7B-Instruct-v0.1": {
"mean_iter_score": 5.3620833333333335,
"iteration_count": 5,
"stdev_across_iters": 0.1018410389883283
},
"Llama-2-13b-chat-hf": {
"mean_iter_score": 4.867166666666667,
"iteration_count": 5,
"stdev_across_iters": 0.085162181878004
},
"gemma-7b-it": {
"mean_iter_score": 4.516166666666667,
"iteration_count": 5,
"stdev_across_iters": 0.1272228469174376
},
"gemma-2b-it": {
"mean_iter_score": 4.2244166666666665,
"iteration_count": 5,
"stdev_across_iters": 0.05735404180274562
},
"Mixtral-8x22B-Instruct-v0.1": {
"mean_iter_score": 5.6025,
"iteration_count": 5,
"stdev_across_iters": 0.14790340316120762
},
"c4ai-command-r-08-2024": {
"mean_iter_score": 5.40125,
"iteration_count": 5,
"stdev_across_iters": 0.052260698001036866
},
"gemini-1.5-pro-002": {
"mean_iter_score": 7.180833333333333,
"iteration_count": 5,
"stdev_across_iters": 0.08822516962610806
},
"Mistral-Large-Instruct-2411": {
"mean_iter_score": 5.8375,
"iteration_count": 5,
"stdev_across_iters": 0.09184981703241937
},
"gpt-4o-2024-11-20": {
"mean_iter_score": 7.623833333333334,
"iteration_count": 5,
"stdev_across_iters": 0.08676852988139051
},
"DeepSeek-R1": {
"mean_iter_score": 8.144583333333333,
"iteration_count": 5,
"stdev_across_iters": 0.20265272018899674
},
"gpt-3.5-turbo-0125": {
"mean_iter_score": 4.980166666666666,
"iteration_count": 5,
"stdev_across_iters": 0.1592102034697803
},
"databricks/dbrx-instruct": {
"mean_iter_score": 5.103833333333333,
"iteration_count": 5,
"stdev_across_iters": 0.15379739558551966
}
},
"ranking_stability": {
"pairwise_correlation": {
"1__vs__2": {
"common_model_count": 17,
"kendall_tau": 0.9558823529411764,
"p_value": 5.347391697765181e-12
},
"1__vs__3": {
"common_model_count": 17,
"kendall_tau": 0.9705882352941175,
"p_value": 8.546830053210383e-13
},
"1__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.8970588235294118,
"p_value": 1.2313901628307946e-09
},
"1__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.926470588235294,
"p_value": 1.080161877119549e-10
},
"2__vs__3": {
"common_model_count": 17,
"kendall_tau": 0.9558823529411764,
"p_value": 5.347391697765181e-12
},
"2__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.9411764705882352,
"p_value": 2.628150241362193e-11
},
"2__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.9411764705882352,
"p_value": 2.628150241362193e-11
},
"3__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.926470588235294,
"p_value": 1.080161877119549e-10
},
"3__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.926470588235294,
"p_value": 1.080161877119549e-10
},
"4__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.8823529411764705,
"p_value": 3.5743855407137387e-09
}
},
"average_kendall_tau": 0.9323529411764705
},
"randomized_average_kendall_tau_by_item": 0.9184735294117646
},
"calibrated": {
"scoring_stability": {
"claude-3-5-sonnet-20240620": {
"mean_iter_score": 7.038882061732457,
"iteration_count": 5,
"stdev_across_iters": 0.15783950115828302
},
"claude-3-haiku-20240307": {
"mean_iter_score": 4.97057047960272,
"iteration_count": 5,
"stdev_across_iters": 0.21362852261029117
},
"claude-3-opus-20240229": {
"mean_iter_score": 5.97477187604026,
"iteration_count": 5,
"stdev_across_iters": 0.31455229632227294
},
"gemini-1.5-pro-001": {
"mean_iter_score": 6.976388965586416,
"iteration_count": 5,
"stdev_across_iters": 0.13161711482782049
},
"Llama-3-70b-chat-hf": {
"mean_iter_score": 5.097079540223456,
"iteration_count": 5,
"stdev_across_iters": 0.1365404900361279
},
"Mixtral-8x7B-Instruct-v0.1": {
"mean_iter_score": 3.9545145951251075,
"iteration_count": 5,
"stdev_across_iters": 0.17923578285084113
},
"Llama-2-13b-chat-hf": {
"mean_iter_score": 3.16056157004113,
"iteration_count": 5,
"stdev_across_iters": 0.1468416485120612
},
"gemma-7b-it": {
"mean_iter_score": 2.664260472318124,
"iteration_count": 5,
"stdev_across_iters": 0.16993019534974582
},
"gemma-2b-it": {
"mean_iter_score": 2.2565586790233545,
"iteration_count": 5,
"stdev_across_iters": 0.11080727566776136
},
"Mixtral-8x22B-Instruct-v0.1": {
"mean_iter_score": 4.433442385376863,
"iteration_count": 5,
"stdev_across_iters": 0.25279458782241115
},
"c4ai-command-r-08-2024": {
"mean_iter_score": 4.021473968783708,
"iteration_count": 5,
"stdev_across_iters": 0.09669016809797659
},
"gemini-1.5-pro-002": {
"mean_iter_score": 7.043768309502954,
"iteration_count": 5,
"stdev_across_iters": 0.13622065843714878
},
"Mistral-Large-Instruct-2411": {
"mean_iter_score": 4.809740521821776,
"iteration_count": 5,
"stdev_across_iters": 0.16393733480034264
},
"gpt-4o-2024-11-20": {
"mean_iter_score": 7.675283520081318,
"iteration_count": 5,
"stdev_across_iters": 0.12985108049520988
},
"DeepSeek-R1": {
"mean_iter_score": 8.35489474741744,
"iteration_count": 5,
"stdev_across_iters": 0.268310536220573
},
"gpt-3.5-turbo-0125": {
"mean_iter_score": 3.2934786570102488,
"iteration_count": 5,
"stdev_across_iters": 0.2658041166397857
},
"databricks/dbrx-instruct": {
"mean_iter_score": 3.5512556224401055,
"iteration_count": 5,
"stdev_across_iters": 0.28256941305776895
}
},
"ranking_stability": {
"pairwise_correlation": {
"1__vs__2": {
"common_model_count": 17,
"kendall_tau": 0.926470588235294,
"p_value": 1.080161877119549e-10
},
"1__vs__3": {
"common_model_count": 17,
"kendall_tau": 0.9411764705882352,
"p_value": 2.628150241362193e-11
},
"1__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.8676470588235293,
"p_value": 9.575975226992579e-09
},
"1__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.926470588235294,
"p_value": 1.080161877119549e-10
},
"2__vs__3": {
"common_model_count": 17,
"kendall_tau": 0.9558823529411764,
"p_value": 5.347391697765181e-12
},
"2__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.9117647058823529,
"p_value": 3.8599058936360526e-10
},
"2__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.9117647058823529,
"p_value": 3.8599058936360526e-10
},
"3__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.926470588235294,
"p_value": 1.080161877119549e-10
},
"3__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.926470588235294,
"p_value": 1.080161877119549e-10
},
"4__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.8823529411764705,
"p_value": 3.5743855407137387e-09
}
},
"average_kendall_tau": 0.9176470588235294
},
"randomized_average_kendall_tau_by_item": 0.9179029411764705
}
},
"ephemeral_runs": [
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9581152460205015,
"kendall_tau": 0.8705882352941176,
"normalized_components": {
"pearson_r": 0.8603841534016716,
"kendall_tau": 0.8562091503267972,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350436,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9586483685892335,
"kendall_tau": 0.888235294117647,
"normalized_components": {
"pearson_r": 0.8621612286307785,
"kendall_tau": 0.8758169934640523,
"anova_f": 0.7369479101001246,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9582986178372387,
"kendall_tau": 0.8852941176470588,
"normalized_components": {
"pearson_r": 0.8609953927907956,
"kendall_tau": 0.8725490196078431,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9581454124025393,
"kendall_tau": 0.8647058823529411,
"normalized_components": {
"pearson_r": 0.8604847080084642,
"kendall_tau": 0.849673202614379,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504326,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9585942833032348,
"kendall_tau": 0.8794117647058822,
"normalized_components": {
"pearson_r": 0.8619809443441161,
"kendall_tau": 0.8660130718954246,
"anova_f": 0.7369479101001236,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350438,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.957889580302259,
"kendall_tau": 0.8735294117647058,
"normalized_components": {
"pearson_r": 0.8596319343408634,
"kendall_tau": 0.8594771241830064,
"anova_f": 0.7369479101001251,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350436,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9580092679939227,
"kendall_tau": 0.9029411764705882,
"normalized_components": {
"pearson_r": 0.8600308933130758,
"kendall_tau": 0.8921568627450981,
"anova_f": 0.7369479101001246,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350436,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9587213678567964,
"kendall_tau": 0.9029411764705881,
"normalized_components": {
"pearson_r": 0.8624045595226547,
"kendall_tau": 0.892156862745098,
"anova_f": 0.7369479101001246,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9573541486270019,
"kendall_tau": 0.8852941176470588,
"normalized_components": {
"pearson_r": 0.8578471620900062,
"kendall_tau": 0.8725490196078431,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9559481957127732,
"kendall_tau": 0.8823529411764706,
"normalized_components": {
"pearson_r": 0.8531606523759108,
"kendall_tau": 0.869281045751634,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.958703626076749,
"kendall_tau": 0.8823529411764706,
"normalized_components": {
"pearson_r": 0.8623454202558299,
"kendall_tau": 0.869281045751634,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.957393182748313,
"kendall_tau": 0.8852941176470588,
"normalized_components": {
"pearson_r": 0.85797727582771,
"kendall_tau": 0.8725490196078431,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9566317943571925,
"kendall_tau": 0.8705882352941176,
"normalized_components": {
"pearson_r": 0.8554393145239749,
"kendall_tau": 0.8562091503267972,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504326,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9572258694596579,
"kendall_tau": 0.8764705882352941,
"normalized_components": {
"pearson_r": 0.8574195648655265,
"kendall_tau": 0.8627450980392157,
"anova_f": 0.7369479101001236,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9550029991897743,
"kendall_tau": 0.8794117647058822,
"normalized_components": {
"pearson_r": 0.8500099972992476,
"kendall_tau": 0.8660130718954246,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9572468384461269,
"kendall_tau": 0.8823529411764706,
"normalized_components": {
"pearson_r": 0.8574894614870897,
"kendall_tau": 0.869281045751634,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9562493164391831,
"kendall_tau": 0.8852941176470588,
"normalized_components": {
"pearson_r": 0.8541643881306104,
"kendall_tau": 0.8725490196078431,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350436,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9578106378411286,
"kendall_tau": 0.8999999999999999,
"normalized_components": {
"pearson_r": 0.8593687928037621,
"kendall_tau": 0.8888888888888888,
"anova_f": 0.7369479101001246,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9558424772370746,
"kendall_tau": 0.8852941176470588,
"normalized_components": {
"pearson_r": 0.8528082574569154,
"kendall_tau": 0.8725490196078431,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9571608796859989,
"kendall_tau": 0.8852941176470588,
"normalized_components": {
"pearson_r": 0.8572029322866632,
"kendall_tau": 0.8725490196078431,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350436,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.957892252800124,
"kendall_tau": 0.8970588235294118,
"normalized_components": {
"pearson_r": 0.8596408426670801,
"kendall_tau": 0.8856209150326798,
"anova_f": 0.7369479101001246,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350438,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9581030293756713,
"kendall_tau": 0.8794117647058823,
"normalized_components": {
"pearson_r": 0.8603434312522376,
"kendall_tau": 0.8660130718954249,
"anova_f": 0.7369479101001251,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9589978656186107,
"kendall_tau": 0.8823529411764706,
"normalized_components": {
"pearson_r": 0.8633262187287023,
"kendall_tau": 0.869281045751634,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350435,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9581285738686223,
"kendall_tau": 0.8882352941176471,
"normalized_components": {
"pearson_r": 0.8604285795620743,
"kendall_tau": 0.8758169934640524,
"anova_f": 0.7369479101001243,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504366,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9586364974274908,
"kendall_tau": 0.8941176470588235,
"normalized_components": {
"pearson_r": 0.862121658091636,
"kendall_tau": 0.8823529411764706,
"anova_f": 0.7369479101001247,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9577914569496006,
"kendall_tau": 0.8911764705882352,
"normalized_components": {
"pearson_r": 0.8593048564986685,
"kendall_tau": 0.8790849673202614,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9573575712282261,
"kendall_tau": 0.8823529411764706,
"normalized_components": {
"pearson_r": 0.8578585707607536,
"kendall_tau": 0.869281045751634,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9574915238164328,
"kendall_tau": 0.888235294117647,
"normalized_components": {
"pearson_r": 0.8583050793881093,
"kendall_tau": 0.8758169934640523,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350436,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9574577120574923,
"kendall_tau": 0.8852941176470588,
"normalized_components": {
"pearson_r": 0.8581923735249744,
"kendall_tau": 0.8725490196078431,
"anova_f": 0.7369479101001246,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9584377525136883,
"kendall_tau": 0.888235294117647,
"normalized_components": {
"pearson_r": 0.8614591750456279,
"kendall_tau": 0.8758169934640523,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9583037480685486,
"kendall_tau": 0.8823529411764705,
"normalized_components": {
"pearson_r": 0.8610124935618286,
"kendall_tau": 0.8692810457516338,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9588697553977031,
"kendall_tau": 0.8794117647058823,
"normalized_components": {
"pearson_r": 0.8628991846590105,
"kendall_tau": 0.8660130718954249,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504326,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9576467511709901,
"kendall_tau": 0.8852941176470588,
"normalized_components": {
"pearson_r": 0.8588225039033003,
"kendall_tau": 0.8725490196078431,
"anova_f": 0.7369479101001236,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9573568477788098,
"kendall_tau": 0.8852941176470588,
"normalized_components": {
"pearson_r": 0.8578561592626994,
"kendall_tau": 0.8725490196078431,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9586919151966741,
"kendall_tau": 0.8911764705882352,
"normalized_components": {
"pearson_r": 0.8623063839889136,
"kendall_tau": 0.8790849673202614,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9571512314671816,
"kendall_tau": 0.8941176470588235,
"normalized_components": {
"pearson_r": 0.8571707715572721,
"kendall_tau": 0.8823529411764706,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350435,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.957890135357559,
"kendall_tau": 0.8911764705882352,
"normalized_components": {
"pearson_r": 0.8596337845251966,
"kendall_tau": 0.8790849673202614,
"anova_f": 0.7369479101001243,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9581110583618601,
"kendall_tau": 0.8823529411764705,
"normalized_components": {
"pearson_r": 0.8603701945395336,
"kendall_tau": 0.8692810457516338,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9572716342948216,
"kendall_tau": 0.8970588235294117,
"normalized_components": {
"pearson_r": 0.8575721143160722,
"kendall_tau": 0.8856209150326797,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9573583592970925,
"kendall_tau": 0.8764705882352941,
"normalized_components": {
"pearson_r": 0.8578611976569749,
"kendall_tau": 0.8627450980392157,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9584861226881956,
"kendall_tau": 0.8823529411764706,
"normalized_components": {
"pearson_r": 0.8616204089606518,
"kendall_tau": 0.869281045751634,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350435,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9582347125705659,
"kendall_tau": 0.8970588235294117,
"normalized_components": {
"pearson_r": 0.8607823752352197,
"kendall_tau": 0.8856209150326797,
"anova_f": 0.7369479101001243,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9578527359738251,
"kendall_tau": 0.8852941176470587,
"normalized_components": {
"pearson_r": 0.8595091199127505,
"kendall_tau": 0.8725490196078429,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350435,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9564358105756929,
"kendall_tau": 0.8794117647058823,
"normalized_components": {
"pearson_r": 0.8547860352523098,
"kendall_tau": 0.8660130718954249,
"anova_f": 0.7369479101001243,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350436,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9590741978422709,
"kendall_tau": 0.8970588235294117,
"normalized_components": {
"pearson_r": 0.8635806594742363,
"kendall_tau": 0.8856209150326797,
"anova_f": 0.7369479101001246,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350436,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9579700683686015,
"kendall_tau": 0.8911764705882352,
"normalized_components": {
"pearson_r": 0.8599002278953385,
"kendall_tau": 0.8790849673202614,
"anova_f": 0.7369479101001246,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.958821636556474,
"kendall_tau": 0.8911764705882352,
"normalized_components": {
"pearson_r": 0.86273878852158,
"kendall_tau": 0.8790849673202614,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9553747194651808,
"kendall_tau": 0.8705882352941176,
"normalized_components": {
"pearson_r": 0.8512490648839361,
"kendall_tau": 0.8562091503267972,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9570688504216035,
"kendall_tau": 0.8705882352941176,
"normalized_components": {
"pearson_r": 0.8568961680720117,
"kendall_tau": 0.8562091503267972,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9577320550146401,
"kendall_tau": 0.8852941176470588,
"normalized_components": {
"pearson_r": 0.8591068500488004,
"kendall_tau": 0.8725490196078431,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9585426524551258,
"kendall_tau": 0.8823529411764706,
"normalized_components": {
"pearson_r": 0.8618088415170861,
"kendall_tau": 0.869281045751634,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350436,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9591466415899605,
"kendall_tau": 0.8999999999999999,
"normalized_components": {
"pearson_r": 0.8638221386332019,
"kendall_tau": 0.8888888888888888,
"anova_f": 0.7369479101001246,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9594158898559625,
"kendall_tau": 0.8911764705882353,
"normalized_components": {
"pearson_r": 0.8647196328532083,
"kendall_tau": 0.8790849673202615,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9581027688121884,
"kendall_tau": 0.8911764705882352,
"normalized_components": {
"pearson_r": 0.8603425627072946,
"kendall_tau": 0.8790849673202614,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9583688172006113,
"kendall_tau": 0.888235294117647,
"normalized_components": {
"pearson_r": 0.8612293906687043,
"kendall_tau": 0.8758169934640523,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9585047519676084,
"kendall_tau": 0.8999999999999999,
"normalized_components": {
"pearson_r": 0.8616825065586948,
"kendall_tau": 0.8888888888888888,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350433,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9565449754106187,
"kendall_tau": 0.8794117647058823,
"normalized_components": {
"pearson_r": 0.8551499180353956,
"kendall_tau": 0.8660130718954249,
"anova_f": 0.7369479101001237,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9580424598549911,
"kendall_tau": 0.888235294117647,
"normalized_components": {
"pearson_r": 0.8601415328499705,
"kendall_tau": 0.8758169934640523,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9559752545009476,
"kendall_tau": 0.8941176470588235,
"normalized_components": {
"pearson_r": 0.8532508483364919,
"kendall_tau": 0.8823529411764706,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9576981651354667,
"kendall_tau": 0.888235294117647,
"normalized_components": {
"pearson_r": 0.858993883784889,
"kendall_tau": 0.8758169934640523,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9580487486859879,
"kendall_tau": 0.8970588235294117,
"normalized_components": {
"pearson_r": 0.8601624956199596,
"kendall_tau": 0.8856209150326797,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9584774799106441,
"kendall_tau": 0.8911764705882352,
"normalized_components": {
"pearson_r": 0.8615915997021472,
"kendall_tau": 0.8790849673202614,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9580288323727177,
"kendall_tau": 0.888235294117647,
"normalized_components": {
"pearson_r": 0.8600961079090592,
"kendall_tau": 0.8758169934640523,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9562277436399984,
"kendall_tau": 0.876470588235294,
"normalized_components": {
"pearson_r": 0.8540924787999947,
"kendall_tau": 0.8627450980392155,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9574924086889268,
"kendall_tau": 0.8852941176470588,
"normalized_components": {
"pearson_r": 0.8583080289630894,
"kendall_tau": 0.8725490196078431,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350438,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9591292415513666,
"kendall_tau": 0.8852941176470588,
"normalized_components": {
"pearson_r": 0.8637641385045554,
"kendall_tau": 0.8725490196078431,
"anova_f": 0.7369479101001251,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9584072047425742,
"kendall_tau": 0.876470588235294,
"normalized_components": {
"pearson_r": 0.8613573491419142,
"kendall_tau": 0.8627450980392155,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9578742075117271,
"kendall_tau": 0.8911764705882352,
"normalized_components": {
"pearson_r": 0.8595806917057571,
"kendall_tau": 0.8790849673202614,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9587425423494627,
"kendall_tau": 0.8941176470588235,
"normalized_components": {
"pearson_r": 0.8624751411648756,
"kendall_tau": 0.8823529411764706,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504326,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.95642156448442,
"kendall_tau": 0.8705882352941176,
"normalized_components": {
"pearson_r": 0.8547385482813998,
"kendall_tau": 0.8562091503267972,
"anova_f": 0.7369479101001236,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9582522716397858,
"kendall_tau": 0.8852941176470588,
"normalized_components": {
"pearson_r": 0.8608409054659526,
"kendall_tau": 0.8725490196078431,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9583558726146496,
"kendall_tau": 0.888235294117647,
"normalized_components": {
"pearson_r": 0.8611862420488321,
"kendall_tau": 0.8758169934640523,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9571224736088342,
"kendall_tau": 0.8941176470588235,
"normalized_components": {
"pearson_r": 0.8570749120294473,
"kendall_tau": 0.8823529411764706,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9585552010120911,
"kendall_tau": 0.888235294117647,
"normalized_components": {
"pearson_r": 0.8618506700403038,
"kendall_tau": 0.8758169934640523,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9576843987919221,
"kendall_tau": 0.888235294117647,
"normalized_components": {
"pearson_r": 0.8589479959730738,
"kendall_tau": 0.8758169934640523,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9578855356706557,
"kendall_tau": 0.8941176470588235,
"normalized_components": {
"pearson_r": 0.859618452235519,
"kendall_tau": 0.8823529411764706,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350435,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9585612579991936,
"kendall_tau": 0.888235294117647,
"normalized_components": {
"pearson_r": 0.8618708599973119,
"kendall_tau": 0.8758169934640523,
"anova_f": 0.7369479101001243,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9562328952873171,
"kendall_tau": 0.8970588235294117,
"normalized_components": {
"pearson_r": 0.8541096509577238,
"kendall_tau": 0.8856209150326797,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9583849573526105,
"kendall_tau": 0.8911764705882352,
"normalized_components": {
"pearson_r": 0.8612831911753684,
"kendall_tau": 0.8790849673202614,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9571905327595688,
"kendall_tau": 0.8794117647058823,
"normalized_components": {
"pearson_r": 0.8573017758652293,
"kendall_tau": 0.8660130718954249,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350436,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9583702780898697,
"kendall_tau": 0.888235294117647,
"normalized_components": {
"pearson_r": 0.8612342602995658,
"kendall_tau": 0.8758169934640523,
"anova_f": 0.7369479101001246,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9558898750902126,
"kendall_tau": 0.8852941176470588,
"normalized_components": {
"pearson_r": 0.8529662503007087,
"kendall_tau": 0.8725490196078431,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9579360475647603,
"kendall_tau": 0.8941176470588235,
"normalized_components": {
"pearson_r": 0.8597868252158676,
"kendall_tau": 0.8823529411764706,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350435,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9568980856092495,
"kendall_tau": 0.8941176470588235,
"normalized_components": {
"pearson_r": 0.8563269520308316,
"kendall_tau": 0.8823529411764706,
"anova_f": 0.7369479101001243,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.958527027458611,
"kendall_tau": 0.8823529411764706,
"normalized_components": {
"pearson_r": 0.8617567581953699,
"kendall_tau": 0.869281045751634,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9582987588594254,
"kendall_tau": 0.8970588235294117,
"normalized_components": {
"pearson_r": 0.8609958628647514,
"kendall_tau": 0.8856209150326797,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.956971065074383,
"kendall_tau": 0.9,
"normalized_components": {
"pearson_r": 0.8565702169146101,
"kendall_tau": 0.888888888888889,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9586342311476586,
"kendall_tau": 0.8999999999999999,
"normalized_components": {
"pearson_r": 0.8621141038255286,
"kendall_tau": 0.8888888888888888,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9596303541330921,
"kendall_tau": 0.8852941176470587,
"normalized_components": {
"pearson_r": 0.8654345137769736,
"kendall_tau": 0.8725490196078429,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9574098132309395,
"kendall_tau": 0.8823529411764705,
"normalized_components": {
"pearson_r": 0.8580327107697985,
"kendall_tau": 0.8692810457516338,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9579034487768185,
"kendall_tau": 0.888235294117647,
"normalized_components": {
"pearson_r": 0.8596781625893952,
"kendall_tau": 0.8758169934640523,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9581880662958705,
"kendall_tau": 0.8823529411764706,
"normalized_components": {
"pearson_r": 0.8606268876529017,
"kendall_tau": 0.869281045751634,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9579999142537933,
"kendall_tau": 0.8970588235294117,
"normalized_components": {
"pearson_r": 0.859999714179311,
"kendall_tau": 0.8856209150326797,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350436,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.958523649628437,
"kendall_tau": 0.8794117647058822,
"normalized_components": {
"pearson_r": 0.8617454987614566,
"kendall_tau": 0.8660130718954246,
"anova_f": 0.7369479101001246,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9568818218159676,
"kendall_tau": 0.888235294117647,
"normalized_components": {
"pearson_r": 0.8562727393865587,
"kendall_tau": 0.8758169934640523,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504343,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9575438033611618,
"kendall_tau": 0.8764705882352941,
"normalized_components": {
"pearson_r": 0.8584793445372062,
"kendall_tau": 0.8627450980392157,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350436,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9577160197532969,
"kendall_tau": 0.8852941176470588,
"normalized_components": {
"pearson_r": 0.8590533991776563,
"kendall_tau": 0.8725490196078431,
"anova_f": 0.7369479101001246,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9562334967784376,
"kendall_tau": 0.8941176470588235,
"normalized_components": {
"pearson_r": 0.8541116559281254,
"kendall_tau": 0.8823529411764706,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.93176853504355,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9592307950995467,
"kendall_tau": 0.888235294117647,
"normalized_components": {
"pearson_r": 0.8641026503318223,
"kendall_tau": 0.8758169934640523,
"anova_f": 0.7369479101001244,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
},
{
"raw_score_distribution": {
"count": 2040,
"min": 2.26,
"max": 9.43,
"mean": 5.978,
"median": 5.875,
"stdev": 1.363,
"p10": 4.22,
"p25": 4.96,
"p75": 7.07,
"p90": 7.86
},
"cross_model_stats": {
"anova_f": 257.9317685350435,
"anova_p": 0.0,
"kw_stat": 1357.1321131387729,
"kw_p": 2.665121927943416e-279,
"std_dev_across_models": 1.8051081167539373,
"pearson_r": 0.9572482755777107,
"kendall_tau": 0.8852941176470588,
"normalized_components": {
"pearson_r": 0.8574942519257025,
"kendall_tau": 0.8725490196078431,
"anova_f": 0.7369479101001243,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079
}
},
"separability_metrics": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
},
"ci99_overlap_magnitude_sum": 8.246273487364045,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.31566886270330186,
"emd": {
"average": 2.195135707105006,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
}
},
"average_ci95": 0.22203622015971514,
"modulated_ci95": 0.5996755593310718
}
}
],
"raw_score_range": 3.920166666666667,
"final_judgemark_score_raw": 0.7177047101231313,
"final_judgemark_score_elements_raw": {
"norm_stability_between_iterations": 0.8641225490196077,
"norm_correlation_with_lmsys_arena": 0.8856209150326798,
"norm_std_dev_between_models": 0.5054197912348398,
"norm_kruskall_wallis": 0.9047547420925153,
"norm_ci99_adjacent_overlap": 0.8085561238366368,
"norm_score_range": 0.49002083333333335,
"norm_intra_model_ci95": 0.7883687805128173,
"norm_earth_movers_distance": 0.33760692401960773
},
"final_judgemark_score_elements_calibrated": {
"norm_stability_between_iterations": 0.8631715686274509,
"norm_correlation_with_lmsys_arena": 0.8758169934640521,
"norm_std_dev_between_models": 0.8205036894336079,
"norm_kruskall_wallis": 0.9047547420925153,
"norm_ci99_adjacent_overlap": 0.6828356351013829,
"norm_score_range": 0.7622920085492606,
"norm_intra_model_ci95": 0.5996755593310718,
"norm_earth_movers_distance": {
"pearson_r": 0.860124740637956,
"kendall_tau": 0.8758169934640521,
"anova_f": 0.7369479101001241,
"kw_stat": 0.9047547420925153,
"std_dev": 0.8205036894336079,
"ci99_overlap_magnitude_sum_norm": 0.6828356351013829,
"calibrated_score_range_norm": 0.7622920085492606,
"kendall_tau_bootstrapped": 0.8631715686274509
}
}
}