mirror of
https://github.com/wassname/Judgemark-v2lp.git
synced 2026-06-27 16:10:14 +08:00
22630 lines
1.6 MiB
Plaintext
22630 lines
1.6 MiB
Plaintext
{
|
|
"judge_model": "deepseek/deepseek-r1",
|
|
"start_time": "2025-01-29T15:56:08.738485",
|
|
"status": "completed",
|
|
"samples_file": "data/judgemark_v2.1_samples.json",
|
|
"prompts_file": "data/judge_prompts.json",
|
|
"end_time": "2025-01-31T15:23:15.576135",
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"calibration_config": {
|
|
"method": "piecewise_landmark",
|
|
"in_landmarks": [
|
|
2.26,
|
|
4.96,
|
|
5.875,
|
|
7.07,
|
|
9.43
|
|
],
|
|
"out_landmarks": [
|
|
0,
|
|
3,
|
|
5,
|
|
7,
|
|
10
|
|
]
|
|
},
|
|
"calibrated_score_distribution": {
|
|
"count": 2040,
|
|
"min": 0.0,
|
|
"max": 10.0,
|
|
"mean": 5.016,
|
|
"median": 4.996,
|
|
"stdev": 2.204,
|
|
"p10": 2.178,
|
|
"p25": 3.0,
|
|
"p75": 7.0,
|
|
"p90": 8.004
|
|
},
|
|
"raw_model_stats": {
|
|
"claude-3-5-sonnet-20240620": {
|
|
"count": 120,
|
|
"mean": 7.181666666666667,
|
|
"median": 7.29,
|
|
"stdev": 0.7121364864617877,
|
|
"ci95": 0.12741738443481387,
|
|
"min": 4.86,
|
|
"max": 9.21,
|
|
"length_correlation": -0.12476126162522726
|
|
},
|
|
"claude-3-haiku-20240307": {
|
|
"count": 120,
|
|
"mean": 5.9225,
|
|
"median": 5.96,
|
|
"stdev": 0.7522719090927721,
|
|
"ci95": 0.13459852270261174,
|
|
"min": 4.04,
|
|
"max": 7.39,
|
|
"length_correlation": -0.08517519820317922
|
|
},
|
|
"claude-3-opus-20240229": {
|
|
"count": 120,
|
|
"mean": 6.5328333333333335,
|
|
"median": 6.51,
|
|
"stdev": 0.9703358019919965,
|
|
"ci95": 0.17361510365458838,
|
|
"min": 3.96,
|
|
"max": 9.11,
|
|
"length_correlation": 0.024977861772695002
|
|
},
|
|
"gemini-1.5-pro-001": {
|
|
"count": 120,
|
|
"mean": 7.131083333333334,
|
|
"median": 7.18,
|
|
"stdev": 0.6812196129475616,
|
|
"ci95": 0.12188565388459714,
|
|
"min": 5.57,
|
|
"max": 9.04,
|
|
"length_correlation": -0.21650898706232274
|
|
},
|
|
"Llama-3-70b-chat-hf": {
|
|
"count": 120,
|
|
"mean": 6.006833333333334,
|
|
"median": 5.91,
|
|
"stdev": 0.7716913469866081,
|
|
"ci95": 0.13807310100419642,
|
|
"min": 4.07,
|
|
"max": 8.25,
|
|
"length_correlation": -0.28823478256326757
|
|
},
|
|
"Mixtral-8x7B-Instruct-v0.1": {
|
|
"count": 120,
|
|
"mean": 5.3620833333333335,
|
|
"median": 5.305,
|
|
"stdev": 0.795310727113035,
|
|
"ci95": 0.1422991443187773,
|
|
"min": 3.56,
|
|
"max": 7.75,
|
|
"length_correlation": -0.35908460420870325
|
|
},
|
|
"Llama-2-13b-chat-hf": {
|
|
"count": 120,
|
|
"mean": 4.867166666666667,
|
|
"median": 4.91,
|
|
"stdev": 0.7396696779596938,
|
|
"ci95": 0.13234369745556132,
|
|
"min": 3.0,
|
|
"max": 6.5,
|
|
"length_correlation": 0.21475595131056402
|
|
},
|
|
"gemma-7b-it": {
|
|
"count": 120,
|
|
"mean": 4.516166666666667,
|
|
"median": 4.645,
|
|
"stdev": 0.8357567730826364,
|
|
"ci95": 0.14953586015366155,
|
|
"min": 2.46,
|
|
"max": 6.18,
|
|
"length_correlation": -0.03934594071556713
|
|
},
|
|
"gemma-2b-it": {
|
|
"count": 120,
|
|
"mean": 4.2244166666666665,
|
|
"median": 4.21,
|
|
"stdev": 0.7734782249202432,
|
|
"ci95": 0.13839281403243808,
|
|
"min": 2.26,
|
|
"max": 6.36,
|
|
"length_correlation": 0.0025304787380251363
|
|
},
|
|
"Mixtral-8x22B-Instruct-v0.1": {
|
|
"count": 120,
|
|
"mean": 5.6025,
|
|
"median": 5.695,
|
|
"stdev": 0.8694712485763059,
|
|
"ci95": 0.15556814520949194,
|
|
"min": 3.36,
|
|
"max": 7.29,
|
|
"length_correlation": -0.05954676591781328
|
|
},
|
|
"c4ai-command-r-08-2024": {
|
|
"count": 120,
|
|
"mean": 5.40125,
|
|
"median": 5.32,
|
|
"stdev": 0.708126374445179,
|
|
"ci95": 0.12669988435700497,
|
|
"min": 2.71,
|
|
"max": 7.21,
|
|
"length_correlation": 0.06292781180795073
|
|
},
|
|
"gemini-1.5-pro-002": {
|
|
"count": 120,
|
|
"mean": 7.180833333333333,
|
|
"median": 7.275,
|
|
"stdev": 0.6776548579052445,
|
|
"ci95": 0.12124783828003574,
|
|
"min": 4.64,
|
|
"max": 8.39,
|
|
"length_correlation": -0.258803259514821
|
|
},
|
|
"Mistral-Large-Instruct-2411": {
|
|
"count": 120,
|
|
"mean": 5.8375,
|
|
"median": 5.75,
|
|
"stdev": 1.1288994045291274,
|
|
"ci95": 0.20198573187815255,
|
|
"min": 3.07,
|
|
"max": 8.82,
|
|
"length_correlation": -0.1768188568922991
|
|
},
|
|
"gpt-4o-2024-11-20": {
|
|
"count": 120,
|
|
"mean": 7.623833333333334,
|
|
"median": 7.66,
|
|
"stdev": 0.6280598153461667,
|
|
"ci95": 0.11237415925369076,
|
|
"min": 5.93,
|
|
"max": 9.11,
|
|
"length_correlation": 0.09866373846682355
|
|
},
|
|
"DeepSeek-R1": {
|
|
"count": 120,
|
|
"mean": 8.144583333333333,
|
|
"median": 8.14,
|
|
"stdev": 0.6445634257417353,
|
|
"ci95": 0.11532702982037446,
|
|
"min": 5.93,
|
|
"max": 9.43,
|
|
"length_correlation": 0.223446800463047
|
|
},
|
|
"gpt-3.5-turbo-0125": {
|
|
"count": 120,
|
|
"mean": 4.980166666666666,
|
|
"median": 5.04,
|
|
"stdev": 0.6795438204572011,
|
|
"ci95": 0.1215858165640322,
|
|
"min": 3.11,
|
|
"max": 7.61,
|
|
"length_correlation": -0.17926892838153338
|
|
},
|
|
"databricks/dbrx-instruct": {
|
|
"count": 120,
|
|
"mean": 5.103833333333333,
|
|
"median": 5.07,
|
|
"stdev": 0.9138579991317201,
|
|
"ci95": 0.16350994255712,
|
|
"min": 2.64,
|
|
"max": 8.0,
|
|
"length_correlation": -0.4113509255472687
|
|
}
|
|
},
|
|
"calibrated_model_stats": {
|
|
"claude-3-5-sonnet-20240620": {
|
|
"count": 120,
|
|
"mean": 7.038882061732457,
|
|
"median": 7.279661016949152,
|
|
"stdev": 1.0944594980631983,
|
|
"ci95": 0.19582365075256514,
|
|
"min": 2.8888888888888893,
|
|
"max": 9.720338983050848,
|
|
"length_correlation": -0.11799301330274381
|
|
},
|
|
"claude-3-haiku-20240307": {
|
|
"count": 120,
|
|
"mean": 4.970570479602721,
|
|
"median": 5.142259414225942,
|
|
"stdev": 1.3644509554360547,
|
|
"ci95": 0.2441312518545891,
|
|
"min": 1.9777777777777779,
|
|
"max": 7.406779661016948,
|
|
"length_correlation": -0.08214935451522755
|
|
},
|
|
"claude-3-opus-20240229": {
|
|
"count": 120,
|
|
"mean": 5.97477187604026,
|
|
"median": 6.06276150627615,
|
|
"stdev": 1.5843047749261256,
|
|
"ci95": 0.28346809130879347,
|
|
"min": 1.8888888888888888,
|
|
"max": 9.59322033898305,
|
|
"length_correlation": 0.03591482605222406
|
|
},
|
|
"gemini-1.5-pro-001": {
|
|
"count": 120,
|
|
"mean": 6.976388965586416,
|
|
"median": 7.139830508474575,
|
|
"stdev": 1.018702753773122,
|
|
"ci95": 0.18226904936049548,
|
|
"min": 4.333333333333334,
|
|
"max": 9.504237288135592,
|
|
"length_correlation": -0.2040023634235643
|
|
},
|
|
"Llama-3-70b-chat-hf": {
|
|
"count": 120,
|
|
"mean": 5.097079540223456,
|
|
"median": 5.05857740585774,
|
|
"stdev": 1.3579300333045548,
|
|
"ci95": 0.24296451084651774,
|
|
"min": 2.0111111111111115,
|
|
"max": 8.5,
|
|
"length_correlation": -0.26130862472962985
|
|
},
|
|
"Mixtral-8x7B-Instruct-v0.1": {
|
|
"count": 120,
|
|
"mean": 3.9545145951251075,
|
|
"median": 3.754098360655738,
|
|
"stdev": 1.3915146338781532,
|
|
"ci95": 0.24897355833071158,
|
|
"min": 1.4444444444444446,
|
|
"max": 7.864406779661017,
|
|
"length_correlation": -0.3292350373920184
|
|
},
|
|
"Llama-2-13b-chat-hf": {
|
|
"count": 120,
|
|
"mean": 3.16056157004113,
|
|
"median": 2.944444444444444,
|
|
"stdev": 1.1470044146364253,
|
|
"ci95": 0.20522512920843042,
|
|
"min": 0.8222222222222224,
|
|
"max": 6.04602510460251,
|
|
"length_correlation": 0.21187387552721762
|
|
},
|
|
"gemma-7b-it": {
|
|
"count": 120,
|
|
"mean": 2.664260472318124,
|
|
"median": 2.6500000000000004,
|
|
"stdev": 1.1663089835688045,
|
|
"ci95": 0.20867915484504174,
|
|
"min": 0.22222222222222243,
|
|
"max": 5.510460251046025,
|
|
"length_correlation": -0.024225342784098276
|
|
},
|
|
"gemma-2b-it": {
|
|
"count": 120,
|
|
"mean": 2.2565586790233545,
|
|
"median": 2.1666666666666665,
|
|
"stdev": 1.0168478224867272,
|
|
"ci95": 0.18193716004248986,
|
|
"min": 0.0,
|
|
"max": 5.811715481171548,
|
|
"length_correlation": 0.005129635119545636
|
|
},
|
|
"Mixtral-8x22B-Instruct-v0.1": {
|
|
"count": 120,
|
|
"mean": 4.433442385376863,
|
|
"median": 4.60655737704918,
|
|
"stdev": 1.5076105334040426,
|
|
"ci95": 0.26974575037874465,
|
|
"min": 1.2222222222222223,
|
|
"max": 7.279661016949152,
|
|
"length_correlation": -0.07285885938620999
|
|
},
|
|
"c4ai-command-r-08-2024": {
|
|
"count": 120,
|
|
"mean": 4.021473968783708,
|
|
"median": 3.78688524590164,
|
|
"stdev": 1.2656064204512008,
|
|
"ci95": 0.2264457205654668,
|
|
"min": 0.5000000000000001,
|
|
"max": 7.177966101694915,
|
|
"length_correlation": 0.0434394866959032
|
|
},
|
|
"gemini-1.5-pro-002": {
|
|
"count": 120,
|
|
"mean": 7.043768309502954,
|
|
"median": 7.260593220338983,
|
|
"stdev": 1.0372587099686337,
|
|
"ci95": 0.18558913118338627,
|
|
"min": 2.6444444444444444,
|
|
"max": 8.677966101694917,
|
|
"length_correlation": -0.22852472300266763
|
|
},
|
|
"Mistral-Large-Instruct-2411": {
|
|
"count": 120,
|
|
"mean": 4.809740521821776,
|
|
"median": 4.726775956284153,
|
|
"stdev": 1.882541347610382,
|
|
"ci95": 0.3368293847639776,
|
|
"min": 0.8999999999999999,
|
|
"max": 9.224576271186441,
|
|
"length_correlation": -0.1471370410607534
|
|
},
|
|
"gpt-4o-2024-11-20": {
|
|
"count": 120,
|
|
"mean": 7.675283520081318,
|
|
"median": 7.75,
|
|
"stdev": 0.8566572993642598,
|
|
"ci95": 0.1532754387916653,
|
|
"min": 5.092050209205021,
|
|
"max": 9.59322033898305,
|
|
"length_correlation": 0.07998702965293368
|
|
},
|
|
"DeepSeek-R1": {
|
|
"count": 120,
|
|
"mean": 8.35489474741744,
|
|
"median": 8.360169491525424,
|
|
"stdev": 0.8538147495851818,
|
|
"ci95": 0.15276684210428673,
|
|
"min": 5.092050209205021,
|
|
"max": 10.0,
|
|
"length_correlation": 0.21793652891136667
|
|
},
|
|
"gpt-3.5-turbo-0125": {
|
|
"count": 120,
|
|
"mean": 3.2934786570102488,
|
|
"median": 3.1748633879781423,
|
|
"stdev": 1.085699226625789,
|
|
"ci95": 0.19425623931569364,
|
|
"min": 0.9444444444444444,
|
|
"max": 7.686440677966102,
|
|
"length_correlation": -0.18820971899924424
|
|
},
|
|
"databricks/dbrx-instruct": {
|
|
"count": 120,
|
|
"mean": 3.5512556224401055,
|
|
"median": 3.240437158469946,
|
|
"stdev": 1.4656367020929375,
|
|
"ci95": 0.2622356790623018,
|
|
"min": 0.4222222222222226,
|
|
"max": 8.182203389830509,
|
|
"length_correlation": -0.3705867166259409
|
|
}
|
|
},
|
|
"raw_cross_model_stats": {
|
|
"anova_f": 251.66485889219211,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.1119235407166477,
|
|
"pearson_r": 0.962056318930382,
|
|
"kendall_tau": 0.8970588235294118,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8735210631012734,
|
|
"kendall_tau": 0.8856209150326798,
|
|
"anova_f": 0.7190424539776917,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.5054197912348398,
|
|
"ci99_overlap_magnitude_sum_norm": 0.8085561238366368,
|
|
"raw_score_range_norm": 0.49002083333333335,
|
|
"kendall_tau_bootstrapped": 0.8641225490196077
|
|
}
|
|
},
|
|
"calibrated_cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9580374221913868,
|
|
"kendall_tau": 0.8882352941176469,
|
|
"normalized_components": {
|
|
"pearson_r": 0.860124740637956,
|
|
"kendall_tau": 0.8758169934640521,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079,
|
|
"ci99_overlap_magnitude_sum_norm": 0.6828356351013829,
|
|
"calibrated_score_range_norm": 0.7622920085492606,
|
|
"kendall_tau_bootstrapped": 0.8631715686274509
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"raw": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__claude-3-5-sonnet-20240620": false,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": true,
|
|
"gemini-1.5-pro-002__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__claude-3-5-sonnet-20240620": 0.030533908570126833,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.4780312267862339,
|
|
"gemini-1.5-pro-002__gemini-1.5-pro-001": 0.42953855137130414,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.08843031464079942,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.4531836757918626,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5306675800894585,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.46984522005092444,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.3551843841931559,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.4911105634526738,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.3445908218876017,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.4383420149622479,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.38757072697520467,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.20466859022543638,
|
|
"gemma-7b-it__gemma-2b-it": 0.27584320125041195
|
|
},
|
|
"ci99_overlap_magnitude_sum": 4.977540780247442,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31534751220512697,
|
|
"emd": {
|
|
"average": 1.350427696078431,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 1.2591666666666668,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.6633333333333333,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.10225000000000005,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.1748333333333332,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 1.8195833333333333,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 2.3145000000000002,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 2.6654999999999998,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 2.95725,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 1.5791666666666666,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 1.7804166666666665,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.08166666666666669,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 1.3441666666666667,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.4438333333333334,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 0.9629166666666665,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 2.2015000000000002,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 2.0778333333333334,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 0.6116666666666666,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 1.2085833333333333,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.12133333333333338,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 0.5664166666666666,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.0553333333333332,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 1.406333333333333,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 1.6980833333333332,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.31999999999999995,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.52125,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 1.2583333333333333,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.3325000000000001,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 1.7013333333333334,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 2.2220833333333334,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 0.9460000000000001,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 0.8371666666666666,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 0.6310833333333333,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.5289999999999999,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 1.17075,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 1.6656666666666669,
|
|
"claude-3-opus-20240229__gemma-7b-it": 2.0166666666666666,
|
|
"claude-3-opus-20240229__gemma-2b-it": 2.308416666666667,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 0.9303333333333332,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.1315833333333334,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 0.6955,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.6953333333333334,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.0910000000000002,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 1.6117499999999998,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 1.5526666666666669,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 1.4289999999999998,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.1247500000000001,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 1.769,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 2.263916666666667,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 2.614916666666667,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 2.9066666666666667,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 1.5285833333333332,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 1.7298333333333331,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.10824999999999996,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 1.2935833333333333,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.49275,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.0135,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 2.1509166666666664,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 2.02725,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 0.64475,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.1396666666666668,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 1.4906666666666668,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 1.7824166666666668,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.4043333333333333,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 0.6055833333333334,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.174,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.3283333333333333,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 1.617,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 2.1377500000000005,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.0266666666666666,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 0.903,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.49491666666666667,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 0.8459166666666667,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.1376666666666666,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.26641666666666663,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.1195,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 1.81875,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.5075833333333333,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 2.26175,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 2.7824999999999998,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.3825833333333334,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.27125,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.35099999999999987,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.6427500000000002,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.7353333333333334,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.5389166666666667,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 2.3136666666666668,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 0.9703333333333335,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 2.756666666666667,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 3.2774166666666664,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.1461666666666666,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.25950000000000006,
|
|
"gemma-7b-it__gemma-2b-it": 0.3039166666666667,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.0863333333333334,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 0.8850833333333333,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 2.6646666666666667,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 1.3213333333333335,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 3.107666666666667,
|
|
"gemma-7b-it__DeepSeek-R1": 3.6284166666666664,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.46399999999999997,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.5876666666666666,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 1.3780833333333333,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.1768333333333336,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 2.9564166666666667,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 1.613083333333333,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 3.399416666666667,
|
|
"gemma-2b-it__DeepSeek-R1": 3.920166666666666,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 0.75575,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 0.8794166666666667,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.27641666666666664,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 1.5783333333333334,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.26766666666666666,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 2.021333333333333,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 2.5420833333333337,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6293333333333333,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.5205000000000001,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 1.7795833333333335,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.5175833333333334,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 2.222583333333333,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 2.743333333333333,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.4344166666666666,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.3330833333333334,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 1.3504999999999998,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.443,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 0.9637499999999999,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 2.2006666666666668,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 2.077,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 1.7863333333333338,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 2.307083333333333,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 0.8615000000000002,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 0.7336666666666667,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.5207499999999998,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 2.6436666666666664,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 2.5199999999999996,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 3.1644166666666664,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 3.04075,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.19883333333333336
|
|
}
|
|
},
|
|
"average_ci95": 0.13979175468006755,
|
|
"modulated_ci95": 0.7883687805128173
|
|
},
|
|
"calibrated": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
"calibrated_score_range": 6.098336068394085,
|
|
"final_judgemark_score": 0.7697031560468162,
|
|
"iteration_stability": {
|
|
"raw": {
|
|
"scoring_stability": {
|
|
"claude-3-5-sonnet-20240620": {
|
|
"mean_iter_score": 7.181666666666667,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.11439423790850059
|
|
},
|
|
"claude-3-haiku-20240307": {
|
|
"mean_iter_score": 5.9225,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.1169000641573821
|
|
},
|
|
"claude-3-opus-20240229": {
|
|
"mean_iter_score": 6.5328333333333335,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.18868595307782954
|
|
},
|
|
"gemini-1.5-pro-001": {
|
|
"mean_iter_score": 7.131083333333333,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.08383871752093754
|
|
},
|
|
"Llama-3-70b-chat-hf": {
|
|
"mean_iter_score": 6.006833333333334,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.07510760798865698
|
|
},
|
|
"Mixtral-8x7B-Instruct-v0.1": {
|
|
"mean_iter_score": 5.3620833333333335,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.1018410389883283
|
|
},
|
|
"Llama-2-13b-chat-hf": {
|
|
"mean_iter_score": 4.867166666666667,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.085162181878004
|
|
},
|
|
"gemma-7b-it": {
|
|
"mean_iter_score": 4.516166666666667,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.1272228469174376
|
|
},
|
|
"gemma-2b-it": {
|
|
"mean_iter_score": 4.2244166666666665,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.05735404180274562
|
|
},
|
|
"Mixtral-8x22B-Instruct-v0.1": {
|
|
"mean_iter_score": 5.6025,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.14790340316120762
|
|
},
|
|
"c4ai-command-r-08-2024": {
|
|
"mean_iter_score": 5.40125,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.052260698001036866
|
|
},
|
|
"gemini-1.5-pro-002": {
|
|
"mean_iter_score": 7.180833333333333,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.08822516962610806
|
|
},
|
|
"Mistral-Large-Instruct-2411": {
|
|
"mean_iter_score": 5.8375,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.09184981703241937
|
|
},
|
|
"gpt-4o-2024-11-20": {
|
|
"mean_iter_score": 7.623833333333334,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.08676852988139051
|
|
},
|
|
"DeepSeek-R1": {
|
|
"mean_iter_score": 8.144583333333333,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.20265272018899674
|
|
},
|
|
"gpt-3.5-turbo-0125": {
|
|
"mean_iter_score": 4.980166666666666,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.1592102034697803
|
|
},
|
|
"databricks/dbrx-instruct": {
|
|
"mean_iter_score": 5.103833333333333,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.15379739558551966
|
|
}
|
|
},
|
|
"ranking_stability": {
|
|
"pairwise_correlation": {
|
|
"1__vs__2": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.9558823529411764,
|
|
"p_value": 5.347391697765181e-12
|
|
},
|
|
"1__vs__3": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.9705882352941175,
|
|
"p_value": 8.546830053210383e-13
|
|
},
|
|
"1__vs__4": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.8970588235294118,
|
|
"p_value": 1.2313901628307946e-09
|
|
},
|
|
"1__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.926470588235294,
|
|
"p_value": 1.080161877119549e-10
|
|
},
|
|
"2__vs__3": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.9558823529411764,
|
|
"p_value": 5.347391697765181e-12
|
|
},
|
|
"2__vs__4": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.9411764705882352,
|
|
"p_value": 2.628150241362193e-11
|
|
},
|
|
"2__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.9411764705882352,
|
|
"p_value": 2.628150241362193e-11
|
|
},
|
|
"3__vs__4": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.926470588235294,
|
|
"p_value": 1.080161877119549e-10
|
|
},
|
|
"3__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.926470588235294,
|
|
"p_value": 1.080161877119549e-10
|
|
},
|
|
"4__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.8823529411764705,
|
|
"p_value": 3.5743855407137387e-09
|
|
}
|
|
},
|
|
"average_kendall_tau": 0.9323529411764705
|
|
},
|
|
"randomized_average_kendall_tau_by_item": 0.9184735294117646
|
|
},
|
|
"calibrated": {
|
|
"scoring_stability": {
|
|
"claude-3-5-sonnet-20240620": {
|
|
"mean_iter_score": 7.038882061732457,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.15783950115828302
|
|
},
|
|
"claude-3-haiku-20240307": {
|
|
"mean_iter_score": 4.97057047960272,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.21362852261029117
|
|
},
|
|
"claude-3-opus-20240229": {
|
|
"mean_iter_score": 5.97477187604026,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.31455229632227294
|
|
},
|
|
"gemini-1.5-pro-001": {
|
|
"mean_iter_score": 6.976388965586416,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.13161711482782049
|
|
},
|
|
"Llama-3-70b-chat-hf": {
|
|
"mean_iter_score": 5.097079540223456,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.1365404900361279
|
|
},
|
|
"Mixtral-8x7B-Instruct-v0.1": {
|
|
"mean_iter_score": 3.9545145951251075,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.17923578285084113
|
|
},
|
|
"Llama-2-13b-chat-hf": {
|
|
"mean_iter_score": 3.16056157004113,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.1468416485120612
|
|
},
|
|
"gemma-7b-it": {
|
|
"mean_iter_score": 2.664260472318124,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.16993019534974582
|
|
},
|
|
"gemma-2b-it": {
|
|
"mean_iter_score": 2.2565586790233545,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.11080727566776136
|
|
},
|
|
"Mixtral-8x22B-Instruct-v0.1": {
|
|
"mean_iter_score": 4.433442385376863,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.25279458782241115
|
|
},
|
|
"c4ai-command-r-08-2024": {
|
|
"mean_iter_score": 4.021473968783708,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.09669016809797659
|
|
},
|
|
"gemini-1.5-pro-002": {
|
|
"mean_iter_score": 7.043768309502954,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.13622065843714878
|
|
},
|
|
"Mistral-Large-Instruct-2411": {
|
|
"mean_iter_score": 4.809740521821776,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.16393733480034264
|
|
},
|
|
"gpt-4o-2024-11-20": {
|
|
"mean_iter_score": 7.675283520081318,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.12985108049520988
|
|
},
|
|
"DeepSeek-R1": {
|
|
"mean_iter_score": 8.35489474741744,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.268310536220573
|
|
},
|
|
"gpt-3.5-turbo-0125": {
|
|
"mean_iter_score": 3.2934786570102488,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.2658041166397857
|
|
},
|
|
"databricks/dbrx-instruct": {
|
|
"mean_iter_score": 3.5512556224401055,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.28256941305776895
|
|
}
|
|
},
|
|
"ranking_stability": {
|
|
"pairwise_correlation": {
|
|
"1__vs__2": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.926470588235294,
|
|
"p_value": 1.080161877119549e-10
|
|
},
|
|
"1__vs__3": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.9411764705882352,
|
|
"p_value": 2.628150241362193e-11
|
|
},
|
|
"1__vs__4": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.8676470588235293,
|
|
"p_value": 9.575975226992579e-09
|
|
},
|
|
"1__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.926470588235294,
|
|
"p_value": 1.080161877119549e-10
|
|
},
|
|
"2__vs__3": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.9558823529411764,
|
|
"p_value": 5.347391697765181e-12
|
|
},
|
|
"2__vs__4": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.9117647058823529,
|
|
"p_value": 3.8599058936360526e-10
|
|
},
|
|
"2__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.9117647058823529,
|
|
"p_value": 3.8599058936360526e-10
|
|
},
|
|
"3__vs__4": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.926470588235294,
|
|
"p_value": 1.080161877119549e-10
|
|
},
|
|
"3__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.926470588235294,
|
|
"p_value": 1.080161877119549e-10
|
|
},
|
|
"4__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.8823529411764705,
|
|
"p_value": 3.5743855407137387e-09
|
|
}
|
|
},
|
|
"average_kendall_tau": 0.9176470588235294
|
|
},
|
|
"randomized_average_kendall_tau_by_item": 0.9179029411764705
|
|
}
|
|
},
|
|
"ephemeral_runs": [
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9581152460205015,
|
|
"kendall_tau": 0.8705882352941176,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8603841534016716,
|
|
"kendall_tau": 0.8562091503267972,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350436,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9586483685892335,
|
|
"kendall_tau": 0.888235294117647,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8621612286307785,
|
|
"kendall_tau": 0.8758169934640523,
|
|
"anova_f": 0.7369479101001246,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9582986178372387,
|
|
"kendall_tau": 0.8852941176470588,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8609953927907956,
|
|
"kendall_tau": 0.8725490196078431,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9581454124025393,
|
|
"kendall_tau": 0.8647058823529411,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8604847080084642,
|
|
"kendall_tau": 0.849673202614379,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504326,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9585942833032348,
|
|
"kendall_tau": 0.8794117647058822,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8619809443441161,
|
|
"kendall_tau": 0.8660130718954246,
|
|
"anova_f": 0.7369479101001236,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350438,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.957889580302259,
|
|
"kendall_tau": 0.8735294117647058,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8596319343408634,
|
|
"kendall_tau": 0.8594771241830064,
|
|
"anova_f": 0.7369479101001251,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350436,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9580092679939227,
|
|
"kendall_tau": 0.9029411764705882,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8600308933130758,
|
|
"kendall_tau": 0.8921568627450981,
|
|
"anova_f": 0.7369479101001246,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350436,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9587213678567964,
|
|
"kendall_tau": 0.9029411764705881,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8624045595226547,
|
|
"kendall_tau": 0.892156862745098,
|
|
"anova_f": 0.7369479101001246,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9573541486270019,
|
|
"kendall_tau": 0.8852941176470588,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8578471620900062,
|
|
"kendall_tau": 0.8725490196078431,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9559481957127732,
|
|
"kendall_tau": 0.8823529411764706,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8531606523759108,
|
|
"kendall_tau": 0.869281045751634,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.958703626076749,
|
|
"kendall_tau": 0.8823529411764706,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8623454202558299,
|
|
"kendall_tau": 0.869281045751634,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.957393182748313,
|
|
"kendall_tau": 0.8852941176470588,
|
|
"normalized_components": {
|
|
"pearson_r": 0.85797727582771,
|
|
"kendall_tau": 0.8725490196078431,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9566317943571925,
|
|
"kendall_tau": 0.8705882352941176,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8554393145239749,
|
|
"kendall_tau": 0.8562091503267972,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504326,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9572258694596579,
|
|
"kendall_tau": 0.8764705882352941,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8574195648655265,
|
|
"kendall_tau": 0.8627450980392157,
|
|
"anova_f": 0.7369479101001236,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9550029991897743,
|
|
"kendall_tau": 0.8794117647058822,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8500099972992476,
|
|
"kendall_tau": 0.8660130718954246,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9572468384461269,
|
|
"kendall_tau": 0.8823529411764706,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8574894614870897,
|
|
"kendall_tau": 0.869281045751634,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9562493164391831,
|
|
"kendall_tau": 0.8852941176470588,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8541643881306104,
|
|
"kendall_tau": 0.8725490196078431,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350436,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9578106378411286,
|
|
"kendall_tau": 0.8999999999999999,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8593687928037621,
|
|
"kendall_tau": 0.8888888888888888,
|
|
"anova_f": 0.7369479101001246,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9558424772370746,
|
|
"kendall_tau": 0.8852941176470588,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8528082574569154,
|
|
"kendall_tau": 0.8725490196078431,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9571608796859989,
|
|
"kendall_tau": 0.8852941176470588,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8572029322866632,
|
|
"kendall_tau": 0.8725490196078431,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350436,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.957892252800124,
|
|
"kendall_tau": 0.8970588235294118,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8596408426670801,
|
|
"kendall_tau": 0.8856209150326798,
|
|
"anova_f": 0.7369479101001246,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350438,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9581030293756713,
|
|
"kendall_tau": 0.8794117647058823,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8603434312522376,
|
|
"kendall_tau": 0.8660130718954249,
|
|
"anova_f": 0.7369479101001251,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9589978656186107,
|
|
"kendall_tau": 0.8823529411764706,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8633262187287023,
|
|
"kendall_tau": 0.869281045751634,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350435,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9581285738686223,
|
|
"kendall_tau": 0.8882352941176471,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8604285795620743,
|
|
"kendall_tau": 0.8758169934640524,
|
|
"anova_f": 0.7369479101001243,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504366,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9586364974274908,
|
|
"kendall_tau": 0.8941176470588235,
|
|
"normalized_components": {
|
|
"pearson_r": 0.862121658091636,
|
|
"kendall_tau": 0.8823529411764706,
|
|
"anova_f": 0.7369479101001247,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9577914569496006,
|
|
"kendall_tau": 0.8911764705882352,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8593048564986685,
|
|
"kendall_tau": 0.8790849673202614,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9573575712282261,
|
|
"kendall_tau": 0.8823529411764706,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8578585707607536,
|
|
"kendall_tau": 0.869281045751634,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9574915238164328,
|
|
"kendall_tau": 0.888235294117647,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8583050793881093,
|
|
"kendall_tau": 0.8758169934640523,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350436,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9574577120574923,
|
|
"kendall_tau": 0.8852941176470588,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8581923735249744,
|
|
"kendall_tau": 0.8725490196078431,
|
|
"anova_f": 0.7369479101001246,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9584377525136883,
|
|
"kendall_tau": 0.888235294117647,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8614591750456279,
|
|
"kendall_tau": 0.8758169934640523,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9583037480685486,
|
|
"kendall_tau": 0.8823529411764705,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8610124935618286,
|
|
"kendall_tau": 0.8692810457516338,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9588697553977031,
|
|
"kendall_tau": 0.8794117647058823,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8628991846590105,
|
|
"kendall_tau": 0.8660130718954249,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504326,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9576467511709901,
|
|
"kendall_tau": 0.8852941176470588,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8588225039033003,
|
|
"kendall_tau": 0.8725490196078431,
|
|
"anova_f": 0.7369479101001236,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9573568477788098,
|
|
"kendall_tau": 0.8852941176470588,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8578561592626994,
|
|
"kendall_tau": 0.8725490196078431,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9586919151966741,
|
|
"kendall_tau": 0.8911764705882352,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8623063839889136,
|
|
"kendall_tau": 0.8790849673202614,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9571512314671816,
|
|
"kendall_tau": 0.8941176470588235,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8571707715572721,
|
|
"kendall_tau": 0.8823529411764706,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350435,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.957890135357559,
|
|
"kendall_tau": 0.8911764705882352,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8596337845251966,
|
|
"kendall_tau": 0.8790849673202614,
|
|
"anova_f": 0.7369479101001243,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9581110583618601,
|
|
"kendall_tau": 0.8823529411764705,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8603701945395336,
|
|
"kendall_tau": 0.8692810457516338,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9572716342948216,
|
|
"kendall_tau": 0.8970588235294117,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8575721143160722,
|
|
"kendall_tau": 0.8856209150326797,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9573583592970925,
|
|
"kendall_tau": 0.8764705882352941,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8578611976569749,
|
|
"kendall_tau": 0.8627450980392157,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9584861226881956,
|
|
"kendall_tau": 0.8823529411764706,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8616204089606518,
|
|
"kendall_tau": 0.869281045751634,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350435,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9582347125705659,
|
|
"kendall_tau": 0.8970588235294117,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8607823752352197,
|
|
"kendall_tau": 0.8856209150326797,
|
|
"anova_f": 0.7369479101001243,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9578527359738251,
|
|
"kendall_tau": 0.8852941176470587,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8595091199127505,
|
|
"kendall_tau": 0.8725490196078429,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350435,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9564358105756929,
|
|
"kendall_tau": 0.8794117647058823,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8547860352523098,
|
|
"kendall_tau": 0.8660130718954249,
|
|
"anova_f": 0.7369479101001243,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350436,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9590741978422709,
|
|
"kendall_tau": 0.8970588235294117,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8635806594742363,
|
|
"kendall_tau": 0.8856209150326797,
|
|
"anova_f": 0.7369479101001246,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350436,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9579700683686015,
|
|
"kendall_tau": 0.8911764705882352,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8599002278953385,
|
|
"kendall_tau": 0.8790849673202614,
|
|
"anova_f": 0.7369479101001246,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.958821636556474,
|
|
"kendall_tau": 0.8911764705882352,
|
|
"normalized_components": {
|
|
"pearson_r": 0.86273878852158,
|
|
"kendall_tau": 0.8790849673202614,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9553747194651808,
|
|
"kendall_tau": 0.8705882352941176,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8512490648839361,
|
|
"kendall_tau": 0.8562091503267972,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9570688504216035,
|
|
"kendall_tau": 0.8705882352941176,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8568961680720117,
|
|
"kendall_tau": 0.8562091503267972,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9577320550146401,
|
|
"kendall_tau": 0.8852941176470588,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8591068500488004,
|
|
"kendall_tau": 0.8725490196078431,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9585426524551258,
|
|
"kendall_tau": 0.8823529411764706,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8618088415170861,
|
|
"kendall_tau": 0.869281045751634,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350436,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9591466415899605,
|
|
"kendall_tau": 0.8999999999999999,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8638221386332019,
|
|
"kendall_tau": 0.8888888888888888,
|
|
"anova_f": 0.7369479101001246,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9594158898559625,
|
|
"kendall_tau": 0.8911764705882353,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8647196328532083,
|
|
"kendall_tau": 0.8790849673202615,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9581027688121884,
|
|
"kendall_tau": 0.8911764705882352,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8603425627072946,
|
|
"kendall_tau": 0.8790849673202614,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9583688172006113,
|
|
"kendall_tau": 0.888235294117647,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8612293906687043,
|
|
"kendall_tau": 0.8758169934640523,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9585047519676084,
|
|
"kendall_tau": 0.8999999999999999,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8616825065586948,
|
|
"kendall_tau": 0.8888888888888888,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350433,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9565449754106187,
|
|
"kendall_tau": 0.8794117647058823,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8551499180353956,
|
|
"kendall_tau": 0.8660130718954249,
|
|
"anova_f": 0.7369479101001237,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9580424598549911,
|
|
"kendall_tau": 0.888235294117647,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8601415328499705,
|
|
"kendall_tau": 0.8758169934640523,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9559752545009476,
|
|
"kendall_tau": 0.8941176470588235,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8532508483364919,
|
|
"kendall_tau": 0.8823529411764706,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9576981651354667,
|
|
"kendall_tau": 0.888235294117647,
|
|
"normalized_components": {
|
|
"pearson_r": 0.858993883784889,
|
|
"kendall_tau": 0.8758169934640523,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9580487486859879,
|
|
"kendall_tau": 0.8970588235294117,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8601624956199596,
|
|
"kendall_tau": 0.8856209150326797,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9584774799106441,
|
|
"kendall_tau": 0.8911764705882352,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8615915997021472,
|
|
"kendall_tau": 0.8790849673202614,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9580288323727177,
|
|
"kendall_tau": 0.888235294117647,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8600961079090592,
|
|
"kendall_tau": 0.8758169934640523,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9562277436399984,
|
|
"kendall_tau": 0.876470588235294,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8540924787999947,
|
|
"kendall_tau": 0.8627450980392155,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9574924086889268,
|
|
"kendall_tau": 0.8852941176470588,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8583080289630894,
|
|
"kendall_tau": 0.8725490196078431,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350438,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9591292415513666,
|
|
"kendall_tau": 0.8852941176470588,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8637641385045554,
|
|
"kendall_tau": 0.8725490196078431,
|
|
"anova_f": 0.7369479101001251,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9584072047425742,
|
|
"kendall_tau": 0.876470588235294,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8613573491419142,
|
|
"kendall_tau": 0.8627450980392155,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9578742075117271,
|
|
"kendall_tau": 0.8911764705882352,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8595806917057571,
|
|
"kendall_tau": 0.8790849673202614,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9587425423494627,
|
|
"kendall_tau": 0.8941176470588235,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8624751411648756,
|
|
"kendall_tau": 0.8823529411764706,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504326,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.95642156448442,
|
|
"kendall_tau": 0.8705882352941176,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8547385482813998,
|
|
"kendall_tau": 0.8562091503267972,
|
|
"anova_f": 0.7369479101001236,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9582522716397858,
|
|
"kendall_tau": 0.8852941176470588,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8608409054659526,
|
|
"kendall_tau": 0.8725490196078431,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9583558726146496,
|
|
"kendall_tau": 0.888235294117647,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8611862420488321,
|
|
"kendall_tau": 0.8758169934640523,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9571224736088342,
|
|
"kendall_tau": 0.8941176470588235,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8570749120294473,
|
|
"kendall_tau": 0.8823529411764706,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9585552010120911,
|
|
"kendall_tau": 0.888235294117647,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8618506700403038,
|
|
"kendall_tau": 0.8758169934640523,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9576843987919221,
|
|
"kendall_tau": 0.888235294117647,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8589479959730738,
|
|
"kendall_tau": 0.8758169934640523,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9578855356706557,
|
|
"kendall_tau": 0.8941176470588235,
|
|
"normalized_components": {
|
|
"pearson_r": 0.859618452235519,
|
|
"kendall_tau": 0.8823529411764706,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350435,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9585612579991936,
|
|
"kendall_tau": 0.888235294117647,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8618708599973119,
|
|
"kendall_tau": 0.8758169934640523,
|
|
"anova_f": 0.7369479101001243,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9562328952873171,
|
|
"kendall_tau": 0.8970588235294117,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8541096509577238,
|
|
"kendall_tau": 0.8856209150326797,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9583849573526105,
|
|
"kendall_tau": 0.8911764705882352,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8612831911753684,
|
|
"kendall_tau": 0.8790849673202614,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9571905327595688,
|
|
"kendall_tau": 0.8794117647058823,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8573017758652293,
|
|
"kendall_tau": 0.8660130718954249,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350436,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9583702780898697,
|
|
"kendall_tau": 0.888235294117647,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8612342602995658,
|
|
"kendall_tau": 0.8758169934640523,
|
|
"anova_f": 0.7369479101001246,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9558898750902126,
|
|
"kendall_tau": 0.8852941176470588,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8529662503007087,
|
|
"kendall_tau": 0.8725490196078431,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9579360475647603,
|
|
"kendall_tau": 0.8941176470588235,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8597868252158676,
|
|
"kendall_tau": 0.8823529411764706,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350435,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9568980856092495,
|
|
"kendall_tau": 0.8941176470588235,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8563269520308316,
|
|
"kendall_tau": 0.8823529411764706,
|
|
"anova_f": 0.7369479101001243,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.958527027458611,
|
|
"kendall_tau": 0.8823529411764706,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8617567581953699,
|
|
"kendall_tau": 0.869281045751634,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9582987588594254,
|
|
"kendall_tau": 0.8970588235294117,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8609958628647514,
|
|
"kendall_tau": 0.8856209150326797,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.956971065074383,
|
|
"kendall_tau": 0.9,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8565702169146101,
|
|
"kendall_tau": 0.888888888888889,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9586342311476586,
|
|
"kendall_tau": 0.8999999999999999,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8621141038255286,
|
|
"kendall_tau": 0.8888888888888888,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9596303541330921,
|
|
"kendall_tau": 0.8852941176470587,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8654345137769736,
|
|
"kendall_tau": 0.8725490196078429,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9574098132309395,
|
|
"kendall_tau": 0.8823529411764705,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8580327107697985,
|
|
"kendall_tau": 0.8692810457516338,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9579034487768185,
|
|
"kendall_tau": 0.888235294117647,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8596781625893952,
|
|
"kendall_tau": 0.8758169934640523,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9581880662958705,
|
|
"kendall_tau": 0.8823529411764706,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8606268876529017,
|
|
"kendall_tau": 0.869281045751634,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9579999142537933,
|
|
"kendall_tau": 0.8970588235294117,
|
|
"normalized_components": {
|
|
"pearson_r": 0.859999714179311,
|
|
"kendall_tau": 0.8856209150326797,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350436,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.958523649628437,
|
|
"kendall_tau": 0.8794117647058822,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8617454987614566,
|
|
"kendall_tau": 0.8660130718954246,
|
|
"anova_f": 0.7369479101001246,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9568818218159676,
|
|
"kendall_tau": 0.888235294117647,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8562727393865587,
|
|
"kendall_tau": 0.8758169934640523,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504343,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9575438033611618,
|
|
"kendall_tau": 0.8764705882352941,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8584793445372062,
|
|
"kendall_tau": 0.8627450980392157,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350436,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9577160197532969,
|
|
"kendall_tau": 0.8852941176470588,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8590533991776563,
|
|
"kendall_tau": 0.8725490196078431,
|
|
"anova_f": 0.7369479101001246,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9562334967784376,
|
|
"kendall_tau": 0.8941176470588235,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8541116559281254,
|
|
"kendall_tau": 0.8823529411764706,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.93176853504355,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9592307950995467,
|
|
"kendall_tau": 0.888235294117647,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8641026503318223,
|
|
"kendall_tau": 0.8758169934640523,
|
|
"anova_f": 0.7369479101001244,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
},
|
|
{
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.26,
|
|
"max": 9.43,
|
|
"mean": 5.978,
|
|
"median": 5.875,
|
|
"stdev": 1.363,
|
|
"p10": 4.22,
|
|
"p25": 4.96,
|
|
"p75": 7.07,
|
|
"p90": 7.86
|
|
},
|
|
"cross_model_stats": {
|
|
"anova_f": 257.9317685350435,
|
|
"anova_p": 0.0,
|
|
"kw_stat": 1357.1321131387729,
|
|
"kw_p": 2.665121927943416e-279,
|
|
"std_dev_across_models": 1.8051081167539373,
|
|
"pearson_r": 0.9572482755777107,
|
|
"kendall_tau": 0.8852941176470588,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8574942519257025,
|
|
"kendall_tau": 0.8725490196078431,
|
|
"anova_f": 0.7369479101001243,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": false,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": false,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": false,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 0.75,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.0,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.036487816928959305,
|
|
"gemini-1.5-pro-002__claude-3-5-sonnet-20240620": 0.7317029426362467,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.682840263350756,
|
|
"gemini-1.5-pro-001__claude-3-opus-20240229": 0.0,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.16006316646009644,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.8337018112160095,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.9625108659782562,
|
|
"Mistral-Large-Instruct-2411__Mixtral-8x22B-Instruct-v0.1": 0.8194421757270955,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.566172806133221,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.8702336697492035,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.6044867420510771,
|
|
"databricks/dbrx-instruct__gpt-3.5-turbo-0125": 0.6421046112471536,
|
|
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6545797104285263,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.3196275632943628,
|
|
"gemma-7b-it__gemma-2b-it": 0.36231934216307904
|
|
},
|
|
"ci99_overlap_magnitude_sum": 8.246273487364045,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.31566886270330186,
|
|
"emd": {
|
|
"average": 2.195135707105006,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 2.0683115821297364,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 1.0825423890820267,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.15828143521107407,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.941802521509001,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.084367466607349,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.8783204916913276,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 4.374621589414334,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.7823233827091025,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.6054396763555934,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 3.0174080929487483,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.12137994929416104,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 2.229141539910681,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6385201024166578,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.316012685684984,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.7454034047222082,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.4876264392923506,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.0056828779190217,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 2.005818485983696,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.19133594251325994,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0236830031216806,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.8100089095615908,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.306310007284597,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.714011800579366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5371280942258568,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.9490965108190121,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.073197829900234,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5243467296410962,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.7047130404785973,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 3.3843242678147205,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6817528395416237,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.442831806315157,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 1.0433543776817493,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.8810256691501379,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.0202572809151533,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.814210305999131,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.310511403722137,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.7182131970169063,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.541329490663397,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.9532979072565526,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 1.1293777893948964,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 1.1650313542184845,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.7005116440410575,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 2.3801228713771803,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 2.6812932190300116,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.4235162536001544,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.8799450185832995,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.021874370461309,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.815827395545287,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 4.312128493268293,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.719830286563062,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.542946580209553,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.9549149968027093,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.15634084399660408,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 2.16664844376464,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.6988945544949016,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.3785057818310245,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.682910308576168,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.425133343146311,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1425649450983486,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9365179701823263,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.432819067905332,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.8405208612001016,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.6636371548465927,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 1.0756055714397479,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.9466887692794979,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.5310445475603683,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.5782039798578618,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.2578152071939845,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.803600883213207,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.5458239177833502,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7939530250839777,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2902541228069837,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.697955916101753,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.5092306778851521,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18718904631635067,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.0892537143778465,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.8909666674374094,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.72076892495621,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.400380152292334,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6617766788555992,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.41978439641381526,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.4963010977230058,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 0.9040028910177753,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2728808153357336,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8662827691129487,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.883206739461824,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.6491789517806463,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.5147219500401885,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.194333177376311,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19476928178002154,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.41606442276934635,
|
|
"gemma-7b-it__gemma-2b-it": 0.4229078989820468,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.769181913058739,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.357213496465584,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 4.379507837184829,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.145480049503652,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 5.011023047763194,
|
|
"gemma-7b-it__DeepSeek-R1": 5.690634275099317,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.629218184692125,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.8869951501219818,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.1768837063535087,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.7649152897603535,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.787209630479599,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.5531818427984216,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 5.418724841057964,
|
|
"gemma-2b-it__DeepSeek-R1": 6.0983360683940875,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 1.0369199779868945,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.2946969434167515,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4954869351116738,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 2.6103259241260908,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.41438556813890176,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.2418411347044547,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 3.9214523620405775,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1485952412354152,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.9099410002248931,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 3.022294340719246,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.8786369234084384,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.65380955129761,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 4.333420778633732,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 0.743877295452053,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.5191123373135098,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 2.2431379571727033,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6315152105783643,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 1.311126437914487,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.7502896524927047,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.492512687062848,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.8655429982595417,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 3.5451542255956645,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.520891494441157,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.2584848993816702,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.6796112273361227,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.3818048630710695,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.124027897641212,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.061416090407192,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 4.803639124977336,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.3412954839483753
|
|
}
|
|
},
|
|
"average_ci95": 0.22203622015971514,
|
|
"modulated_ci95": 0.5996755593310718
|
|
}
|
|
}
|
|
],
|
|
"raw_score_range": 3.920166666666667,
|
|
"final_judgemark_score_raw": 0.7177047101231313,
|
|
"final_judgemark_score_elements_raw": {
|
|
"norm_stability_between_iterations": 0.8641225490196077,
|
|
"norm_correlation_with_lmsys_arena": 0.8856209150326798,
|
|
"norm_std_dev_between_models": 0.5054197912348398,
|
|
"norm_kruskall_wallis": 0.9047547420925153,
|
|
"norm_ci99_adjacent_overlap": 0.8085561238366368,
|
|
"norm_score_range": 0.49002083333333335,
|
|
"norm_intra_model_ci95": 0.7883687805128173,
|
|
"norm_earth_movers_distance": 0.33760692401960773
|
|
},
|
|
"final_judgemark_score_elements_calibrated": {
|
|
"norm_stability_between_iterations": 0.8631715686274509,
|
|
"norm_correlation_with_lmsys_arena": 0.8758169934640521,
|
|
"norm_std_dev_between_models": 0.8205036894336079,
|
|
"norm_kruskall_wallis": 0.9047547420925153,
|
|
"norm_ci99_adjacent_overlap": 0.6828356351013829,
|
|
"norm_score_range": 0.7622920085492606,
|
|
"norm_intra_model_ci95": 0.5996755593310718,
|
|
"norm_earth_movers_distance": {
|
|
"pearson_r": 0.860124740637956,
|
|
"kendall_tau": 0.8758169934640521,
|
|
"anova_f": 0.7369479101001241,
|
|
"kw_stat": 0.9047547420925153,
|
|
"std_dev": 0.8205036894336079,
|
|
"ci99_overlap_magnitude_sum_norm": 0.6828356351013829,
|
|
"calibrated_score_range_norm": 0.7622920085492606,
|
|
"kendall_tau_bootstrapped": 0.8631715686274509
|
|
}
|
|
}
|
|
} |