Files
Judgemark-v2lp/results/stats/claude-3-5-haiku-20241022.json
T
2025-01-31 18:03:33 +11:00

1128 lines
56 KiB
JSON

{
"judge_model": "anthropic/claude-3.5-haiku-20241022",
"start_time": "2025-01-29T16:12:12.197649",
"status": "completed",
"samples_file": "data/judgemark_v2.1_samples.json",
"prompts_file": "data/judge_prompts.json",
"end_time": "2025-01-31T15:23:44.186814",
"raw_score_distribution": {
"count": 2039,
"min": 1.5,
"max": 10.0,
"mean": 5.676,
"median": 5.11,
"stdev": 2.099,
"p10": 3.18,
"p25": 3.89,
"p75": 7.86,
"p90": 8.64
},
"calibration_config": {
"method": "piecewise_landmark",
"in_landmarks": [
1.5,
3.89,
5.11,
7.86,
10.0
],
"out_landmarks": [
0,
3,
5,
7,
10
]
},
"calibrated_score_distribution": {
"count": 2039,
"min": 0.0,
"max": 10.0,
"mean": 5.009,
"median": 5.0,
"stdev": 2.251,
"p10": 2.109,
"p25": 3.0,
"p75": 7.0,
"p90": 8.093
},
"raw_model_stats": {
"claude-3-5-sonnet-20240620": {
"count": 120,
"mean": 7.253916666666667,
"median": 7.675,
"stdev": 1.3887956791318694,
"ci95": 0.24848707560056899,
"min": 3.68,
"max": 9.04,
"length_correlation": -0.07048202023783438
},
"claude-3-haiku-20240307": {
"count": 120,
"mean": 5.440166666666666,
"median": 4.96,
"stdev": 1.4781023396945978,
"ci95": 0.264466064625619,
"min": 2.46,
"max": 8.68,
"length_correlation": -0.02178822316069151
},
"claude-3-opus-20240229": {
"count": 120,
"mean": 6.898833333333333,
"median": 7.23,
"stdev": 1.5481782101257344,
"ci95": 0.27700422871645364,
"min": 2.75,
"max": 8.89,
"length_correlation": -0.06888953779607383
},
"gemini-1.5-pro-001": {
"count": 120,
"mean": 7.432833333333333,
"median": 7.715,
"stdev": 1.1108208663861512,
"ci95": 0.19875107098326153,
"min": 4.18,
"max": 8.86,
"length_correlation": -0.1713646455193349
},
"Llama-3-70b-chat-hf": {
"count": 120,
"mean": 5.424833333333333,
"median": 5.0,
"stdev": 1.4499527568851613,
"ci95": 0.25942946520585025,
"min": 3.57,
"max": 8.96,
"length_correlation": -0.22274115681728002
},
"Mixtral-8x7B-Instruct-v0.1": {
"count": 120,
"mean": 4.427833333333333,
"median": 4.075,
"stdev": 1.4908506304257636,
"ci95": 0.26674702325062954,
"min": 1.5,
"max": 8.21,
"length_correlation": -0.4697149143257062
},
"Llama-2-13b-chat-hf": {
"count": 120,
"mean": 3.739916666666667,
"median": 3.71,
"stdev": 0.8321537267416231,
"ci95": 0.1488911933664698,
"min": 2.03,
"max": 6.54,
"length_correlation": 0.1825863419720324
},
"gemma-7b-it": {
"count": 120,
"mean": 3.878416666666667,
"median": 3.82,
"stdev": 0.9823655174139214,
"ci95": 0.175767492843594,
"min": 1.79,
"max": 7.96,
"length_correlation": -0.027797494317579826
},
"gemma-2b-it": {
"count": 120,
"mean": 3.30925,
"median": 3.23,
"stdev": 0.7546941753580239,
"ci95": 0.13503192112804072,
"min": 1.5,
"max": 6.18,
"length_correlation": 0.02366957388804071
},
"Mixtral-8x22B-Instruct-v0.1": {
"count": 120,
"mean": 4.696416666666667,
"median": 4.39,
"stdev": 1.3728721018810344,
"ci95": 0.2456379861314592,
"min": 2.21,
"max": 9.04,
"length_correlation": -0.33707687962616484
},
"c4ai-command-r-08-2024": {
"count": 119,
"mean": 4.840840336134454,
"median": 4.39,
"stdev": 1.41229623088888,
"ci95": 0.253751367114031,
"min": 2.61,
"max": 8.21,
"length_correlation": -0.08569901001243549
},
"gemini-1.5-pro-002": {
"count": 120,
"mean": 7.792333333333334,
"median": 8.055,
"stdev": 0.9199595655467127,
"ci95": 0.16460165130724508,
"min": 4.43,
"max": 9.18,
"length_correlation": -0.28458580217358836
},
"Mistral-Large-Instruct-2411": {
"count": 120,
"mean": 6.125083333333333,
"median": 6.035,
"stdev": 1.8287889163488016,
"ci95": 0.3272118545172245,
"min": 2.57,
"max": 9.18,
"length_correlation": -0.10665074330370435
},
"gpt-4o-2024-11-20": {
"count": 120,
"mean": 8.422583333333334,
"median": 8.61,
"stdev": 0.6892542503087141,
"ci95": 0.12332323291179563,
"min": 4.96,
"max": 9.36,
"length_correlation": 0.08440518520645134
},
"DeepSeek-R1": {
"count": 120,
"mean": 8.756583333333333,
"median": 8.805,
"stdev": 0.49261861743228547,
"ci95": 0.08814065414479233,
"min": 5.68,
"max": 10.0,
"length_correlation": 0.25176480678723895
},
"gpt-3.5-turbo-0125": {
"count": 120,
"mean": 3.8186666666666667,
"median": 3.82,
"stdev": 0.6507577139311499,
"ci95": 0.11643532860092433,
"min": 1.96,
"max": 5.18,
"length_correlation": -0.11467115456346932
},
"databricks/dbrx-instruct": {
"count": 120,
"mean": 4.220166666666667,
"median": 4.14,
"stdev": 1.120883492572519,
"ci95": 0.20055150325093338,
"min": 2.07,
"max": 7.93,
"length_correlation": -0.4142646956229799
}
},
"calibrated_model_stats": {
"claude-3-5-sonnet-20240620": {
"count": 120,
"mean": 6.68195910819885,
"median": 6.865454545454545,
"stdev": 1.3244819635856115,
"ci95": 0.23697989183176132,
"min": 2.7364016736401675,
"max": 8.654205607476634,
"length_correlation": -0.06877353946250991
},
"claude-3-haiku-20240307": {
"count": 120,
"mean": 4.898806551514402,
"median": 4.754098360655737,
"stdev": 1.5234359912159727,
"ci95": 0.27257728405271525,
"min": 1.2050209205020919,
"max": 8.149532710280374,
"length_correlation": -0.06988045061840498
},
"claude-3-opus-20240229": {
"count": 120,
"mean": 6.31789005779875,
"median": 6.541818181818182,
"stdev": 1.5182047148030473,
"ci95": 0.2716412899413866,
"min": 1.5690376569037656,
"max": 8.44392523364486,
"length_correlation": -0.07188058734630212
},
"gemini-1.5-pro-001": {
"count": 120,
"mean": 6.827328927392509,
"median": 6.8945454545454545,
"stdev": 1.0119712966020384,
"ci95": 0.18106463885425125,
"min": 3.475409836065573,
"max": 8.401869158878505,
"length_correlation": -0.18842637322511324
},
"Llama-3-70b-chat-hf": {
"count": 120,
"mean": 4.894965193794008,
"median": 4.819672131147541,
"stdev": 1.4920047824219695,
"ci95": 0.26695352724444604,
"min": 2.598326359832636,
"max": 8.542056074766355,
"length_correlation": -0.24428407226734714
},
"Mixtral-8x7B-Instruct-v0.1": {
"count": 120,
"mean": 3.6586979153256927,
"median": 3.30327868852459,
"stdev": 1.6840620535618311,
"ci95": 0.30131693315826746,
"min": 0.0,
"max": 7.490654205607477,
"length_correlation": -0.4776298070348816
},
"Llama-2-13b-chat-hf": {
"count": 120,
"mean": 2.8951801158578028,
"median": 2.7740585774058575,
"stdev": 1.1469456418596848,
"ci95": 0.205214613424405,
"min": 0.6652719665271964,
"max": 6.04,
"length_correlation": 0.16995799101796538
},
"gemma-7b-it": {
"count": 120,
"mean": 3.050149211896419,
"median": 2.9121338912133887,
"stdev": 1.2561522388629545,
"ci95": 0.2247541528493802,
"min": 0.36401673640167365,
"max": 7.14018691588785,
"length_correlation": -0.04024365106152926
},
"gemma-2b-it": {
"count": 120,
"mean": 2.3021446954627556,
"median": 2.1715481171548117,
"stdev": 1.0064325795222053,
"ci95": 0.18007363662805842,
"min": 0.0,
"max": 5.778181818181817,
"length_correlation": 0.021897153751634545
},
"Mixtral-8x22B-Instruct-v0.1": {
"count": 120,
"mean": 4.054006244690861,
"median": 3.81967213114754,
"stdev": 1.5563545288638563,
"ci95": 0.27846715775845937,
"min": 0.8912133891213387,
"max": 8.654205607476634,
"length_correlation": -0.3196801333233657
},
"c4ai-command-r-08-2024": {
"count": 119,
"mean": 4.18075127921145,
"median": 3.81967213114754,
"stdev": 1.5255683090409418,
"ci95": 0.27410329049829346,
"min": 1.3933054393305437,
"max": 7.490654205607477,
"length_correlation": -0.059730251207000255
},
"gemini-1.5-pro-002": {
"count": 120,
"mean": 7.161947242456741,
"median": 7.273364485981308,
"stdev": 0.8908156973553131,
"ci95": 0.1593871516602587,
"min": 3.8852459016393435,
"max": 8.850467289719626,
"length_correlation": -0.3055069842706573
},
"Mistral-Large-Instruct-2411": {
"count": 120,
"mean": 5.559547746582264,
"median": 5.672727272727272,
"stdev": 1.8930723704948602,
"ci95": 0.3387136238345387,
"min": 1.343096234309623,
"max": 8.850467289719626,
"length_correlation": -0.09907920981363108
},
"gpt-4o-2024-11-20": {
"count": 120,
"mean": 7.8657813209465575,
"median": 8.051401869158877,
"stdev": 0.7743872359970316,
"ci95": 0.13855545675055259,
"min": 4.754098360655737,
"max": 9.102803738317757,
"length_correlation": 0.07999102858136686
},
"DeepSeek-R1": {
"count": 120,
"mean": 8.277186632681959,
"median": 8.324766355140186,
"stdev": 0.5947798174773566,
"ci95": 0.1064196121085102,
"min": 5.414545454545454,
"max": 10.0,
"length_correlation": 0.24509768511348035
},
"gpt-3.5-turbo-0125": {
"count": 120,
"mean": 2.993805249144161,
"median": 2.9121338912133887,
"stdev": 0.9205211355431522,
"ci95": 0.16470212892843664,
"min": 0.5774058577405857,
"max": 5.050909090909091,
"length_correlation": -0.11256420528408954
},
"databricks/dbrx-instruct": {
"count": 120,
"mean": 3.524101240725026,
"median": 3.40983606557377,
"stdev": 1.4333947798904427,
"ci95": 0.2564668672203416,
"min": 0.715481171548117,
"max": 7.0981308411214945,
"length_correlation": -0.4174065826724629
}
},
"raw_cross_model_stats": {
"anova_f": 258.83548733876137,
"anova_p": 0.0,
"kw_stat": 1336.5069278575731,
"kw_p": 7.209952841084234e-275,
"std_dev_across_models": 1.7196086544808689,
"pearson_r": 0.9668283640707734,
"kendall_tau": 0.9235294117647058,
"normalized_components": {
"pearson_r": 0.8894278802359115,
"kendall_tau": 0.9150326797385621,
"anova_f": 0.7395299638250324,
"kw_stat": 0.8910046185717154,
"std_dev": 0.7816402974913039,
"ci99_overlap_magnitude_sum_norm": 0.6963648765862853,
"raw_score_range_norm": 0.6809166666666666,
"kendall_tau_bootstrapped": 0.9039705882352941
}
},
"calibrated_cross_model_stats": {
"anova_f": 247.3466828338453,
"anova_p": 0.0,
"kw_stat": 1336.5069278575731,
"kw_p": 7.209952841084234e-275,
"std_dev_across_models": 1.8301981732431505,
"pearson_r": 0.9707622427834304,
"kendall_tau": 0.9235294117647058,
"normalized_components": {
"pearson_r": 0.9025408092781012,
"kendall_tau": 0.9150326797385621,
"anova_f": 0.7067048080967009,
"kw_stat": 0.8910046185717154,
"std_dev": 0.8319082605650683,
"ci99_overlap_magnitude_sum_norm": 0.6647865930728065,
"calibrated_score_range_norm": 0.7468802421524003,
"kendall_tau_bootstrapped": 0.898377450980392
}
},
"separability_metrics": {
"raw": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": true,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": true,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gemma-7b-it": true,
"gemma-7b-it__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-2b-it": false
},
"adjacent_overlap_fraction": 0.8125,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.08285832417554673,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.0,
"gemini-1.5-pro-002__gemini-1.5-pro-001": 0.35677647163276927,
"gemini-1.5-pro-001__claude-3-5-sonnet-20240620": 0.702722968768688,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.6808165904574217,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.4173399180118764,
"Mistral-Large-Instruct-2411__claude-3-haiku-20240307": 0.4814567943468333,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 1.0174208435071073,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 0.42763931854793036,
"c4ai-command-r-08-2024__Mixtral-8x22B-Instruct-v0.1": 0.840021532039918,
"Mixtral-8x22B-Instruct-v0.1__Mixtral-8x7B-Instruct-v0.1": 0.7414801782984757,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.7135179543037022,
"databricks/dbrx-instruct__gemma-7b-it": 0.40008686042529673,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.4590574459876633,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.4442876252237191,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.12903038302963177
},
"ci99_overlap_magnitude_sum": 7.89451320875658,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.3042931017548996,
"emd": {
"average": 2.0820636842972484,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 1.81375,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.3550833333333334,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.25708333333333333,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.8290833333333334,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 2.826083333333334,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.514,
"claude-3-5-sonnet-20240620__gemma-7b-it": 3.3755,
"claude-3-5-sonnet-20240620__gemma-2b-it": 3.9446666666666665,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.5575,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 2.413076330532213,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.5452499999999999,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 1.1316666666666666,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 1.1686666666666667,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.5026666666666666,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.43525,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.03375,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.4586666666666663,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 1.9926666666666668,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.15450000000000008,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.0123333333333333,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.70025,
"claude-3-haiku-20240307__gemma-7b-it": 1.5617500000000004,
"claude-3-haiku-20240307__gemma-2b-it": 2.130916666666667,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.7555833333333334,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.6048067226890756,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.3521666666666663,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.7394166666666667,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.9824166666666665,
"claude-3-haiku-20240307__DeepSeek-R1": 3.3164166666666666,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.6215000000000004,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.22,
"claude-3-opus-20240229__gemini-1.5-pro-001": 0.5393333333333333,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 1.4953333333333334,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.471,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 3.158916666666666,
"claude-3-opus-20240229__gemma-7b-it": 3.020416666666666,
"claude-3-opus-20240229__gemma-2b-it": 3.589583333333333,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 2.2049166666666666,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 2.0579929971988795,
"claude-3-opus-20240229__gemini-1.5-pro-002": 0.8935,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.7860833333333332,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.52375,
"claude-3-opus-20240229__DeepSeek-R1": 1.8577500000000002,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 3.0801666666666656,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.6786666666666665,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 2.009666666666667,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.005,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.6929166666666666,
"gemini-1.5-pro-001__gemma-7b-it": 3.5544166666666666,
"gemini-1.5-pro-001__gemma-2b-it": 4.123583333333333,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.739416666666667,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.5919929971988793,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.3594999999999999,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 1.3207499999999999,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.9897499999999999,
"gemini-1.5-pro-001__DeepSeek-R1": 1.32375,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.614166666666667,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.2126666666666663,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 0.9970000000000001,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.6849166666666668,
"Llama-3-70b-chat-hf__gemma-7b-it": 1.5464166666666668,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.1155833333333334,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.72975,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 0.5839929971988795,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 2.3674999999999997,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.8372500000000003,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.99775,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.3317499999999995,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.6061666666666667,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.2046666666666668,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7039166666666665,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 0.56025,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.118583333333333,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.3227499999999999,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.41362885154061624,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.3645,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 1.6972500000000004,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 3.99475,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.32875,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.6368333333333331,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.3318333333333333,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.14666666666666672,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.4306666666666666,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.9565000000000001,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 1.1009236694677873,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 4.052416666666666,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 2.385166666666667,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.682666666666667,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.016666666666667,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.19974999999999998,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.4959166666666668,
"gemma-7b-it__gemma-2b-it": 0.5691666666666667,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 0.8180000000000001,
"gemma-7b-it__c4ai-command-r-08-2024": 0.9624236694677872,
"gemma-7b-it__gemini-1.5-pro-002": 3.9139166666666663,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.246666666666667,
"gemma-7b-it__gpt-4o-2024-11-20": 4.5441666666666665,
"gemma-7b-it__DeepSeek-R1": 4.878166666666666,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.21375000000000005,
"gemma-7b-it__databricks/dbrx-instruct": 0.3650833333333333,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 1.3871666666666667,
"gemma-2b-it__c4ai-command-r-08-2024": 1.5315903361344538,
"gemma-2b-it__gemini-1.5-pro-002": 4.483083333333333,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.815833333333333,
"gemma-2b-it__gpt-4o-2024-11-20": 5.113333333333332,
"gemma-2b-it__DeepSeek-R1": 5.447333333333333,
"gemma-2b-it__gpt-3.5-turbo-0125": 0.53425,
"gemma-2b-it__databricks/dbrx-instruct": 0.9109166666666667,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.18884243697478992,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 3.0959166666666667,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 1.4328333333333332,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.726166666666667,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 4.060166666666667,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.87775,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.4762499999999999,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 2.9514929971988795,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 1.2953970588235295,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.58174299719888,
"c4ai-command-r-08-2024__DeepSeek-R1": 3.9157429971988797,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 1.0221736694677872,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.620673669467787,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 1.6684166666666664,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.6302500000000001,
"gemini-1.5-pro-002__DeepSeek-R1": 0.9642500000000003,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.973666666666666,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.572166666666666,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.2975,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 2.6315000000000004,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 2.3064166666666672,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.904916666666667,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.3340000000000001,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.603916666666667,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.202416666666666,
"DeepSeek-R1__gpt-3.5-turbo-0125": 4.937916666666666,
"DeepSeek-R1__databricks/dbrx-instruct": 4.536416666666666,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.4723333333333334
}
},
"average_ci95": 0.20554288904111134,
"modulated_ci95": 0.6198884378310157
},
"calibrated": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": false,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": false,
"gemini-1.5-pro-002__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": true,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": true,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": false,
"c4ai-command-r-08-2024__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__gemma-7b-it": true,
"gemma-7b-it__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gemma-2b-it": false
},
"adjacent_overlap_fraction": 0.75,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.07151353597333454,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.0,
"gemini-1.5-pro-002__gemini-1.5-pro-001": 0.33651359701712025,
"gemini-1.5-pro-001__claude-3-5-sonnet-20240620": 0.6787205014665805,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.6385748356332774,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.4448490707751649,
"Mistral-Large-Instruct-2411__claude-3-haiku-20240307": 0.5442953101839239,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 1.0524898747377307,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 0.3523702737031549,
"c4ai-command-r-08-2024__Mixtral-8x22B-Instruct-v0.1": 0.9625359498377488,
"Mixtral-8x22B-Instruct-v0.1__Mixtral-8x7B-Instruct-v0.1": 0.747618852610417,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.9649613800371584,
"databricks/dbrx-instruct__gemma-7b-it": 0.4746779749820682,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.6493539337511569,
"gpt-3.5-turbo-0125__Llama-2-13b-chat-hf": 0.6305909775497112,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.16648251184848162
},
"ci99_overlap_magnitude_sum": 8.71554858010703,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.30947149123068435,
"emd": {
"average": 2.2294359739608978,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 1.7831525566844486,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.3640690504001004,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.253712498321385,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 1.7869939144048417,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 3.023261192873157,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 3.7867789923410475,
"claude-3-5-sonnet-20240620__gemma-7b-it": 3.631809896302432,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.379814412736095,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 2.627952863507989,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 2.5012078289873996,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.4895675735102275,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 1.126383324233409,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 1.1838222127477074,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.5952275244831093,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.6881538590546894,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 3.1578578674738234,
"claude-3-haiku-20240307__claude-3-opus-20240229": 1.4190835062843483,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 1.9285223758781078,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.17895844357121177,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 1.2401086361887086,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 2.003626435656599,
"claude-3-haiku-20240307__gemma-7b-it": 1.8486573396179824,
"claude-3-haiku-20240307__gemma-2b-it": 2.5966618560516457,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.8613890918702699,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.7249344256379251,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 2.2631406909423397,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.7291512369088664,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 2.9669747694321558,
"claude-3-haiku-20240307__DeepSeek-R1": 3.378380081167558,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.9050013023702403,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.3747053107893752,
"claude-3-opus-20240229__gemini-1.5-pro-001": 0.5169155051077782,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 1.4498741855548136,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 2.6591921424730565,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 3.422709941940947,
"claude-3-opus-20240229__gemma-7b-it": 3.2677408459023307,
"claude-3-opus-20240229__gemma-2b-it": 4.015745362335995,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 2.2673884860050855,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 2.1371387785872993,
"claude-3-opus-20240229__gemini-1.5-pro-002": 0.8440571846579914,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.7756320308426543,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 1.5478912631478077,
"claude-3-opus-20240229__DeepSeek-R1": 1.9592965748832099,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 3.3240848086545887,
"claude-3-opus-20240229__databricks/dbrx-instruct": 2.7937888170737235,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.9347001821966314,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 3.1686310120668164,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 3.9321488115347067,
"gemini-1.5-pro-001__gemma-7b-it": 3.77717971549609,
"gemini-1.5-pro-001__gemma-2b-it": 4.525184231929754,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 2.7775282901782843,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 2.646577648181059,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.3346183150642319,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 1.286005479875666,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 1.0384523935540482,
"gemini-1.5-pro-001__DeepSeek-R1": 1.4498577052894503,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.8335236782483486,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 3.303227686667483,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.2362672784683157,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.9997850779362063,
"Llama-3-70b-chat-hf__gemma-7b-it": 1.8448159818975898,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.592820498331253,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.8428281079816526,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 0.7142139145825581,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 2.266982048662732,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.8368051560760894,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 2.970816127152549,
"Llama-3-70b-chat-hf__DeepSeek-R1": 3.382221438887951,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.9011599446498475,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.3708639530689823,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7836014814762582,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 0.6221470297891065,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.3565532198629375,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.43470226875910717,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.522505617514482,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 3.5032493271310488,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 1.9008498312565714,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 4.207083405620865,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 4.618488717356267,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.699620699654335,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.32220785049293155,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.16586033229167418,
"Llama-2-13b-chat-hf__gemma-2b-it": 0.5930354203950472,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.1588261288330577,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 1.2855711633536477,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 4.266767126598938,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 2.6643676307244615,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 4.970601205088755,
"Llama-2-13b-chat-hf__DeepSeek-R1": 5.382006516824156,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.25838266331606063,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.6485863968337509,
"gemma-7b-it__gemma-2b-it": 0.7480045164336633,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.0038570327944416,
"gemma-7b-it__c4ai-command-r-08-2024": 1.1306020673150317,
"gemma-7b-it__gemini-1.5-pro-002": 4.111798030560323,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.509398534685845,
"gemma-7b-it__gpt-4o-2024-11-20": 4.815632109050139,
"gemma-7b-it__DeepSeek-R1": 5.227037420785541,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.2513138966754807,
"gemma-7b-it__databricks/dbrx-instruct": 0.5033140512741554,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 1.751861549228105,
"gemma-2b-it__c4ai-command-r-08-2024": 1.8786065837486947,
"gemma-2b-it__gemini-1.5-pro-002": 4.859802546993985,
"gemma-2b-it__Mistral-Large-Instruct-2411": 3.2574030511195082,
"gemma-2b-it__gpt-4o-2024-11-20": 5.563636625483802,
"gemma-2b-it__DeepSeek-R1": 5.975041937219203,
"gemma-2b-it__gpt-3.5-turbo-0125": 0.7130654220370936,
"gemma-2b-it__databricks/dbrx-instruct": 1.221956545262271,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.19049162964039085,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 3.1079409977658807,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 1.510771627414416,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 3.811775076255697,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 4.223180387991099,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.0602009955466993,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.5299050039658342,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 2.9811959632452907,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 1.392797381544014,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 3.685030041735107,
"c4ai-command-r-08-2024__DeepSeek-R1": 4.096435353470509,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 1.1869460300672894,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.6566500384864242,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 1.6040350098931688,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.7038340784898163,
"gemini-1.5-pro-002__DeepSeek-R1": 1.1152393902252182,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 4.168141993312579,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 3.6378460017317154,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 2.3062335743642937,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 2.7176388860996954,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 2.565742497438103,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 2.0354465058572377,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.41140531173540207,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 4.871976071802397,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 4.341680080221532,
"DeepSeek-R1__gpt-3.5-turbo-0125": 5.283381383537798,
"DeepSeek-R1__databricks/dbrx-instruct": 4.753085391956933,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.6192081254720785
}
},
"average_ci95": 0.2269053680437684,
"modulated_ci95": 0.5753452369754639
}
},
"calibrated_score_range": 5.975041937219203,
"final_judgemark_score": 0.776377682322901,
"iteration_stability": {
"raw": {
"scoring_stability": {
"claude-3-5-sonnet-20240620": {
"mean_iter_score": 7.253916666666667,
"iteration_count": 5,
"stdev_across_iters": 0.24714123829637719
},
"claude-3-haiku-20240307": {
"mean_iter_score": 5.440166666666666,
"iteration_count": 5,
"stdev_across_iters": 0.11475172475101787
},
"claude-3-opus-20240229": {
"mean_iter_score": 6.898833333333333,
"iteration_count": 5,
"stdev_across_iters": 0.21689209426706993
},
"gemini-1.5-pro-001": {
"mean_iter_score": 7.432833333333333,
"iteration_count": 5,
"stdev_across_iters": 0.17680053890063657
},
"Llama-3-70b-chat-hf": {
"mean_iter_score": 5.424833333333333,
"iteration_count": 5,
"stdev_across_iters": 0.12028224908291511
},
"Mixtral-8x7B-Instruct-v0.1": {
"mean_iter_score": 4.427833333333333,
"iteration_count": 5,
"stdev_across_iters": 0.12307529312489068
},
"Llama-2-13b-chat-hf": {
"mean_iter_score": 3.739916666666667,
"iteration_count": 5,
"stdev_across_iters": 0.12002962597255368
},
"gemma-7b-it": {
"mean_iter_score": 3.878416666666667,
"iteration_count": 5,
"stdev_across_iters": 0.047891225129917475
},
"gemma-2b-it": {
"mean_iter_score": 3.30925,
"iteration_count": 5,
"stdev_across_iters": 0.11814550821385929
},
"Mixtral-8x22B-Instruct-v0.1": {
"mean_iter_score": 4.696416666666667,
"iteration_count": 5,
"stdev_across_iters": 0.17483277724728843
},
"c4ai-command-r-08-2024": {
"mean_iter_score": 4.84384420289855,
"iteration_count": 5,
"stdev_across_iters": 0.19733102718278958
},
"gemini-1.5-pro-002": {
"mean_iter_score": 7.792333333333334,
"iteration_count": 5,
"stdev_across_iters": 0.07233534943426892
},
"Mistral-Large-Instruct-2411": {
"mean_iter_score": 6.125083333333333,
"iteration_count": 5,
"stdev_across_iters": 0.08460258401622385
},
"gpt-4o-2024-11-20": {
"mean_iter_score": 8.422583333333334,
"iteration_count": 5,
"stdev_across_iters": 0.04998986008293242
},
"DeepSeek-R1": {
"mean_iter_score": 8.756583333333333,
"iteration_count": 5,
"stdev_across_iters": 0.04152425663258638
},
"gpt-3.5-turbo-0125": {
"mean_iter_score": 3.8186666666666667,
"iteration_count": 5,
"stdev_across_iters": 0.0938310390483282
},
"databricks/dbrx-instruct": {
"mean_iter_score": 4.220166666666667,
"iteration_count": 5,
"stdev_across_iters": 0.15619934734534316
}
},
"ranking_stability": {
"pairwise_correlation": {
"1__vs__2": {
"common_model_count": 17,
"kendall_tau": 0.9411764705882352,
"p_value": 2.628150241362193e-11
},
"1__vs__3": {
"common_model_count": 17,
"kendall_tau": 0.9558823529411764,
"p_value": 5.347391697765181e-12
},
"1__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.9411764705882352,
"p_value": 2.628150241362193e-11
},
"1__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.9411764705882352,
"p_value": 2.628150241362193e-11
},
"2__vs__3": {
"common_model_count": 17,
"kendall_tau": 0.9852941176470588,
"p_value": 9.55895466477477e-14
},
"2__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.9411764705882352,
"p_value": 2.628150241362193e-11
},
"2__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.9411764705882352,
"p_value": 2.628150241362193e-11
},
"3__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.9558823529411764,
"p_value": 5.347391697765181e-12
},
"3__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.9558823529411764,
"p_value": 5.347391697765181e-12
},
"4__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.9411764705882352,
"p_value": 2.628150241362193e-11
}
},
"average_kendall_tau": 0.95
},
"randomized_average_kendall_tau_by_item": 0.9423823529411764
},
"calibrated": {
"scoring_stability": {
"claude-3-5-sonnet-20240620": {
"mean_iter_score": 6.68195910819885,
"iteration_count": 5,
"stdev_across_iters": 0.22518462032271805
},
"claude-3-haiku-20240307": {
"mean_iter_score": 4.898806551514402,
"iteration_count": 5,
"stdev_across_iters": 0.11700203532787572
},
"claude-3-opus-20240229": {
"mean_iter_score": 6.31789005779875,
"iteration_count": 5,
"stdev_across_iters": 0.1920368687492576
},
"gemini-1.5-pro-001": {
"mean_iter_score": 6.827328927392509,
"iteration_count": 5,
"stdev_across_iters": 0.156247500769858
},
"Llama-3-70b-chat-hf": {
"mean_iter_score": 4.894965193794008,
"iteration_count": 5,
"stdev_across_iters": 0.16960401738835276
},
"Mixtral-8x7B-Instruct-v0.1": {
"mean_iter_score": 3.6586979153256927,
"iteration_count": 5,
"stdev_across_iters": 0.15164573520716812
},
"Llama-2-13b-chat-hf": {
"mean_iter_score": 2.8951801158578028,
"iteration_count": 5,
"stdev_across_iters": 0.15352809590880268
},
"gemma-7b-it": {
"mean_iter_score": 3.050149211896419,
"iteration_count": 5,
"stdev_across_iters": 0.03395333899002741
},
"gemma-2b-it": {
"mean_iter_score": 2.3021446954627556,
"iteration_count": 5,
"stdev_across_iters": 0.15050784987628207
},
"Mixtral-8x22B-Instruct-v0.1": {
"mean_iter_score": 4.054006244690861,
"iteration_count": 5,
"stdev_across_iters": 0.22526730176797122
},
"c4ai-command-r-08-2024": {
"mean_iter_score": 4.184028792323408,
"iteration_count": 5,
"stdev_across_iters": 0.2155117657268275
},
"gemini-1.5-pro-002": {
"mean_iter_score": 7.161947242456741,
"iteration_count": 5,
"stdev_across_iters": 0.07324124461049421
},
"Mistral-Large-Instruct-2411": {
"mean_iter_score": 5.559547746582264,
"iteration_count": 5,
"stdev_across_iters": 0.08149217123271334
},
"gpt-4o-2024-11-20": {
"mean_iter_score": 7.8657813209465575,
"iteration_count": 5,
"stdev_across_iters": 0.048256567477833454
},
"DeepSeek-R1": {
"mean_iter_score": 8.277186632681959,
"iteration_count": 5,
"stdev_across_iters": 0.05267338070603484
},
"gpt-3.5-turbo-0125": {
"mean_iter_score": 2.993805249144161,
"iteration_count": 5,
"stdev_across_iters": 0.13530144304709857
},
"databricks/dbrx-instruct": {
"mean_iter_score": 3.524101240725026,
"iteration_count": 5,
"stdev_across_iters": 0.16625644613106771
}
},
"ranking_stability": {
"pairwise_correlation": {
"1__vs__2": {
"common_model_count": 17,
"kendall_tau": 0.9558823529411764,
"p_value": 5.347391697765181e-12
},
"1__vs__3": {
"common_model_count": 17,
"kendall_tau": 0.9558823529411764,
"p_value": 5.347391697765181e-12
},
"1__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.926470588235294,
"p_value": 1.080161877119549e-10
},
"1__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.9411764705882352,
"p_value": 2.628150241362193e-11
},
"2__vs__3": {
"common_model_count": 17,
"kendall_tau": 0.9999999999999999,
"p_value": 5.622914508691041e-15
},
"2__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.9411764705882352,
"p_value": 2.628150241362193e-11
},
"2__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.9558823529411764,
"p_value": 5.347391697765181e-12
},
"3__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.9411764705882352,
"p_value": 2.628150241362193e-11
},
"3__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.9558823529411764,
"p_value": 5.347391697765181e-12
},
"4__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.926470588235294,
"p_value": 1.080161877119549e-10
}
},
"average_kendall_tau": 0.95
},
"randomized_average_kendall_tau_by_item": 0.9390264705882352
}
},
"raw_score_range": 5.447333333333333,
"final_judgemark_score_raw": 0.7687595244646759,
"final_judgemark_score_elements_raw": {
"norm_stability_between_iterations": 0.9039705882352941,
"norm_correlation_with_lmsys_arena": 0.9150326797385621,
"norm_std_dev_between_models": 0.7816402974913039,
"norm_kruskall_wallis": 0.8910046185717154,
"norm_ci99_adjacent_overlap": 0.6963648765862853,
"norm_score_range": 0.6809166666666666,
"norm_intra_model_ci95": 0.6198884378310157,
"norm_earth_movers_distance": 0.5205159210743121
},
"final_judgemark_score_elements_calibrated": {
"norm_stability_between_iterations": 0.898377450980392,
"norm_correlation_with_lmsys_arena": 0.9150326797385621,
"norm_std_dev_between_models": 0.8319082605650683,
"norm_kruskall_wallis": 0.8910046185717154,
"norm_ci99_adjacent_overlap": 0.6647865930728065,
"norm_score_range": 0.7468802421524003,
"norm_intra_model_ci95": 0.5753452369754639,
"norm_earth_movers_distance": {
"pearson_r": 0.9025408092781012,
"kendall_tau": 0.9150326797385621,
"anova_f": 0.7067048080967009,
"kw_stat": 0.8910046185717154,
"std_dev": 0.8319082605650683,
"ci99_overlap_magnitude_sum_norm": 0.6647865930728065,
"calibrated_score_range_norm": 0.7468802421524003,
"kendall_tau_bootstrapped": 0.898377450980392
}
}
}