Files
Judgemark-v2lp/results/stats/mistralai__mistral-small-24b-instruct-2501.json
T
2025-01-31 18:03:33 +11:00

1128 lines
56 KiB
JSON

{
"judge_model": "mistralai/mistral-small-24b-instruct-2501",
"start_time": "2025-01-31T06:38:32.571438",
"status": "completed",
"samples_file": "data/judgemark_v2.1_samples.json",
"prompts_file": "data/judge_prompts.json",
"end_time": "2025-01-31T15:26:37.858165",
"raw_score_distribution": {
"count": 2040,
"min": 2.14,
"max": 9.25,
"mean": 6.44,
"median": 6.86,
"stdev": 1.402,
"p10": 4.317,
"p25": 5.353,
"p75": 7.54,
"p90": 7.93
},
"calibration_config": {
"method": "piecewise_landmark",
"in_landmarks": [
2.14,
5.3525,
6.86,
7.54,
9.25
],
"out_landmarks": [
0,
3,
5,
7,
10
]
},
"calibrated_score_distribution": {
"count": 2040,
"min": 0.0,
"max": 10.0,
"mean": 4.992,
"median": 5.0,
"stdev": 2.171,
"p10": 2.033,
"p25": 3.002,
"p75": 7.0,
"p90": 7.684
},
"raw_model_stats": {
"claude-3-5-sonnet-20240620": {
"count": 120,
"mean": 7.24575,
"median": 7.41,
"stdev": 0.7738853871452638,
"ci95": 0.1384656645979393,
"min": 5.11,
"max": 9.04,
"length_correlation": -0.002475742493860448
},
"claude-3-haiku-20240307": {
"count": 120,
"mean": 6.609083333333333,
"median": 6.855,
"stdev": 1.1022307372048812,
"ci95": 0.19721410185860297,
"min": 3.71,
"max": 8.43,
"length_correlation": -0.05299933662378288
},
"claude-3-opus-20240229": {
"count": 120,
"mean": 7.157416666666666,
"median": 7.32,
"stdev": 0.8958864872511559,
"ci95": 0.16029443109029734,
"min": 4.54,
"max": 8.79,
"length_correlation": 0.04342860761188247
},
"gemini-1.5-pro-001": {
"count": 120,
"mean": 7.319916666666667,
"median": 7.32,
"stdev": 0.6277189541778818,
"ci95": 0.11231317145241401,
"min": 4.79,
"max": 8.61,
"length_correlation": 0.015567052515450219
},
"Llama-3-70b-chat-hf": {
"count": 120,
"mean": 6.7545,
"median": 7.0,
"stdev": 0.966008725189818,
"ci95": 0.17284089138086894,
"min": 4.57,
"max": 8.93,
"length_correlation": -0.12681174487591582
},
"Mixtral-8x7B-Instruct-v0.1": {
"count": 120,
"mean": 6.002583333333333,
"median": 6.105,
"stdev": 1.1443456137695034,
"ci95": 0.20474940937292613,
"min": 3.54,
"max": 8.04,
"length_correlation": -0.14756519095029053
},
"Llama-2-13b-chat-hf": {
"count": 120,
"mean": 5.44975,
"median": 5.34,
"stdev": 1.3126485270102524,
"ci95": 0.2348626213843519,
"min": 3.0,
"max": 8.68,
"length_correlation": 0.10194398980020855
},
"gemma-7b-it": {
"count": 120,
"mean": 4.991416666666667,
"median": 4.73,
"stdev": 1.2268846218573854,
"ci95": 0.2195175117301787,
"min": 2.39,
"max": 8.07,
"length_correlation": -0.034396362974259784
},
"gemma-2b-it": {
"count": 120,
"mean": 4.405166666666667,
"median": 4.125,
"stdev": 1.1517644362136776,
"ci95": 0.20607680513117407,
"min": 2.18,
"max": 8.0,
"length_correlation": -0.014322592699298373
},
"Mixtral-8x22B-Instruct-v0.1": {
"count": 120,
"mean": 6.364333333333334,
"median": 6.66,
"stdev": 1.1948972272766814,
"ci95": 0.2137942406580731,
"min": 2.61,
"max": 8.39,
"length_correlation": -0.10731097065943129
},
"c4ai-command-r-08-2024": {
"count": 120,
"mean": 6.261166666666667,
"median": 6.64,
"stdev": 1.1901699499667775,
"ci95": 0.2129484234281223,
"min": 3.68,
"max": 8.14,
"length_correlation": 0.20934946488094278
},
"gemini-1.5-pro-002": {
"count": 120,
"mean": 7.417833333333333,
"median": 7.529999999999999,
"stdev": 0.6156422940098166,
"ci95": 0.1101523827825787,
"min": 4.96,
"max": 8.61,
"length_correlation": -0.12151998145182114
},
"Mistral-Large-Instruct-2411": {
"count": 120,
"mean": 6.96125,
"median": 7.09,
"stdev": 0.9717000284954979,
"ci95": 0.17385919474689632,
"min": 3.32,
"max": 8.82,
"length_correlation": 0.009690079063523201
},
"gpt-4o-2024-11-20": {
"count": 120,
"mean": 7.570666666666667,
"median": 7.68,
"stdev": 0.6572756777232903,
"ci95": 0.11760154029492369,
"min": 5.32,
"max": 8.86,
"length_correlation": 0.28105412927410267
},
"DeepSeek-R1": {
"count": 120,
"mean": 7.841583333333333,
"median": 7.86,
"stdev": 0.6271584521391989,
"ci95": 0.11221288491311002,
"min": 5.82,
"max": 9.25,
"length_correlation": 0.025649596842388763
},
"gpt-3.5-turbo-0125": {
"count": 120,
"mean": 5.1810833333333335,
"median": 5.055,
"stdev": 1.1602490284948526,
"ci95": 0.20759489130849443,
"min": 2.29,
"max": 8.46,
"length_correlation": -0.2663994108328025
},
"databricks/dbrx-instruct": {
"count": 120,
"mean": 5.94425,
"median": 6.07,
"stdev": 1.2512297564241928,
"ci95": 0.22387340899030678,
"min": 2.14,
"max": 8.0,
"length_correlation": -0.20653971830241444
}
},
"calibrated_model_stats": {
"claude-3-5-sonnet-20240620": {
"count": 120,
"mean": 6.24341110428404,
"median": 6.617647058823529,
"stdev": 1.513914202721647,
"ci95": 0.270873619926311,
"min": 2.7735408560311283,
"max": 9.63157894736842,
"length_correlation": -0.026463093191221542
},
"claude-3-haiku-20240307": {
"count": 120,
"mean": 5.119678976573259,
"median": 5.017583650375572,
"stdev": 1.814084891038744,
"ci95": 0.32458097057673235,
"min": 1.4661478599221789,
"max": 8.56140350877193,
"length_correlation": -0.07865140111152846
},
"claude-3-opus-20240229": {
"count": 120,
"mean": 6.091662461412351,
"median": 6.352941176470589,
"stdev": 1.6768172145070779,
"ci95": 0.3000206669781784,
"min": 2.2412451361867705,
"max": 9.19298245614035,
"length_correlation": 0.025930644332760197
},
"gemini-1.5-pro-001": {
"count": 120,
"mean": 6.328679541671164,
"median": 6.352941176470589,
"stdev": 1.3116010422863964,
"ci95": 0.23467520258713187,
"min": 2.4747081712062258,
"max": 8.87719298245614,
"length_correlation": 6.980625051840469e-05
},
"Llama-3-70b-chat-hf": {
"count": 120,
"mean": 5.321506521636969,
"median": 5.411764705882352,
"stdev": 1.7097698147710712,
"ci95": 0.30591663525923807,
"min": 2.2692607003891054,
"max": 9.438596491228068,
"length_correlation": -0.17078683377698892
},
"Mixtral-8x7B-Instruct-v0.1": {
"count": 120,
"mean": 4.152091731414124,
"median": 3.9983416252072965,
"stdev": 1.6761450332029226,
"ci95": 0.29990039848292555,
"min": 1.3073929961089494,
"max": 7.877192982456139,
"length_correlation": -0.11736501907569356
},
"Llama-2-13b-chat-hf": {
"count": 120,
"mean": 3.4758824550941814,
"median": 2.989800027101845,
"stdev": 1.820297766680272,
"ci95": 0.3256925950744398,
"min": 0.8031128404669261,
"max": 9.0,
"length_correlation": 0.10778435780550603
},
"gemma-7b-it": {
"count": 120,
"mean": 2.8834912622787794,
"median": 2.4186770428015567,
"stdev": 1.582519013935895,
"ci95": 0.2831485781270838,
"min": 0.23346303501945526,
"max": 7.929824561403509,
"length_correlation": -0.014795831169776529
},
"gemma-2b-it": {
"count": 120,
"mean": 2.222853941516765,
"median": 1.8536964980544746,
"stdev": 1.352177893315551,
"ci95": 0.24193532241672216,
"min": 0.037354085603112874,
"max": 7.807017543859649,
"length_correlation": -0.00626006886845542
},
"Mixtral-8x22B-Instruct-v0.1": {
"count": 120,
"mean": 4.74222309343619,
"median": 4.734660033167495,
"stdev": 1.8597559926272333,
"ci95": 0.3327525674816667,
"min": 0.43891050583657565,
"max": 8.49122807017544,
"length_correlation": -0.11281384823163548
},
"c4ai-command-r-08-2024": {
"count": 120,
"mean": 4.561574143297052,
"median": 4.708126036484245,
"stdev": 1.809207009828139,
"ci95": 0.3237082068899185,
"min": 1.4381322957198444,
"max": 8.05263157894737,
"length_correlation": 0.1795645363776789
},
"gemini-1.5-pro-002": {
"count": 120,
"mean": 6.5732934468467645,
"median": 6.970588235294117,
"stdev": 1.2699040160768762,
"ci95": 0.22721465798742466,
"min": 2.6334630350194552,
"max": 8.87719298245614,
"length_correlation": -0.1257748777008509
},
"Mistral-Large-Instruct-2411": {
"count": 120,
"mean": 5.691325221179402,
"median": 5.676470588235294,
"stdev": 1.6619707523199512,
"ci95": 0.2973642978467597,
"min": 1.1019455252918284,
"max": 9.24561403508772,
"length_correlation": 0.017810969750636504
},
"gpt-4o-2024-11-20": {
"count": 120,
"mean": 6.873329557318088,
"median": 7.245614035087719,
"stdev": 1.3590909910144606,
"ci95": 0.24317223253701922,
"min": 2.969649805447471,
"max": 9.315789473684209,
"length_correlation": 0.2908432942183641
},
"DeepSeek-R1": {
"count": 120,
"mean": 7.416395148440802,
"median": 7.561403508771931,
"stdev": 1.2681644766715765,
"ci95": 0.2269034148965871,
"min": 3.6202321724709785,
"max": 10.0,
"length_correlation": 0.03010622668942908
},
"gpt-3.5-turbo-0125": {
"count": 120,
"mean": 3.059306085973714,
"median": 2.7221789883268483,
"stdev": 1.5069732482219282,
"ci95": 0.2696317255919408,
"min": 0.14007782101167307,
"max": 8.6140350877193,
"length_correlation": -0.24411670522128445
},
"databricks/dbrx-instruct": {
"count": 120,
"mean": 4.103878272900874,
"median": 3.951907131011609,
"stdev": 1.7688936473493926,
"ci95": 0.31649523114374456,
"min": 0.0,
"max": 7.807017543859649,
"length_correlation": -0.18590488060014362
}
},
"raw_cross_model_stats": {
"anova_f": 113.84480855592682,
"anova_p": 1.0572637739373464e-267,
"kw_stat": 926.6659009318737,
"kw_p": 5.526814934051557e-187,
"std_dev_across_models": 0.9645355345624542,
"pearson_r": 0.9360187342643919,
"kendall_tau": 0.8970588235294117,
"normalized_components": {
"pearson_r": 0.7867291142146395,
"kendall_tau": 0.8856209150326797,
"anova_f": 0.3252708815883623,
"kw_stat": 0.6177772672879158,
"std_dev": 0.4384252429829337,
"ci99_overlap_magnitude_sum_norm": 0.6986015831245652,
"raw_score_range_norm": 0.4295520833333333,
"kendall_tau_bootstrapped": 0.8133088235294117
}
},
"calibrated_cross_model_stats": {
"anova_f": 108.5092187521208,
"anova_p": 6.47520439044529e-258,
"kw_stat": 926.6659009318737,
"kw_p": 5.526814934051557e-187,
"std_dev_across_models": 1.4749530584286665,
"pearson_r": 0.9546474467408973,
"kendall_tau": 0.9029411764705882,
"normalized_components": {
"pearson_r": 0.8488248224696577,
"kendall_tau": 0.8921568627450981,
"anova_f": 0.3100263392917737,
"kw_stat": 0.6177772672879158,
"std_dev": 0.6704332083766665,
"ci99_overlap_magnitude_sum_norm": 0.5037974412199329,
"calibrated_score_range_norm": 0.6491926508655046,
"kendall_tau_bootstrapped": 0.807892156862745
}
},
"separability_metrics": {
"raw": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": true,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": true,
"gemini-1.5-pro-002__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": true,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Llama-3-70b-chat-hf": true,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": false
},
"adjacent_overlap_fraction": 0.9375,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.1821160354984599,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.29613750523395055,
"gemini-1.5-pro-002__gemini-1.5-pro-001": 0.34062922131451323,
"gemini-1.5-pro-001__claude-3-5-sonnet-20240620": 0.42019313532954694,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.5006118267724062,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.4625496863832508,
"Mistral-Large-Instruct-2411__Llama-3-70b-chat-hf": 0.47669916432971426,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.5840719778569223,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5654698974419787,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.738070266415793,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.5648235425017241,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.7866099439955452,
"databricks/dbrx-instruct__Llama-2-13b-chat-hf": 0.4098053900250598,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.6035489150447564,
"gpt-3.5-turbo-0125__gemma-7b-it": 0.6522991320498139,
"gemma-7b-it__gemma-2b-it": 0.2527231985678693
},
"ci99_overlap_magnitude_sum": 7.836358838761305,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.21447235149309962,
"emd": {
"average": 1.1634509803921576,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 0.6366666666666666,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.12666666666666662,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.12349999999999997,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 0.49175,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 1.2431666666666665,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 1.7959999999999998,
"claude-3-5-sonnet-20240620__gemma-7b-it": 2.2543333333333333,
"claude-3-5-sonnet-20240620__gemma-2b-it": 2.840583333333334,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 0.8814166666666667,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 0.9845833333333336,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.1859166666666667,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 0.28633333333333333,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.3297500000000001,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 0.5958333333333333,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 2.0646666666666667,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 1.3014999999999999,
"claude-3-haiku-20240307__claude-3-opus-20240229": 0.5483333333333333,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 0.7108333333333332,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.1615833333333333,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 0.6065,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.1635,
"claude-3-haiku-20240307__gemma-7b-it": 1.6176666666666668,
"claude-3-haiku-20240307__gemma-2b-it": 2.2039166666666667,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.24475000000000002,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.34858333333333336,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 0.8087500000000001,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.3695,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 0.9615833333333333,
"claude-3-haiku-20240307__DeepSeek-R1": 1.2325,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.4285,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 0.6648333333333333,
"claude-3-opus-20240229__gemini-1.5-pro-001": 0.2078333333333333,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.40575,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 1.1548333333333334,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 1.7076666666666667,
"claude-3-opus-20240229__gemma-7b-it": 2.1660000000000004,
"claude-3-opus-20240229__gemma-2b-it": 2.75225,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 0.7930833333333334,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 0.89625,
"claude-3-opus-20240229__gemini-1.5-pro-002": 0.27491666666666664,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.19966666666666666,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 0.4139166666666667,
"claude-3-opus-20240229__DeepSeek-R1": 0.6841666666666668,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 1.9763333333333333,
"claude-3-opus-20240229__databricks/dbrx-instruct": 1.2131666666666667,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 0.57075,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 1.3173333333333335,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 1.8713333333333333,
"gemini-1.5-pro-001__gemma-7b-it": 2.3285,
"gemini-1.5-pro-001__gemma-2b-it": 2.9147499999999997,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 0.9555833333333333,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 1.0587500000000003,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.1090833333333334,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 0.3653333333333333,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.2525833333333334,
"gemini-1.5-pro-001__DeepSeek-R1": 0.5216666666666667,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 2.1388333333333334,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 1.3756666666666666,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 0.7519166666666666,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.3047499999999999,
"Llama-3-70b-chat-hf__gemma-7b-it": 1.7630833333333333,
"Llama-3-70b-chat-hf__gemma-2b-it": 2.3493333333333335,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.39016666666666666,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 0.4933333333333333,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 0.6686666666666667,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.27341666666666664,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 0.8173333333333334,
"Llama-3-70b-chat-hf__DeepSeek-R1": 1.0870833333333332,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.5734166666666667,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 0.8102499999999999,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.5706666666666667,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.0146666666666668,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.5974166666666667,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.3814166666666666,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.2680833333333333,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 1.41525,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.9653333333333333,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 1.5680833333333333,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 1.8390000000000002,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.8350000000000002,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.08966666666666662,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.46033333333333337,
"Llama-2-13b-chat-hf__gemma-2b-it": 1.0445833333333332,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.9259166666666667,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8204166666666666,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 1.9692499999999997,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.5115,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 2.120916666666667,
"Llama-2-13b-chat-hf__DeepSeek-R1": 2.3918333333333335,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.31633333333333336,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.5248333333333334,
"gemma-7b-it__gemma-2b-it": 0.58625,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.3729166666666668,
"gemma-7b-it__c4ai-command-r-08-2024": 1.26975,
"gemma-7b-it__gemini-1.5-pro-002": 2.4264166666666664,
"gemma-7b-it__Mistral-Large-Instruct-2411": 1.9698333333333333,
"gemma-7b-it__gpt-4o-2024-11-20": 2.57925,
"gemma-7b-it__DeepSeek-R1": 2.8501666666666665,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.2555,
"gemma-7b-it__databricks/dbrx-instruct": 0.9606666666666668,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 1.9591666666666665,
"gemma-2b-it__c4ai-command-r-08-2024": 1.8559999999999999,
"gemma-2b-it__gemini-1.5-pro-002": 3.012666666666667,
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.556083333333333,
"gemma-2b-it__gpt-4o-2024-11-20": 3.1654999999999998,
"gemma-2b-it__DeepSeek-R1": 3.4364166666666662,
"gemma-2b-it__gpt-3.5-turbo-0125": 0.7759166666666666,
"gemma-2b-it__databricks/dbrx-instruct": 1.53975,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.14083333333333337,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 1.0535,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.5969166666666667,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 1.2063333333333333,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 1.4772500000000002,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1855833333333334,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.42074999999999996,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 1.1566666666666667,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.71275,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 1.3095,
"c4ai-command-r-08-2024__DeepSeek-R1": 1.5804166666666668,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 1.08725,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.31691666666666657,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 0.4600833333333333,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.15399999999999997,
"gemini-1.5-pro-002__DeepSeek-R1": 0.42374999999999996,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 2.23675,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 1.4735833333333332,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 0.6094166666666667,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 0.8803333333333334,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.7801666666666667,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.017,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.2709166666666667,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 2.3895833333333334,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 1.6264166666666666,
"DeepSeek-R1__gpt-3.5-turbo-0125": 2.6605,
"DeepSeek-R1__databricks/dbrx-instruct": 1.8973333333333335,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.7793333333333335
}
},
"average_ci95": 0.17755126912477992,
"modulated_ci95": 0.48693933310099247
},
"calibrated": {
"ci99_overlap_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": true,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": true,
"gemini-1.5-pro-002__gemini-1.5-pro-001": true,
"gemini-1.5-pro-001__claude-3-5-sonnet-20240620": true,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": true,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": true,
"Mistral-Large-Instruct-2411__Llama-3-70b-chat-hf": true,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": true,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
"databricks/dbrx-instruct__Llama-2-13b-chat-hf": true,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": true,
"gpt-3.5-turbo-0125__gemma-7b-it": true,
"gemma-7b-it__gemma-2b-it": true
},
"adjacent_overlap_fraction": 1.0,
"ci99_overlap_magnitude_adjacent": {
"DeepSeek-R1__gpt-4o-2024-11-20": 0.38359356261009303,
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.6272365961241686,
"gemini-1.5-pro-002__gemini-1.5-pro-001": 0.6659086238142393,
"gemini-1.5-pro-001__claude-3-5-sonnet-20240620": 0.911318913177749,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.9736540867996668,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.7772865081565445,
"Mistral-Large-Instruct-2411__Llama-3-70b-chat-hf": 0.8194277589525303,
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 1.0410710842192366,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.9183443639570381,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 1.1134308197001967,
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.819835945003272,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 1.166885974169289,
"databricks/dbrx-instruct__Llama-2-13b-chat-hf": 0.6379477187916502,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.7569852356916282,
"gpt-3.5-turbo-0125__gemma-7b-it": 0.9138798483031199,
"gemma-7b-it__gemma-2b-it": 0.3744594888113215
},
"ci99_overlap_magnitude_sum": 12.901266528281745,
"ci99_overlap_scale_factor": 1.5,
"average_cohens_d_adjacent": 0.20637131038030193,
"emd": {
"average": 1.8020993221045423,
"pairs": {
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 1.1237321277107815,
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.21979129853044369,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.1981007193203076,
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 0.9227817756295282,
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 2.0913193728699158,
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 2.7675286491898587,
"claude-3-5-sonnet-20240620__gemma-7b-it": 3.3599198420052607,
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.020557162767275,
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 1.5011880108478501,
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 1.6818369609869883,
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.35210001384859124,
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 0.5553022573736441,
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6383979851977899,
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.172984044156762,
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.184105018310327,
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 2.1395328313831667,
"claude-3-haiku-20240307__claude-3-opus-20240229": 0.9719834848390921,
"claude-3-haiku-20240307__gemini-1.5-pro-001": 1.2090005650979048,
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.23135718444848474,
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 0.9675872451591345,
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.6511064629995456,
"claude-3-haiku-20240307__gemma-7b-it": 2.2361877142944797,
"claude-3-haiku-20240307__gemma-2b-it": 2.8968250350564935,
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.37745588313706874,
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.5592744239194818,
"claude-3-haiku-20240307__gemini-1.5-pro-002": 1.4536144702735054,
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5878330150341587,
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 1.7536505807448286,
"claude-3-haiku-20240307__DeepSeek-R1": 2.2967161718675433,
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 2.0612500835820016,
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.015800703672385,
"claude-3-opus-20240229__gemini-1.5-pro-001": 0.32347218861794563,
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.7747164330968839,
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 1.9395707299982266,
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.6157800063181695,
"claude-3-opus-20240229__gemma-7b-it": 3.208171199133572,
"claude-3-opus-20240229__gemma-2b-it": 3.8688085198955866,
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.349439367976161,
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.5300883181152996,
"claude-3-opus-20240229__gemini-1.5-pro-002": 0.5070695819256412,
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.40519453271789274,
"claude-3-opus-20240229__gpt-4o-2024-11-20": 0.7828366865490113,
"claude-3-opus-20240229__DeepSeek-R1": 1.324732687028451,
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 3.0323563754386376,
"claude-3-opus-20240229__databricks/dbrx-instruct": 1.987784188511477,
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.0165297451803943,
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 2.176587810257039,
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 2.8548438702027132,
"gemini-1.5-pro-001__gemma-7b-it": 3.4451882793923843,
"gemini-1.5-pro-001__gemma-2b-it": 4.1058256001543985,
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 1.5864564482349737,
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 1.7671053983741118,
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.2607117783826648,
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 0.6490502269245101,
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.5470822986762216,
"gemini-1.5-pro-001__DeepSeek-R1": 1.0877156067696385,
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.2693734556974503,
"gemini-1.5-pro-001__databricks/dbrx-instruct": 2.22480126877029,
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1694147902228438,
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.845624066542787,
"Llama-3-70b-chat-hf__gemma-7b-it": 2.438015259358189,
"Llama-3-70b-chat-hf__gemma-2b-it": 3.0986525801202034,
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.579283428200778,
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 0.7599323783399162,
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.261143650355995,
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.43399008780556897,
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 1.55386981930685,
"Llama-3-70b-chat-hf__DeepSeek-R1": 2.094888626803834,
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 2.262200435663255,
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.2176282487360948,
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7082870196979961,
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2747408200125383,
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.9292377898973596,
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.6084971207769296,
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4183540072136669,
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 2.42120171543264,
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 1.54545917069913,
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 2.7212378259039633,
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 3.264303417026678,
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.116469855966727,
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.1101479381617914,
"Llama-2-13b-chat-hf__gemma-7b-it": 0.595352942195337,
"Llama-2-13b-chat-hf__gemma-2b-it": 1.2530285135774162,
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2808902094162575,
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 1.101481161887081,
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.099457775378314,
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 2.215442766085221,
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 3.397447102223906,
"Llama-2-13b-chat-hf__DeepSeek-R1": 3.940512693346621,
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.4650559012842105,
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.6708357961763596,
"gemma-7b-it__gemma-2b-it": 0.6606373207620142,
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.858731831157411,
"gemma-7b-it__c4ai-command-r-08-2024": 1.6780828810182729,
"gemma-7b-it__gemini-1.5-pro-002": 3.689802184567985,
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.807833958900623,
"gemma-7b-it__gpt-4o-2024-11-20": 3.9898382950393083,
"gemma-7b-it__DeepSeek-R1": 4.532903886162023,
"gemma-7b-it__gpt-3.5-turbo-0125": 0.2703306401266234,
"gemma-7b-it__databricks/dbrx-instruct": 1.2307108097437638,
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.5193691519194252,
"gemma-2b-it__c4ai-command-r-08-2024": 2.3387202017802866,
"gemma-2b-it__gemini-1.5-pro-002": 4.350439505329999,
"gemma-2b-it__Mistral-Large-Instruct-2411": 3.4684712796626376,
"gemma-2b-it__gpt-4o-2024-11-20": 4.650475615801322,
"gemma-2b-it__DeepSeek-R1": 5.193541206924037,
"gemma-2b-it__gpt-3.5-turbo-0125": 0.8364521444569485,
"gemma-2b-it__databricks/dbrx-instruct": 1.8816468994774942,
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.22192579344666363,
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 1.8310703534105746,
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.9491021277432119,
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 2.1311064638818973,
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 2.674172055004612,
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.6870105747139386,
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.6389673886287017,
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 2.0117193035497127,
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 1.1415798716566692,
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 2.311755414021036,
"c4ai-command-r-08-2024__DeepSeek-R1": 2.8548210051437506,
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 1.5148411567385431,
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.4576958703961783,
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 0.8881085765445554,
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.30137113444355,
"gemini-1.5-pro-002__DeepSeek-R1": 0.8431017015940377,
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.5139873608730507,
"gemini-1.5-pro-002__databricks/dbrx-instruct": 2.4694151739458903,
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 1.1820043361386854,
"Mistral-Large-Instruct-2411__DeepSeek-R1": 1.7250699272614,
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 2.632019135205689,
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.5874469482785283,
"gpt-4o-2024-11-20__DeepSeek-R1": 0.5430655911227148,
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 3.814023471344374,
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 2.7694512844172134,
"DeepSeek-R1__gpt-3.5-turbo-0125": 4.357089062467089,
"DeepSeek-R1__databricks/dbrx-instruct": 3.3125168755399286,
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 1.0708834254644892
}
},
"average_ci95": 0.28376390140022495,
"modulated_ci95": 0.2858863458378721
}
},
"calibrated_score_range": 5.193541206924037,
"iteration_stability": {
"raw": {
"scoring_stability": {
"claude-3-5-sonnet-20240620": {
"mean_iter_score": 7.24575,
"iteration_count": 5,
"stdev_across_iters": 0.15694443461160262
},
"claude-3-haiku-20240307": {
"mean_iter_score": 6.609083333333333,
"iteration_count": 5,
"stdev_across_iters": 0.17636519088401648
},
"claude-3-opus-20240229": {
"mean_iter_score": 7.157416666666666,
"iteration_count": 5,
"stdev_across_iters": 0.1706436752091587
},
"gemini-1.5-pro-001": {
"mean_iter_score": 7.319916666666667,
"iteration_count": 5,
"stdev_across_iters": 0.1319146925520852
},
"Llama-3-70b-chat-hf": {
"mean_iter_score": 6.7545,
"iteration_count": 5,
"stdev_across_iters": 0.17172761862903718
},
"Mixtral-8x7B-Instruct-v0.1": {
"mean_iter_score": 6.002583333333334,
"iteration_count": 5,
"stdev_across_iters": 0.21693755501936
},
"Llama-2-13b-chat-hf": {
"mean_iter_score": 5.44975,
"iteration_count": 5,
"stdev_across_iters": 0.12699316473294484
},
"gemma-7b-it": {
"mean_iter_score": 4.991416666666667,
"iteration_count": 5,
"stdev_across_iters": 0.04307261310856343
},
"gemma-2b-it": {
"mean_iter_score": 4.405166666666666,
"iteration_count": 5,
"stdev_across_iters": 0.14251671441935812
},
"Mixtral-8x22B-Instruct-v0.1": {
"mean_iter_score": 6.364333333333334,
"iteration_count": 5,
"stdev_across_iters": 0.22905824708042175
},
"c4ai-command-r-08-2024": {
"mean_iter_score": 6.261166666666667,
"iteration_count": 5,
"stdev_across_iters": 0.13324043638809097
},
"gemini-1.5-pro-002": {
"mean_iter_score": 7.417833333333333,
"iteration_count": 5,
"stdev_across_iters": 0.06779841443573752
},
"Mistral-Large-Instruct-2411": {
"mean_iter_score": 6.96125,
"iteration_count": 5,
"stdev_across_iters": 0.15568330567747668
},
"gpt-4o-2024-11-20": {
"mean_iter_score": 7.570666666666667,
"iteration_count": 5,
"stdev_across_iters": 0.042639965603488525
},
"DeepSeek-R1": {
"mean_iter_score": 7.841583333333333,
"iteration_count": 5,
"stdev_across_iters": 0.0666126864793788
},
"gpt-3.5-turbo-0125": {
"mean_iter_score": 5.1810833333333335,
"iteration_count": 5,
"stdev_across_iters": 0.19121007208477964
},
"databricks/dbrx-instruct": {
"mean_iter_score": 5.94425,
"iteration_count": 5,
"stdev_across_iters": 0.21119491392234477
}
},
"ranking_stability": {
"pairwise_correlation": {
"1__vs__2": {
"common_model_count": 17,
"kendall_tau": 0.8823529411764705,
"p_value": 3.5743855407137387e-09
},
"1__vs__3": {
"common_model_count": 17,
"kendall_tau": 0.8970588235294118,
"p_value": 1.2313901628307946e-09
},
"1__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.926470588235294,
"p_value": 1.080161877119549e-10
},
"1__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.9117647058823529,
"p_value": 3.8599058936360526e-10
},
"2__vs__3": {
"common_model_count": 17,
"kendall_tau": 0.8382352941176471,
"p_value": 5.634316092440314e-08
},
"2__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.8676470588235293,
"p_value": 9.575975226992579e-09
},
"2__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.9117647058823529,
"p_value": 3.8599058936360526e-10
},
"3__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.9117647058823529,
"p_value": 3.8599058936360526e-10
},
"3__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.8970588235294118,
"p_value": 1.2313901628307946e-09
},
"4__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.8970588235294118,
"p_value": 1.2313901628307946e-09
}
},
"average_kendall_tau": 0.8941176470588235
},
"randomized_average_kendall_tau_by_item": 0.887985294117647
},
"calibrated": {
"scoring_stability": {
"claude-3-5-sonnet-20240620": {
"mean_iter_score": 6.24341110428404,
"iteration_count": 5,
"stdev_across_iters": 0.302565394802204
},
"claude-3-haiku-20240307": {
"mean_iter_score": 5.119678976573259,
"iteration_count": 5,
"stdev_across_iters": 0.25665401866270493
},
"claude-3-opus-20240229": {
"mean_iter_score": 6.091662461412351,
"iteration_count": 5,
"stdev_across_iters": 0.3447684735223457
},
"gemini-1.5-pro-001": {
"mean_iter_score": 6.328679541671164,
"iteration_count": 5,
"stdev_across_iters": 0.23641743672897256
},
"Llama-3-70b-chat-hf": {
"mean_iter_score": 5.321506521636969,
"iteration_count": 5,
"stdev_across_iters": 0.2952207909827205
},
"Mixtral-8x7B-Instruct-v0.1": {
"mean_iter_score": 4.152091731414124,
"iteration_count": 5,
"stdev_across_iters": 0.2792172386805539
},
"Llama-2-13b-chat-hf": {
"mean_iter_score": 3.4758824550941814,
"iteration_count": 5,
"stdev_across_iters": 0.21334784570102575
},
"gemma-7b-it": {
"mean_iter_score": 2.8834912622787794,
"iteration_count": 5,
"stdev_across_iters": 0.0282201567909068
},
"gemma-2b-it": {
"mean_iter_score": 2.222853941516765,
"iteration_count": 5,
"stdev_across_iters": 0.13719101349050422
},
"Mixtral-8x22B-Instruct-v0.1": {
"mean_iter_score": 4.74222309343619,
"iteration_count": 5,
"stdev_across_iters": 0.2722477982658716
},
"c4ai-command-r-08-2024": {
"mean_iter_score": 4.561574143297053,
"iteration_count": 5,
"stdev_across_iters": 0.19390065298660353
},
"gemini-1.5-pro-002": {
"mean_iter_score": 6.5732934468467645,
"iteration_count": 5,
"stdev_across_iters": 0.10303097674767771
},
"Mistral-Large-Instruct-2411": {
"mean_iter_score": 5.691325221179402,
"iteration_count": 5,
"stdev_across_iters": 0.19681980003611643
},
"gpt-4o-2024-11-20": {
"mean_iter_score": 6.873329557318088,
"iteration_count": 5,
"stdev_across_iters": 0.0825027225675669
},
"DeepSeek-R1": {
"mean_iter_score": 7.416395148440802,
"iteration_count": 5,
"stdev_across_iters": 0.13649461417547062
},
"gpt-3.5-turbo-0125": {
"mean_iter_score": 3.059306085973714,
"iteration_count": 5,
"stdev_across_iters": 0.2162664674561692
},
"databricks/dbrx-instruct": {
"mean_iter_score": 4.103878272900874,
"iteration_count": 5,
"stdev_across_iters": 0.34399539323170275
}
},
"ranking_stability": {
"pairwise_correlation": {
"1__vs__2": {
"common_model_count": 17,
"kendall_tau": 0.8970588235294118,
"p_value": 1.2313901628307946e-09
},
"1__vs__3": {
"common_model_count": 17,
"kendall_tau": 0.8970588235294118,
"p_value": 1.2313901628307946e-09
},
"1__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.9558823529411764,
"p_value": 5.347391697765181e-12
},
"1__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.9705882352941175,
"p_value": 8.546830053210383e-13
},
"2__vs__3": {
"common_model_count": 17,
"kendall_tau": 0.8235294117647057,
"p_value": 1.25716599654265e-07
},
"2__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.8823529411764705,
"p_value": 3.5743855407137387e-09
},
"2__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.8970588235294118,
"p_value": 1.2313901628307946e-09
},
"3__vs__4": {
"common_model_count": 17,
"kendall_tau": 0.9117647058823529,
"p_value": 3.8599058936360526e-10
},
"3__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.8970588235294118,
"p_value": 1.2313901628307946e-09
},
"4__vs__5": {
"common_model_count": 17,
"kendall_tau": 0.9558823529411764,
"p_value": 5.347391697765181e-12
}
},
"average_kendall_tau": 0.9088235294117647
},
"randomized_average_kendall_tau_by_item": 0.884735294117647
}
},
"final_judgemark_score": 0.6364094748361991,
"raw_score_range": 3.4364166666666662,
"final_judgemark_score_raw": 0.6122836514189908,
"final_judgemark_score_elements_raw": {
"norm_stability_between_iterations": 0.8133088235294117,
"norm_correlation_with_lmsys_arena": 0.8856209150326797,
"norm_std_dev_between_models": 0.4384252429829337,
"norm_kruskall_wallis": 0.6177772672879158,
"norm_ci99_adjacent_overlap": 0.6986015831245652,
"norm_score_range": 0.4295520833333333,
"norm_intra_model_ci95": 0.48693933310099247,
"norm_earth_movers_distance": 0.2908627450980394
},
"final_judgemark_score_elements_calibrated": {
"norm_stability_between_iterations": 0.807892156862745,
"norm_correlation_with_lmsys_arena": 0.8921568627450981,
"norm_std_dev_between_models": 0.6704332083766665,
"norm_kruskall_wallis": 0.6177772672879158,
"norm_ci99_adjacent_overlap": 0.5037974412199329,
"norm_score_range": 0.6491926508655046,
"norm_intra_model_ci95": 0.2858863458378721,
"norm_earth_movers_distance": {
"pearson_r": 0.8488248224696577,
"kendall_tau": 0.8921568627450981,
"anova_f": 0.3100263392917737,
"kw_stat": 0.6177772672879158,
"std_dev": 0.6704332083766665,
"ci99_overlap_magnitude_sum_norm": 0.5037974412199329,
"calibrated_score_range_norm": 0.6491926508655046,
"kendall_tau_bootstrapped": 0.807892156862745
}
}
}