mirror of
https://github.com/wassname/Judgemark-v2lp.git
synced 2026-06-27 16:10:14 +08:00
1128 lines
56 KiB
JSON
1128 lines
56 KiB
JSON
{
|
|
"judge_model": "mistralai/mistral-small-24b-instruct-2501",
|
|
"start_time": "2025-01-31T06:38:32.571438",
|
|
"status": "completed",
|
|
"samples_file": "data/judgemark_v2.1_samples.json",
|
|
"prompts_file": "data/judge_prompts.json",
|
|
"end_time": "2025-01-31T15:26:37.858165",
|
|
"raw_score_distribution": {
|
|
"count": 2040,
|
|
"min": 2.14,
|
|
"max": 9.25,
|
|
"mean": 6.44,
|
|
"median": 6.86,
|
|
"stdev": 1.402,
|
|
"p10": 4.317,
|
|
"p25": 5.353,
|
|
"p75": 7.54,
|
|
"p90": 7.93
|
|
},
|
|
"calibration_config": {
|
|
"method": "piecewise_landmark",
|
|
"in_landmarks": [
|
|
2.14,
|
|
5.3525,
|
|
6.86,
|
|
7.54,
|
|
9.25
|
|
],
|
|
"out_landmarks": [
|
|
0,
|
|
3,
|
|
5,
|
|
7,
|
|
10
|
|
]
|
|
},
|
|
"calibrated_score_distribution": {
|
|
"count": 2040,
|
|
"min": 0.0,
|
|
"max": 10.0,
|
|
"mean": 4.992,
|
|
"median": 5.0,
|
|
"stdev": 2.171,
|
|
"p10": 2.033,
|
|
"p25": 3.002,
|
|
"p75": 7.0,
|
|
"p90": 7.684
|
|
},
|
|
"raw_model_stats": {
|
|
"claude-3-5-sonnet-20240620": {
|
|
"count": 120,
|
|
"mean": 7.24575,
|
|
"median": 7.41,
|
|
"stdev": 0.7738853871452638,
|
|
"ci95": 0.1384656645979393,
|
|
"min": 5.11,
|
|
"max": 9.04,
|
|
"length_correlation": -0.002475742493860448
|
|
},
|
|
"claude-3-haiku-20240307": {
|
|
"count": 120,
|
|
"mean": 6.609083333333333,
|
|
"median": 6.855,
|
|
"stdev": 1.1022307372048812,
|
|
"ci95": 0.19721410185860297,
|
|
"min": 3.71,
|
|
"max": 8.43,
|
|
"length_correlation": -0.05299933662378288
|
|
},
|
|
"claude-3-opus-20240229": {
|
|
"count": 120,
|
|
"mean": 7.157416666666666,
|
|
"median": 7.32,
|
|
"stdev": 0.8958864872511559,
|
|
"ci95": 0.16029443109029734,
|
|
"min": 4.54,
|
|
"max": 8.79,
|
|
"length_correlation": 0.04342860761188247
|
|
},
|
|
"gemini-1.5-pro-001": {
|
|
"count": 120,
|
|
"mean": 7.319916666666667,
|
|
"median": 7.32,
|
|
"stdev": 0.6277189541778818,
|
|
"ci95": 0.11231317145241401,
|
|
"min": 4.79,
|
|
"max": 8.61,
|
|
"length_correlation": 0.015567052515450219
|
|
},
|
|
"Llama-3-70b-chat-hf": {
|
|
"count": 120,
|
|
"mean": 6.7545,
|
|
"median": 7.0,
|
|
"stdev": 0.966008725189818,
|
|
"ci95": 0.17284089138086894,
|
|
"min": 4.57,
|
|
"max": 8.93,
|
|
"length_correlation": -0.12681174487591582
|
|
},
|
|
"Mixtral-8x7B-Instruct-v0.1": {
|
|
"count": 120,
|
|
"mean": 6.002583333333333,
|
|
"median": 6.105,
|
|
"stdev": 1.1443456137695034,
|
|
"ci95": 0.20474940937292613,
|
|
"min": 3.54,
|
|
"max": 8.04,
|
|
"length_correlation": -0.14756519095029053
|
|
},
|
|
"Llama-2-13b-chat-hf": {
|
|
"count": 120,
|
|
"mean": 5.44975,
|
|
"median": 5.34,
|
|
"stdev": 1.3126485270102524,
|
|
"ci95": 0.2348626213843519,
|
|
"min": 3.0,
|
|
"max": 8.68,
|
|
"length_correlation": 0.10194398980020855
|
|
},
|
|
"gemma-7b-it": {
|
|
"count": 120,
|
|
"mean": 4.991416666666667,
|
|
"median": 4.73,
|
|
"stdev": 1.2268846218573854,
|
|
"ci95": 0.2195175117301787,
|
|
"min": 2.39,
|
|
"max": 8.07,
|
|
"length_correlation": -0.034396362974259784
|
|
},
|
|
"gemma-2b-it": {
|
|
"count": 120,
|
|
"mean": 4.405166666666667,
|
|
"median": 4.125,
|
|
"stdev": 1.1517644362136776,
|
|
"ci95": 0.20607680513117407,
|
|
"min": 2.18,
|
|
"max": 8.0,
|
|
"length_correlation": -0.014322592699298373
|
|
},
|
|
"Mixtral-8x22B-Instruct-v0.1": {
|
|
"count": 120,
|
|
"mean": 6.364333333333334,
|
|
"median": 6.66,
|
|
"stdev": 1.1948972272766814,
|
|
"ci95": 0.2137942406580731,
|
|
"min": 2.61,
|
|
"max": 8.39,
|
|
"length_correlation": -0.10731097065943129
|
|
},
|
|
"c4ai-command-r-08-2024": {
|
|
"count": 120,
|
|
"mean": 6.261166666666667,
|
|
"median": 6.64,
|
|
"stdev": 1.1901699499667775,
|
|
"ci95": 0.2129484234281223,
|
|
"min": 3.68,
|
|
"max": 8.14,
|
|
"length_correlation": 0.20934946488094278
|
|
},
|
|
"gemini-1.5-pro-002": {
|
|
"count": 120,
|
|
"mean": 7.417833333333333,
|
|
"median": 7.529999999999999,
|
|
"stdev": 0.6156422940098166,
|
|
"ci95": 0.1101523827825787,
|
|
"min": 4.96,
|
|
"max": 8.61,
|
|
"length_correlation": -0.12151998145182114
|
|
},
|
|
"Mistral-Large-Instruct-2411": {
|
|
"count": 120,
|
|
"mean": 6.96125,
|
|
"median": 7.09,
|
|
"stdev": 0.9717000284954979,
|
|
"ci95": 0.17385919474689632,
|
|
"min": 3.32,
|
|
"max": 8.82,
|
|
"length_correlation": 0.009690079063523201
|
|
},
|
|
"gpt-4o-2024-11-20": {
|
|
"count": 120,
|
|
"mean": 7.570666666666667,
|
|
"median": 7.68,
|
|
"stdev": 0.6572756777232903,
|
|
"ci95": 0.11760154029492369,
|
|
"min": 5.32,
|
|
"max": 8.86,
|
|
"length_correlation": 0.28105412927410267
|
|
},
|
|
"DeepSeek-R1": {
|
|
"count": 120,
|
|
"mean": 7.841583333333333,
|
|
"median": 7.86,
|
|
"stdev": 0.6271584521391989,
|
|
"ci95": 0.11221288491311002,
|
|
"min": 5.82,
|
|
"max": 9.25,
|
|
"length_correlation": 0.025649596842388763
|
|
},
|
|
"gpt-3.5-turbo-0125": {
|
|
"count": 120,
|
|
"mean": 5.1810833333333335,
|
|
"median": 5.055,
|
|
"stdev": 1.1602490284948526,
|
|
"ci95": 0.20759489130849443,
|
|
"min": 2.29,
|
|
"max": 8.46,
|
|
"length_correlation": -0.2663994108328025
|
|
},
|
|
"databricks/dbrx-instruct": {
|
|
"count": 120,
|
|
"mean": 5.94425,
|
|
"median": 6.07,
|
|
"stdev": 1.2512297564241928,
|
|
"ci95": 0.22387340899030678,
|
|
"min": 2.14,
|
|
"max": 8.0,
|
|
"length_correlation": -0.20653971830241444
|
|
}
|
|
},
|
|
"calibrated_model_stats": {
|
|
"claude-3-5-sonnet-20240620": {
|
|
"count": 120,
|
|
"mean": 6.24341110428404,
|
|
"median": 6.617647058823529,
|
|
"stdev": 1.513914202721647,
|
|
"ci95": 0.270873619926311,
|
|
"min": 2.7735408560311283,
|
|
"max": 9.63157894736842,
|
|
"length_correlation": -0.026463093191221542
|
|
},
|
|
"claude-3-haiku-20240307": {
|
|
"count": 120,
|
|
"mean": 5.119678976573259,
|
|
"median": 5.017583650375572,
|
|
"stdev": 1.814084891038744,
|
|
"ci95": 0.32458097057673235,
|
|
"min": 1.4661478599221789,
|
|
"max": 8.56140350877193,
|
|
"length_correlation": -0.07865140111152846
|
|
},
|
|
"claude-3-opus-20240229": {
|
|
"count": 120,
|
|
"mean": 6.091662461412351,
|
|
"median": 6.352941176470589,
|
|
"stdev": 1.6768172145070779,
|
|
"ci95": 0.3000206669781784,
|
|
"min": 2.2412451361867705,
|
|
"max": 9.19298245614035,
|
|
"length_correlation": 0.025930644332760197
|
|
},
|
|
"gemini-1.5-pro-001": {
|
|
"count": 120,
|
|
"mean": 6.328679541671164,
|
|
"median": 6.352941176470589,
|
|
"stdev": 1.3116010422863964,
|
|
"ci95": 0.23467520258713187,
|
|
"min": 2.4747081712062258,
|
|
"max": 8.87719298245614,
|
|
"length_correlation": 6.980625051840469e-05
|
|
},
|
|
"Llama-3-70b-chat-hf": {
|
|
"count": 120,
|
|
"mean": 5.321506521636969,
|
|
"median": 5.411764705882352,
|
|
"stdev": 1.7097698147710712,
|
|
"ci95": 0.30591663525923807,
|
|
"min": 2.2692607003891054,
|
|
"max": 9.438596491228068,
|
|
"length_correlation": -0.17078683377698892
|
|
},
|
|
"Mixtral-8x7B-Instruct-v0.1": {
|
|
"count": 120,
|
|
"mean": 4.152091731414124,
|
|
"median": 3.9983416252072965,
|
|
"stdev": 1.6761450332029226,
|
|
"ci95": 0.29990039848292555,
|
|
"min": 1.3073929961089494,
|
|
"max": 7.877192982456139,
|
|
"length_correlation": -0.11736501907569356
|
|
},
|
|
"Llama-2-13b-chat-hf": {
|
|
"count": 120,
|
|
"mean": 3.4758824550941814,
|
|
"median": 2.989800027101845,
|
|
"stdev": 1.820297766680272,
|
|
"ci95": 0.3256925950744398,
|
|
"min": 0.8031128404669261,
|
|
"max": 9.0,
|
|
"length_correlation": 0.10778435780550603
|
|
},
|
|
"gemma-7b-it": {
|
|
"count": 120,
|
|
"mean": 2.8834912622787794,
|
|
"median": 2.4186770428015567,
|
|
"stdev": 1.582519013935895,
|
|
"ci95": 0.2831485781270838,
|
|
"min": 0.23346303501945526,
|
|
"max": 7.929824561403509,
|
|
"length_correlation": -0.014795831169776529
|
|
},
|
|
"gemma-2b-it": {
|
|
"count": 120,
|
|
"mean": 2.222853941516765,
|
|
"median": 1.8536964980544746,
|
|
"stdev": 1.352177893315551,
|
|
"ci95": 0.24193532241672216,
|
|
"min": 0.037354085603112874,
|
|
"max": 7.807017543859649,
|
|
"length_correlation": -0.00626006886845542
|
|
},
|
|
"Mixtral-8x22B-Instruct-v0.1": {
|
|
"count": 120,
|
|
"mean": 4.74222309343619,
|
|
"median": 4.734660033167495,
|
|
"stdev": 1.8597559926272333,
|
|
"ci95": 0.3327525674816667,
|
|
"min": 0.43891050583657565,
|
|
"max": 8.49122807017544,
|
|
"length_correlation": -0.11281384823163548
|
|
},
|
|
"c4ai-command-r-08-2024": {
|
|
"count": 120,
|
|
"mean": 4.561574143297052,
|
|
"median": 4.708126036484245,
|
|
"stdev": 1.809207009828139,
|
|
"ci95": 0.3237082068899185,
|
|
"min": 1.4381322957198444,
|
|
"max": 8.05263157894737,
|
|
"length_correlation": 0.1795645363776789
|
|
},
|
|
"gemini-1.5-pro-002": {
|
|
"count": 120,
|
|
"mean": 6.5732934468467645,
|
|
"median": 6.970588235294117,
|
|
"stdev": 1.2699040160768762,
|
|
"ci95": 0.22721465798742466,
|
|
"min": 2.6334630350194552,
|
|
"max": 8.87719298245614,
|
|
"length_correlation": -0.1257748777008509
|
|
},
|
|
"Mistral-Large-Instruct-2411": {
|
|
"count": 120,
|
|
"mean": 5.691325221179402,
|
|
"median": 5.676470588235294,
|
|
"stdev": 1.6619707523199512,
|
|
"ci95": 0.2973642978467597,
|
|
"min": 1.1019455252918284,
|
|
"max": 9.24561403508772,
|
|
"length_correlation": 0.017810969750636504
|
|
},
|
|
"gpt-4o-2024-11-20": {
|
|
"count": 120,
|
|
"mean": 6.873329557318088,
|
|
"median": 7.245614035087719,
|
|
"stdev": 1.3590909910144606,
|
|
"ci95": 0.24317223253701922,
|
|
"min": 2.969649805447471,
|
|
"max": 9.315789473684209,
|
|
"length_correlation": 0.2908432942183641
|
|
},
|
|
"DeepSeek-R1": {
|
|
"count": 120,
|
|
"mean": 7.416395148440802,
|
|
"median": 7.561403508771931,
|
|
"stdev": 1.2681644766715765,
|
|
"ci95": 0.2269034148965871,
|
|
"min": 3.6202321724709785,
|
|
"max": 10.0,
|
|
"length_correlation": 0.03010622668942908
|
|
},
|
|
"gpt-3.5-turbo-0125": {
|
|
"count": 120,
|
|
"mean": 3.059306085973714,
|
|
"median": 2.7221789883268483,
|
|
"stdev": 1.5069732482219282,
|
|
"ci95": 0.2696317255919408,
|
|
"min": 0.14007782101167307,
|
|
"max": 8.6140350877193,
|
|
"length_correlation": -0.24411670522128445
|
|
},
|
|
"databricks/dbrx-instruct": {
|
|
"count": 120,
|
|
"mean": 4.103878272900874,
|
|
"median": 3.951907131011609,
|
|
"stdev": 1.7688936473493926,
|
|
"ci95": 0.31649523114374456,
|
|
"min": 0.0,
|
|
"max": 7.807017543859649,
|
|
"length_correlation": -0.18590488060014362
|
|
}
|
|
},
|
|
"raw_cross_model_stats": {
|
|
"anova_f": 113.84480855592682,
|
|
"anova_p": 1.0572637739373464e-267,
|
|
"kw_stat": 926.6659009318737,
|
|
"kw_p": 5.526814934051557e-187,
|
|
"std_dev_across_models": 0.9645355345624542,
|
|
"pearson_r": 0.9360187342643919,
|
|
"kendall_tau": 0.8970588235294117,
|
|
"normalized_components": {
|
|
"pearson_r": 0.7867291142146395,
|
|
"kendall_tau": 0.8856209150326797,
|
|
"anova_f": 0.3252708815883623,
|
|
"kw_stat": 0.6177772672879158,
|
|
"std_dev": 0.4384252429829337,
|
|
"ci99_overlap_magnitude_sum_norm": 0.6986015831245652,
|
|
"raw_score_range_norm": 0.4295520833333333,
|
|
"kendall_tau_bootstrapped": 0.8133088235294117
|
|
}
|
|
},
|
|
"calibrated_cross_model_stats": {
|
|
"anova_f": 108.5092187521208,
|
|
"anova_p": 6.47520439044529e-258,
|
|
"kw_stat": 926.6659009318737,
|
|
"kw_p": 5.526814934051557e-187,
|
|
"std_dev_across_models": 1.4749530584286665,
|
|
"pearson_r": 0.9546474467408973,
|
|
"kendall_tau": 0.9029411764705882,
|
|
"normalized_components": {
|
|
"pearson_r": 0.8488248224696577,
|
|
"kendall_tau": 0.8921568627450981,
|
|
"anova_f": 0.3100263392917737,
|
|
"kw_stat": 0.6177772672879158,
|
|
"std_dev": 0.6704332083766665,
|
|
"ci99_overlap_magnitude_sum_norm": 0.5037974412199329,
|
|
"calibrated_score_range_norm": 0.6491926508655046,
|
|
"kendall_tau_bootstrapped": 0.807892156862745
|
|
}
|
|
},
|
|
"separability_metrics": {
|
|
"raw": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": true,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": true,
|
|
"gemini-1.5-pro-002__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": true,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Llama-3-70b-chat-hf": true,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": false
|
|
},
|
|
"adjacent_overlap_fraction": 0.9375,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.1821160354984599,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.29613750523395055,
|
|
"gemini-1.5-pro-002__gemini-1.5-pro-001": 0.34062922131451323,
|
|
"gemini-1.5-pro-001__claude-3-5-sonnet-20240620": 0.42019313532954694,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.5006118267724062,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.4625496863832508,
|
|
"Mistral-Large-Instruct-2411__Llama-3-70b-chat-hf": 0.47669916432971426,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 0.5840719778569223,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.5654698974419787,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.738070266415793,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.5648235425017241,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.7866099439955452,
|
|
"databricks/dbrx-instruct__Llama-2-13b-chat-hf": 0.4098053900250598,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.6035489150447564,
|
|
"gpt-3.5-turbo-0125__gemma-7b-it": 0.6522991320498139,
|
|
"gemma-7b-it__gemma-2b-it": 0.2527231985678693
|
|
},
|
|
"ci99_overlap_magnitude_sum": 7.836358838761305,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.21447235149309962,
|
|
"emd": {
|
|
"average": 1.1634509803921576,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 0.6366666666666666,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.12666666666666662,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.12349999999999997,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 0.49175,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 1.2431666666666665,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 1.7959999999999998,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 2.2543333333333333,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 2.840583333333334,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 0.8814166666666667,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 0.9845833333333336,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.1859166666666667,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 0.28633333333333333,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.3297500000000001,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 0.5958333333333333,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 2.0646666666666667,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 1.3014999999999999,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 0.5483333333333333,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 0.7108333333333332,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.1615833333333333,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 0.6065,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.1635,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 1.6176666666666668,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.2039166666666667,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.24475000000000002,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.34858333333333336,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 0.8087500000000001,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.3695,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 0.9615833333333333,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 1.2325,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 1.4285,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 0.6648333333333333,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 0.2078333333333333,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.40575,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 1.1548333333333334,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 1.7076666666666667,
|
|
"claude-3-opus-20240229__gemma-7b-it": 2.1660000000000004,
|
|
"claude-3-opus-20240229__gemma-2b-it": 2.75225,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 0.7930833333333334,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 0.89625,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 0.27491666666666664,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.19966666666666666,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 0.4139166666666667,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 0.6841666666666668,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 1.9763333333333333,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 1.2131666666666667,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 0.57075,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 1.3173333333333335,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 1.8713333333333333,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 2.3285,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 2.9147499999999997,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 0.9555833333333333,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 1.0587500000000003,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.1090833333333334,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 0.3653333333333333,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.2525833333333334,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 0.5216666666666667,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 2.1388333333333334,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 1.3756666666666666,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 0.7519166666666666,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.3047499999999999,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 1.7630833333333333,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 2.3493333333333335,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.39016666666666666,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 0.4933333333333333,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 0.6686666666666667,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.27341666666666664,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 0.8173333333333334,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 1.0870833333333332,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 1.5734166666666667,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 0.8102499999999999,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.5706666666666667,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.0146666666666668,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.5974166666666667,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.3814166666666666,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.2680833333333333,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 1.41525,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.9653333333333333,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 1.5680833333333333,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 1.8390000000000002,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 0.8350000000000002,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.08966666666666662,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.46033333333333337,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 1.0445833333333332,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.9259166666666667,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 0.8204166666666666,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 1.9692499999999997,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 1.5115,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 2.120916666666667,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 2.3918333333333335,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.31633333333333336,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.5248333333333334,
|
|
"gemma-7b-it__gemma-2b-it": 0.58625,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.3729166666666668,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.26975,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 2.4264166666666664,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 1.9698333333333333,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 2.57925,
|
|
"gemma-7b-it__DeepSeek-R1": 2.8501666666666665,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.2555,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 0.9606666666666668,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 1.9591666666666665,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 1.8559999999999999,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 3.012666666666667,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 2.556083333333333,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 3.1654999999999998,
|
|
"gemma-2b-it__DeepSeek-R1": 3.4364166666666662,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 0.7759166666666666,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.53975,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.14083333333333337,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 1.0535,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.5969166666666667,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 1.2063333333333333,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 1.4772500000000002,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.1855833333333334,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.42074999999999996,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 1.1566666666666667,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 0.71275,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 1.3095,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 1.5804166666666668,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 1.08725,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.31691666666666657,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 0.4600833333333333,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.15399999999999997,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 0.42374999999999996,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 2.23675,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 1.4735833333333332,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 0.6094166666666667,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 0.8803333333333334,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 1.7801666666666667,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.017,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.2709166666666667,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 2.3895833333333334,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 1.6264166666666666,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 2.6605,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 1.8973333333333335,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 0.7793333333333335
|
|
}
|
|
},
|
|
"average_ci95": 0.17755126912477992,
|
|
"modulated_ci95": 0.48693933310099247
|
|
},
|
|
"calibrated": {
|
|
"ci99_overlap_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": true,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": true,
|
|
"gemini-1.5-pro-002__gemini-1.5-pro-001": true,
|
|
"gemini-1.5-pro-001__claude-3-5-sonnet-20240620": true,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": true,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": true,
|
|
"Mistral-Large-Instruct-2411__Llama-3-70b-chat-hf": true,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": true,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": true,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": true,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": true,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": true,
|
|
"databricks/dbrx-instruct__Llama-2-13b-chat-hf": true,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": true,
|
|
"gpt-3.5-turbo-0125__gemma-7b-it": true,
|
|
"gemma-7b-it__gemma-2b-it": true
|
|
},
|
|
"adjacent_overlap_fraction": 1.0,
|
|
"ci99_overlap_magnitude_adjacent": {
|
|
"DeepSeek-R1__gpt-4o-2024-11-20": 0.38359356261009303,
|
|
"gpt-4o-2024-11-20__gemini-1.5-pro-002": 0.6272365961241686,
|
|
"gemini-1.5-pro-002__gemini-1.5-pro-001": 0.6659086238142393,
|
|
"gemini-1.5-pro-001__claude-3-5-sonnet-20240620": 0.911318913177749,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.9736540867996668,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.7772865081565445,
|
|
"Mistral-Large-Instruct-2411__Llama-3-70b-chat-hf": 0.8194277589525303,
|
|
"Llama-3-70b-chat-hf__claude-3-haiku-20240307": 1.0410710842192366,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.9183443639570381,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 1.1134308197001967,
|
|
"c4ai-command-r-08-2024__Mixtral-8x7B-Instruct-v0.1": 0.819835945003272,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 1.166885974169289,
|
|
"databricks/dbrx-instruct__Llama-2-13b-chat-hf": 0.6379477187916502,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.7569852356916282,
|
|
"gpt-3.5-turbo-0125__gemma-7b-it": 0.9138798483031199,
|
|
"gemma-7b-it__gemma-2b-it": 0.3744594888113215
|
|
},
|
|
"ci99_overlap_magnitude_sum": 12.901266528281745,
|
|
"ci99_overlap_scale_factor": 1.5,
|
|
"average_cohens_d_adjacent": 0.20637131038030193,
|
|
"emd": {
|
|
"average": 1.8020993221045423,
|
|
"pairs": {
|
|
"claude-3-5-sonnet-20240620__claude-3-haiku-20240307": 1.1237321277107815,
|
|
"claude-3-5-sonnet-20240620__claude-3-opus-20240229": 0.21979129853044369,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-001": 0.1981007193203076,
|
|
"claude-3-5-sonnet-20240620__Llama-3-70b-chat-hf": 0.9227817756295282,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x7B-Instruct-v0.1": 2.0913193728699158,
|
|
"claude-3-5-sonnet-20240620__Llama-2-13b-chat-hf": 2.7675286491898587,
|
|
"claude-3-5-sonnet-20240620__gemma-7b-it": 3.3599198420052607,
|
|
"claude-3-5-sonnet-20240620__gemma-2b-it": 4.020557162767275,
|
|
"claude-3-5-sonnet-20240620__Mixtral-8x22B-Instruct-v0.1": 1.5011880108478501,
|
|
"claude-3-5-sonnet-20240620__c4ai-command-r-08-2024": 1.6818369609869883,
|
|
"claude-3-5-sonnet-20240620__gemini-1.5-pro-002": 0.35210001384859124,
|
|
"claude-3-5-sonnet-20240620__Mistral-Large-Instruct-2411": 0.5553022573736441,
|
|
"claude-3-5-sonnet-20240620__gpt-4o-2024-11-20": 0.6383979851977899,
|
|
"claude-3-5-sonnet-20240620__DeepSeek-R1": 1.172984044156762,
|
|
"claude-3-5-sonnet-20240620__gpt-3.5-turbo-0125": 3.184105018310327,
|
|
"claude-3-5-sonnet-20240620__databricks/dbrx-instruct": 2.1395328313831667,
|
|
"claude-3-haiku-20240307__claude-3-opus-20240229": 0.9719834848390921,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-001": 1.2090005650979048,
|
|
"claude-3-haiku-20240307__Llama-3-70b-chat-hf": 0.23135718444848474,
|
|
"claude-3-haiku-20240307__Mixtral-8x7B-Instruct-v0.1": 0.9675872451591345,
|
|
"claude-3-haiku-20240307__Llama-2-13b-chat-hf": 1.6511064629995456,
|
|
"claude-3-haiku-20240307__gemma-7b-it": 2.2361877142944797,
|
|
"claude-3-haiku-20240307__gemma-2b-it": 2.8968250350564935,
|
|
"claude-3-haiku-20240307__Mixtral-8x22B-Instruct-v0.1": 0.37745588313706874,
|
|
"claude-3-haiku-20240307__c4ai-command-r-08-2024": 0.5592744239194818,
|
|
"claude-3-haiku-20240307__gemini-1.5-pro-002": 1.4536144702735054,
|
|
"claude-3-haiku-20240307__Mistral-Large-Instruct-2411": 0.5878330150341587,
|
|
"claude-3-haiku-20240307__gpt-4o-2024-11-20": 1.7536505807448286,
|
|
"claude-3-haiku-20240307__DeepSeek-R1": 2.2967161718675433,
|
|
"claude-3-haiku-20240307__gpt-3.5-turbo-0125": 2.0612500835820016,
|
|
"claude-3-haiku-20240307__databricks/dbrx-instruct": 1.015800703672385,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-001": 0.32347218861794563,
|
|
"claude-3-opus-20240229__Llama-3-70b-chat-hf": 0.7747164330968839,
|
|
"claude-3-opus-20240229__Mixtral-8x7B-Instruct-v0.1": 1.9395707299982266,
|
|
"claude-3-opus-20240229__Llama-2-13b-chat-hf": 2.6157800063181695,
|
|
"claude-3-opus-20240229__gemma-7b-it": 3.208171199133572,
|
|
"claude-3-opus-20240229__gemma-2b-it": 3.8688085198955866,
|
|
"claude-3-opus-20240229__Mixtral-8x22B-Instruct-v0.1": 1.349439367976161,
|
|
"claude-3-opus-20240229__c4ai-command-r-08-2024": 1.5300883181152996,
|
|
"claude-3-opus-20240229__gemini-1.5-pro-002": 0.5070695819256412,
|
|
"claude-3-opus-20240229__Mistral-Large-Instruct-2411": 0.40519453271789274,
|
|
"claude-3-opus-20240229__gpt-4o-2024-11-20": 0.7828366865490113,
|
|
"claude-3-opus-20240229__DeepSeek-R1": 1.324732687028451,
|
|
"claude-3-opus-20240229__gpt-3.5-turbo-0125": 3.0323563754386376,
|
|
"claude-3-opus-20240229__databricks/dbrx-instruct": 1.987784188511477,
|
|
"gemini-1.5-pro-001__Llama-3-70b-chat-hf": 1.0165297451803943,
|
|
"gemini-1.5-pro-001__Mixtral-8x7B-Instruct-v0.1": 2.176587810257039,
|
|
"gemini-1.5-pro-001__Llama-2-13b-chat-hf": 2.8548438702027132,
|
|
"gemini-1.5-pro-001__gemma-7b-it": 3.4451882793923843,
|
|
"gemini-1.5-pro-001__gemma-2b-it": 4.1058256001543985,
|
|
"gemini-1.5-pro-001__Mixtral-8x22B-Instruct-v0.1": 1.5864564482349737,
|
|
"gemini-1.5-pro-001__c4ai-command-r-08-2024": 1.7671053983741118,
|
|
"gemini-1.5-pro-001__gemini-1.5-pro-002": 0.2607117783826648,
|
|
"gemini-1.5-pro-001__Mistral-Large-Instruct-2411": 0.6490502269245101,
|
|
"gemini-1.5-pro-001__gpt-4o-2024-11-20": 0.5470822986762216,
|
|
"gemini-1.5-pro-001__DeepSeek-R1": 1.0877156067696385,
|
|
"gemini-1.5-pro-001__gpt-3.5-turbo-0125": 3.2693734556974503,
|
|
"gemini-1.5-pro-001__databricks/dbrx-instruct": 2.22480126877029,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x7B-Instruct-v0.1": 1.1694147902228438,
|
|
"Llama-3-70b-chat-hf__Llama-2-13b-chat-hf": 1.845624066542787,
|
|
"Llama-3-70b-chat-hf__gemma-7b-it": 2.438015259358189,
|
|
"Llama-3-70b-chat-hf__gemma-2b-it": 3.0986525801202034,
|
|
"Llama-3-70b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 0.579283428200778,
|
|
"Llama-3-70b-chat-hf__c4ai-command-r-08-2024": 0.7599323783399162,
|
|
"Llama-3-70b-chat-hf__gemini-1.5-pro-002": 1.261143650355995,
|
|
"Llama-3-70b-chat-hf__Mistral-Large-Instruct-2411": 0.43399008780556897,
|
|
"Llama-3-70b-chat-hf__gpt-4o-2024-11-20": 1.55386981930685,
|
|
"Llama-3-70b-chat-hf__DeepSeek-R1": 2.094888626803834,
|
|
"Llama-3-70b-chat-hf__gpt-3.5-turbo-0125": 2.262200435663255,
|
|
"Llama-3-70b-chat-hf__databricks/dbrx-instruct": 1.2176282487360948,
|
|
"Mixtral-8x7B-Instruct-v0.1__Llama-2-13b-chat-hf": 0.7082870196979961,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-7b-it": 1.2747408200125383,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemma-2b-it": 1.9292377898973596,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mixtral-8x22B-Instruct-v0.1": 0.6084971207769296,
|
|
"Mixtral-8x7B-Instruct-v0.1__c4ai-command-r-08-2024": 0.4183540072136669,
|
|
"Mixtral-8x7B-Instruct-v0.1__gemini-1.5-pro-002": 2.42120171543264,
|
|
"Mixtral-8x7B-Instruct-v0.1__Mistral-Large-Instruct-2411": 1.54545917069913,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-4o-2024-11-20": 2.7212378259039633,
|
|
"Mixtral-8x7B-Instruct-v0.1__DeepSeek-R1": 3.264303417026678,
|
|
"Mixtral-8x7B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.116469855966727,
|
|
"Mixtral-8x7B-Instruct-v0.1__databricks/dbrx-instruct": 0.1101479381617914,
|
|
"Llama-2-13b-chat-hf__gemma-7b-it": 0.595352942195337,
|
|
"Llama-2-13b-chat-hf__gemma-2b-it": 1.2530285135774162,
|
|
"Llama-2-13b-chat-hf__Mixtral-8x22B-Instruct-v0.1": 1.2808902094162575,
|
|
"Llama-2-13b-chat-hf__c4ai-command-r-08-2024": 1.101481161887081,
|
|
"Llama-2-13b-chat-hf__gemini-1.5-pro-002": 3.099457775378314,
|
|
"Llama-2-13b-chat-hf__Mistral-Large-Instruct-2411": 2.215442766085221,
|
|
"Llama-2-13b-chat-hf__gpt-4o-2024-11-20": 3.397447102223906,
|
|
"Llama-2-13b-chat-hf__DeepSeek-R1": 3.940512693346621,
|
|
"Llama-2-13b-chat-hf__gpt-3.5-turbo-0125": 0.4650559012842105,
|
|
"Llama-2-13b-chat-hf__databricks/dbrx-instruct": 0.6708357961763596,
|
|
"gemma-7b-it__gemma-2b-it": 0.6606373207620142,
|
|
"gemma-7b-it__Mixtral-8x22B-Instruct-v0.1": 1.858731831157411,
|
|
"gemma-7b-it__c4ai-command-r-08-2024": 1.6780828810182729,
|
|
"gemma-7b-it__gemini-1.5-pro-002": 3.689802184567985,
|
|
"gemma-7b-it__Mistral-Large-Instruct-2411": 2.807833958900623,
|
|
"gemma-7b-it__gpt-4o-2024-11-20": 3.9898382950393083,
|
|
"gemma-7b-it__DeepSeek-R1": 4.532903886162023,
|
|
"gemma-7b-it__gpt-3.5-turbo-0125": 0.2703306401266234,
|
|
"gemma-7b-it__databricks/dbrx-instruct": 1.2307108097437638,
|
|
"gemma-2b-it__Mixtral-8x22B-Instruct-v0.1": 2.5193691519194252,
|
|
"gemma-2b-it__c4ai-command-r-08-2024": 2.3387202017802866,
|
|
"gemma-2b-it__gemini-1.5-pro-002": 4.350439505329999,
|
|
"gemma-2b-it__Mistral-Large-Instruct-2411": 3.4684712796626376,
|
|
"gemma-2b-it__gpt-4o-2024-11-20": 4.650475615801322,
|
|
"gemma-2b-it__DeepSeek-R1": 5.193541206924037,
|
|
"gemma-2b-it__gpt-3.5-turbo-0125": 0.8364521444569485,
|
|
"gemma-2b-it__databricks/dbrx-instruct": 1.8816468994774942,
|
|
"Mixtral-8x22B-Instruct-v0.1__c4ai-command-r-08-2024": 0.22192579344666363,
|
|
"Mixtral-8x22B-Instruct-v0.1__gemini-1.5-pro-002": 1.8310703534105746,
|
|
"Mixtral-8x22B-Instruct-v0.1__Mistral-Large-Instruct-2411": 0.9491021277432119,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-4o-2024-11-20": 2.1311064638818973,
|
|
"Mixtral-8x22B-Instruct-v0.1__DeepSeek-R1": 2.674172055004612,
|
|
"Mixtral-8x22B-Instruct-v0.1__gpt-3.5-turbo-0125": 1.6870105747139386,
|
|
"Mixtral-8x22B-Instruct-v0.1__databricks/dbrx-instruct": 0.6389673886287017,
|
|
"c4ai-command-r-08-2024__gemini-1.5-pro-002": 2.0117193035497127,
|
|
"c4ai-command-r-08-2024__Mistral-Large-Instruct-2411": 1.1415798716566692,
|
|
"c4ai-command-r-08-2024__gpt-4o-2024-11-20": 2.311755414021036,
|
|
"c4ai-command-r-08-2024__DeepSeek-R1": 2.8548210051437506,
|
|
"c4ai-command-r-08-2024__gpt-3.5-turbo-0125": 1.5148411567385431,
|
|
"c4ai-command-r-08-2024__databricks/dbrx-instruct": 0.4576958703961783,
|
|
"gemini-1.5-pro-002__Mistral-Large-Instruct-2411": 0.8881085765445554,
|
|
"gemini-1.5-pro-002__gpt-4o-2024-11-20": 0.30137113444355,
|
|
"gemini-1.5-pro-002__DeepSeek-R1": 0.8431017015940377,
|
|
"gemini-1.5-pro-002__gpt-3.5-turbo-0125": 3.5139873608730507,
|
|
"gemini-1.5-pro-002__databricks/dbrx-instruct": 2.4694151739458903,
|
|
"Mistral-Large-Instruct-2411__gpt-4o-2024-11-20": 1.1820043361386854,
|
|
"Mistral-Large-Instruct-2411__DeepSeek-R1": 1.7250699272614,
|
|
"Mistral-Large-Instruct-2411__gpt-3.5-turbo-0125": 2.632019135205689,
|
|
"Mistral-Large-Instruct-2411__databricks/dbrx-instruct": 1.5874469482785283,
|
|
"gpt-4o-2024-11-20__DeepSeek-R1": 0.5430655911227148,
|
|
"gpt-4o-2024-11-20__gpt-3.5-turbo-0125": 3.814023471344374,
|
|
"gpt-4o-2024-11-20__databricks/dbrx-instruct": 2.7694512844172134,
|
|
"DeepSeek-R1__gpt-3.5-turbo-0125": 4.357089062467089,
|
|
"DeepSeek-R1__databricks/dbrx-instruct": 3.3125168755399286,
|
|
"gpt-3.5-turbo-0125__databricks/dbrx-instruct": 1.0708834254644892
|
|
}
|
|
},
|
|
"average_ci95": 0.28376390140022495,
|
|
"modulated_ci95": 0.2858863458378721
|
|
}
|
|
},
|
|
"calibrated_score_range": 5.193541206924037,
|
|
"iteration_stability": {
|
|
"raw": {
|
|
"scoring_stability": {
|
|
"claude-3-5-sonnet-20240620": {
|
|
"mean_iter_score": 7.24575,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.15694443461160262
|
|
},
|
|
"claude-3-haiku-20240307": {
|
|
"mean_iter_score": 6.609083333333333,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.17636519088401648
|
|
},
|
|
"claude-3-opus-20240229": {
|
|
"mean_iter_score": 7.157416666666666,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.1706436752091587
|
|
},
|
|
"gemini-1.5-pro-001": {
|
|
"mean_iter_score": 7.319916666666667,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.1319146925520852
|
|
},
|
|
"Llama-3-70b-chat-hf": {
|
|
"mean_iter_score": 6.7545,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.17172761862903718
|
|
},
|
|
"Mixtral-8x7B-Instruct-v0.1": {
|
|
"mean_iter_score": 6.002583333333334,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.21693755501936
|
|
},
|
|
"Llama-2-13b-chat-hf": {
|
|
"mean_iter_score": 5.44975,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.12699316473294484
|
|
},
|
|
"gemma-7b-it": {
|
|
"mean_iter_score": 4.991416666666667,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.04307261310856343
|
|
},
|
|
"gemma-2b-it": {
|
|
"mean_iter_score": 4.405166666666666,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.14251671441935812
|
|
},
|
|
"Mixtral-8x22B-Instruct-v0.1": {
|
|
"mean_iter_score": 6.364333333333334,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.22905824708042175
|
|
},
|
|
"c4ai-command-r-08-2024": {
|
|
"mean_iter_score": 6.261166666666667,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.13324043638809097
|
|
},
|
|
"gemini-1.5-pro-002": {
|
|
"mean_iter_score": 7.417833333333333,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.06779841443573752
|
|
},
|
|
"Mistral-Large-Instruct-2411": {
|
|
"mean_iter_score": 6.96125,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.15568330567747668
|
|
},
|
|
"gpt-4o-2024-11-20": {
|
|
"mean_iter_score": 7.570666666666667,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.042639965603488525
|
|
},
|
|
"DeepSeek-R1": {
|
|
"mean_iter_score": 7.841583333333333,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.0666126864793788
|
|
},
|
|
"gpt-3.5-turbo-0125": {
|
|
"mean_iter_score": 5.1810833333333335,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.19121007208477964
|
|
},
|
|
"databricks/dbrx-instruct": {
|
|
"mean_iter_score": 5.94425,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.21119491392234477
|
|
}
|
|
},
|
|
"ranking_stability": {
|
|
"pairwise_correlation": {
|
|
"1__vs__2": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.8823529411764705,
|
|
"p_value": 3.5743855407137387e-09
|
|
},
|
|
"1__vs__3": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.8970588235294118,
|
|
"p_value": 1.2313901628307946e-09
|
|
},
|
|
"1__vs__4": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.926470588235294,
|
|
"p_value": 1.080161877119549e-10
|
|
},
|
|
"1__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.9117647058823529,
|
|
"p_value": 3.8599058936360526e-10
|
|
},
|
|
"2__vs__3": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.8382352941176471,
|
|
"p_value": 5.634316092440314e-08
|
|
},
|
|
"2__vs__4": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.8676470588235293,
|
|
"p_value": 9.575975226992579e-09
|
|
},
|
|
"2__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.9117647058823529,
|
|
"p_value": 3.8599058936360526e-10
|
|
},
|
|
"3__vs__4": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.9117647058823529,
|
|
"p_value": 3.8599058936360526e-10
|
|
},
|
|
"3__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.8970588235294118,
|
|
"p_value": 1.2313901628307946e-09
|
|
},
|
|
"4__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.8970588235294118,
|
|
"p_value": 1.2313901628307946e-09
|
|
}
|
|
},
|
|
"average_kendall_tau": 0.8941176470588235
|
|
},
|
|
"randomized_average_kendall_tau_by_item": 0.887985294117647
|
|
},
|
|
"calibrated": {
|
|
"scoring_stability": {
|
|
"claude-3-5-sonnet-20240620": {
|
|
"mean_iter_score": 6.24341110428404,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.302565394802204
|
|
},
|
|
"claude-3-haiku-20240307": {
|
|
"mean_iter_score": 5.119678976573259,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.25665401866270493
|
|
},
|
|
"claude-3-opus-20240229": {
|
|
"mean_iter_score": 6.091662461412351,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.3447684735223457
|
|
},
|
|
"gemini-1.5-pro-001": {
|
|
"mean_iter_score": 6.328679541671164,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.23641743672897256
|
|
},
|
|
"Llama-3-70b-chat-hf": {
|
|
"mean_iter_score": 5.321506521636969,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.2952207909827205
|
|
},
|
|
"Mixtral-8x7B-Instruct-v0.1": {
|
|
"mean_iter_score": 4.152091731414124,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.2792172386805539
|
|
},
|
|
"Llama-2-13b-chat-hf": {
|
|
"mean_iter_score": 3.4758824550941814,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.21334784570102575
|
|
},
|
|
"gemma-7b-it": {
|
|
"mean_iter_score": 2.8834912622787794,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.0282201567909068
|
|
},
|
|
"gemma-2b-it": {
|
|
"mean_iter_score": 2.222853941516765,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.13719101349050422
|
|
},
|
|
"Mixtral-8x22B-Instruct-v0.1": {
|
|
"mean_iter_score": 4.74222309343619,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.2722477982658716
|
|
},
|
|
"c4ai-command-r-08-2024": {
|
|
"mean_iter_score": 4.561574143297053,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.19390065298660353
|
|
},
|
|
"gemini-1.5-pro-002": {
|
|
"mean_iter_score": 6.5732934468467645,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.10303097674767771
|
|
},
|
|
"Mistral-Large-Instruct-2411": {
|
|
"mean_iter_score": 5.691325221179402,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.19681980003611643
|
|
},
|
|
"gpt-4o-2024-11-20": {
|
|
"mean_iter_score": 6.873329557318088,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.0825027225675669
|
|
},
|
|
"DeepSeek-R1": {
|
|
"mean_iter_score": 7.416395148440802,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.13649461417547062
|
|
},
|
|
"gpt-3.5-turbo-0125": {
|
|
"mean_iter_score": 3.059306085973714,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.2162664674561692
|
|
},
|
|
"databricks/dbrx-instruct": {
|
|
"mean_iter_score": 4.103878272900874,
|
|
"iteration_count": 5,
|
|
"stdev_across_iters": 0.34399539323170275
|
|
}
|
|
},
|
|
"ranking_stability": {
|
|
"pairwise_correlation": {
|
|
"1__vs__2": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.8970588235294118,
|
|
"p_value": 1.2313901628307946e-09
|
|
},
|
|
"1__vs__3": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.8970588235294118,
|
|
"p_value": 1.2313901628307946e-09
|
|
},
|
|
"1__vs__4": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.9558823529411764,
|
|
"p_value": 5.347391697765181e-12
|
|
},
|
|
"1__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.9705882352941175,
|
|
"p_value": 8.546830053210383e-13
|
|
},
|
|
"2__vs__3": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.8235294117647057,
|
|
"p_value": 1.25716599654265e-07
|
|
},
|
|
"2__vs__4": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.8823529411764705,
|
|
"p_value": 3.5743855407137387e-09
|
|
},
|
|
"2__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.8970588235294118,
|
|
"p_value": 1.2313901628307946e-09
|
|
},
|
|
"3__vs__4": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.9117647058823529,
|
|
"p_value": 3.8599058936360526e-10
|
|
},
|
|
"3__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.8970588235294118,
|
|
"p_value": 1.2313901628307946e-09
|
|
},
|
|
"4__vs__5": {
|
|
"common_model_count": 17,
|
|
"kendall_tau": 0.9558823529411764,
|
|
"p_value": 5.347391697765181e-12
|
|
}
|
|
},
|
|
"average_kendall_tau": 0.9088235294117647
|
|
},
|
|
"randomized_average_kendall_tau_by_item": 0.884735294117647
|
|
}
|
|
},
|
|
"final_judgemark_score": 0.6364094748361991,
|
|
"raw_score_range": 3.4364166666666662,
|
|
"final_judgemark_score_raw": 0.6122836514189908,
|
|
"final_judgemark_score_elements_raw": {
|
|
"norm_stability_between_iterations": 0.8133088235294117,
|
|
"norm_correlation_with_lmsys_arena": 0.8856209150326797,
|
|
"norm_std_dev_between_models": 0.4384252429829337,
|
|
"norm_kruskall_wallis": 0.6177772672879158,
|
|
"norm_ci99_adjacent_overlap": 0.6986015831245652,
|
|
"norm_score_range": 0.4295520833333333,
|
|
"norm_intra_model_ci95": 0.48693933310099247,
|
|
"norm_earth_movers_distance": 0.2908627450980394
|
|
},
|
|
"final_judgemark_score_elements_calibrated": {
|
|
"norm_stability_between_iterations": 0.807892156862745,
|
|
"norm_correlation_with_lmsys_arena": 0.8921568627450981,
|
|
"norm_std_dev_between_models": 0.6704332083766665,
|
|
"norm_kruskall_wallis": 0.6177772672879158,
|
|
"norm_ci99_adjacent_overlap": 0.5037974412199329,
|
|
"norm_score_range": 0.6491926508655046,
|
|
"norm_intra_model_ci95": 0.2858863458378721,
|
|
"norm_earth_movers_distance": {
|
|
"pearson_r": 0.8488248224696577,
|
|
"kendall_tau": 0.8921568627450981,
|
|
"anova_f": 0.3100263392917737,
|
|
"kw_stat": 0.6177772672879158,
|
|
"std_dev": 0.6704332083766665,
|
|
"ci99_overlap_magnitude_sum_norm": 0.5037974412199329,
|
|
"calibrated_score_range_norm": 0.6491926508655046,
|
|
"kendall_tau_bootstrapped": 0.807892156862745
|
|
}
|
|
}
|
|
} |