mirror of
https://github.com/wassname/Judgemark-v2lp.git
synced 2026-06-27 16:10:14 +08:00
results
This commit is contained in:
@@ -12,14 +12,15 @@ This project compares different methods of extracting scores from language model
|
||||
|
||||
## Results
|
||||
|
||||
| Method | Score | Score (Normalized) |
|
||||
|---------------|-------|-------------------|
|
||||
| ranked_scaled | 0.67 | 0.79 |
|
||||
| ranked_norm | 0.67 | 0.73 |
|
||||
| weighted | 0.63 | 0.65 |
|
||||
| raw | 0.63 | 0.65 |
|
||||
| weighted_norm | 0.62 | 0.64 |
|
||||
| ranked | 0.33 | 0.28 |
|
||||
|
||||
| Method | Score | Score (Normalized) |
|
||||
|---------------|----------|------------|
|
||||
| ranked_scaled | 0.629332 | 0.80 |
|
||||
| ranked_norm | 0.654562 | 0.74 |
|
||||
| weighted | 0.634804 | 0.65 |
|
||||
| raw | 0.634528 | 0.65 |
|
||||
| weighted_norm | 0.623806 | 0.64 |
|
||||
| ranked | 0.336333 | 0.28 |
|
||||
|
||||
*Results for DeepSeek Chat V3 0324*
|
||||
|
||||
|
||||
@@ -341,7 +341,7 @@ def compute_ranked_score(logp):
|
||||
# res = kendalltau(choices, logp_arr, variant='b')
|
||||
|
||||
# let's just use the common numbers 1, 3, 5, 7, 9, as some models like to skip some
|
||||
res = kendalltau(choices[1::2], logp_arr[1::2], variant='b')
|
||||
res = kendalltau(choices, logp_arr, variant='b')
|
||||
# print(res.correlation, res.pvalue, res2.correlation, res2.pvalue)
|
||||
# correlation weighted by pvalue
|
||||
|
||||
|
||||
+75
-173
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user