results

2026-06-27 16:10:14 +08:00 · 2025-07-26 12:46:27 +08:00
parent 3a0b8f650c
commit 3c7e7d58b7
3 changed files with 85 additions and 182 deletions
@@ -12,14 +12,15 @@ This project compares different methods of extracting scores from language model

 ## Results

-| Method        | Score | Score (Normalized) |
-|---------------|-------|-------------------|
-| ranked_scaled | 0.67  | 0.79             |
-| ranked_norm   | 0.67  | 0.73             |
-| weighted      | 0.63  | 0.65             |
-| raw           | 0.63  | 0.65             |
-| weighted_norm | 0.62  | 0.64             |
-| ranked        | 0.33  | 0.28             |
+
+| Method        | Score    | Score (Normalized) |
+|---------------|----------|--------------------|
+| ranked_scaled | 0.629332 |       0.80 |
+| ranked_norm   | 0.654562 |       0.74 |
+| weighted      | 0.634804 |       0.65 |
+| raw           | 0.634528 |       0.65 |
+| weighted_norm | 0.623806 |       0.64 |
+| ranked        | 0.336333 |       0.28 |

 *Results for DeepSeek Chat V3 0324*

@@ -341,7 +341,7 @@ def compute_ranked_score(logp):
        # res = kendalltau(choices, logp_arr, variant='b')

        # let's just use the common numbers 1, 3, 5, 7, 9, as some models like to skip some
-        res = kendalltau(choices[1::2], logp_arr[1::2], variant='b')
+        res = kendalltau(choices, logp_arr, variant='b')
        # print(res.correlation, res.pvalue, res2.correlation, res2.pvalue)
        # correlation weighted by pvalue