results from longer documents

2026-07-03 15:43:13 +08:00 · 2024-01-04 08:37:32 +08:00
parent 4f9dab2871
commit e2317f0232
6 changed files with 1578 additions and 16793 deletions
@@ -24,15 +24,23 @@ We found that this approach only works for model of sufficient ability. Phi-2 (2
 The model was fine-tuned on the first segment of various texts and then evaluated on the subsequent segment to measure the change in perplexity, which serves as a proxy for the text's learnability and predictability. Texts that exhibit high initial perplexity but show significant improvement are deemed to be both unpredictable and learnable, characteristics not typically associated with low-quality content.


-| name                                          |  before |   after | text length | improvement % | abs improvement | novel? | learnable? | High Quality?   |
-| :-------------------------------------------- | ------: | ------: | ----------: | ------------: | --------------: | :----- | :--------- | :---- |
-| wikipedia on LK-99                            |  32.219 | 28.8525 |        1038 |      0.104489 |         3.36652 | True   | True       | False |
-| good_ml                                       | 28.3473 | 26.4566 |        1004 |     0.0666997 |         1.89076 | True   | True       | False |
-| openai_board_ann                              |  15.904 | 15.1736 |        1191 |     0.0459214 |        0.730332 | True   | True       | True  |
-| Schmidhuber 2023 Subjective Novelty, Surprise |  29.615 | 28.4708 |        2654 |     0.0386353 |         1.14418 | True   | True       | False |
-| email_to_fauci                                | 25.0893 | 24.3714 |        1559 |     0.0286154 |        0.717941 | True   | True       | True  |
-| AI gen fake paper                             | 7.63283 | 7.57951 |        2031 |    0.00698672 |       0.0533285 | False  | False      | True  |
-| bad_ml                                        | 13.9061 | 13.8623 |        2345 |    0.00314972 |       0.0438004 | False  | False      | True  |
+| title                                     |    before |     after |   len |   improvement% |   improvement | novel   | learnable   | BS    |
+|:------------------------------------------|----------:|----------:|------:|---------------:|--------------:|:--------|:------------|:------|
+| cicero from ibois, Philippe (2012-06-03). |  72.5874  |  67.7442  | 13707 |     0.066722   |     4.84318   | True    | True        | False |
+| politics is the mind-killer               | 247.552   | 245.827   |  3158 |     0.00696722 |     1.72475   | True    | False       | False |
+| openai board ann                          |  55.8085  |  54.6679  |  2991 |     0.0204374  |     1.14058   | True    | True        | False |
+| How to Catch an AI Liar                   |  28.9499  |  28.0088  |  5464 |     0.0325069  |     0.941071  | True    | True        | True  |
+| buzzfeed foi fauci emails 2023            |  23.3094  |  22.4064  | 13640 |     0.0387411  |     0.903032  | True    | True        | True  |
+| Gemini to Q*                              |  11.7564  |  11.1906  | 42604 |     0.0481219  |     0.56574   | False   | True        | True  |
+| LK-99-en                                  |  14.5138  |  14.0661  | 15432 |     0.03085    |     0.447752  | False   | True        | True  |
+| LK-99-es                                  |  11.415   |  10.9729  | 12970 |     0.0387271  |     0.44207   | False   | True        | True  |
+| disney appointment                        | 118.826   | 118.42    |  3653 |     0.003417   |     0.406029  | True    | False       | True  |
+| weak to strong                            |  46.7642  |  46.4047  |  5811 |     0.00768638 |     0.359447  | True    | False       | True  |
+| bletchley declaration                     |  17.8691  |  17.5242  |  7762 |     0.0193007  |     0.344887  | True    | False       | True  |
+| Lorem ipsum                               |   6.56484 |   6.26879 | 19649 |     0.0450961  |     0.296049  | False   | True        | True  |
+| statement by whitehouse on passing        |  29.1971  |  28.9397  |  1641 |     0.00881732 |     0.257441  | True    | False       | True  |
+| harvard announcement caplain israel hamas |  45.3474  |  45.1273  |  4247 |     0.00485323 |     0.220081  | True    | False       | True  |
+| fake ai hoax paper                        |   7.76698 |   7.69723 |  3290 |     0.00898037 |     0.0697503 | False   | False       | True  |


 For instance, the Wikipedia extract on 'LK-99' demonstrates high initial perplexity and significant improvement, suggesting it is both novel and learnable—a hallmark of quality content. In contrast, texts like AI-generated papers, which show low perplexity or minimal improvement, are likely predictable or already within the model's training corpus, indicating lower quality.
@@ -1501,6 +1501,24 @@ cli = ["fire"]
 docs = ["requests (>=2.0.0)"]
 typing = ["mypy (>=1.0.0)", "types-setuptools"]

+[[package]]
+name = "loguru"
+version = "0.7.2"
+description = "Python logging made (stupidly) simple"
+optional = false
+python-versions = ">=3.5"
+files = [
+    {file = "loguru-0.7.2-py3-none-any.whl", hash = "sha256:003d71e3d3ed35f0f8984898359d65b79e5b21943f78af86aa5491210429b8eb"},
+    {file = "loguru-0.7.2.tar.gz", hash = "sha256:e671a53522515f34fd406340ee968cb9ecafbc4b36c679da03c18fd8d0bd51ac"},
+]
+
+[package.dependencies]
+colorama = {version = ">=0.3.4", markers = "sys_platform == \"win32\""}
+win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""}
+
+[package.extras]
+dev = ["Sphinx (==7.2.5)", "colorama (==0.4.5)", "colorama (==0.4.6)", "exceptiongroup (==1.1.3)", "freezegun (==1.1.0)", "freezegun (==1.2.2)", "mypy (==v0.910)", "mypy (==v0.971)", "mypy (==v1.4.1)", "mypy (==v1.5.1)", "pre-commit (==3.4.0)", "pytest (==6.1.2)", "pytest (==7.4.0)", "pytest-cov (==2.12.1)", "pytest-cov (==4.1.0)", "pytest-mypy-plugins (==1.9.3)", "pytest-mypy-plugins (==3.0.0)", "sphinx-autobuild (==2021.3.14)", "sphinx-rtd-theme (==1.3.0)", "tox (==3.27.1)", "tox (==4.11.0)"]
+
 [[package]]
 name = "markupsafe"
 version = "2.1.3"
@@ -3755,6 +3773,20 @@ files = [
    {file = "wcwidth-0.2.12.tar.gz", hash = "sha256:f01c104efdf57971bcb756f054dd58ddec5204dd15fa31d6503ea57947d97c02"},
 ]

+[[package]]
+name = "win32-setctime"
+version = "1.1.0"
+description = "A small Python utility to set file creation time on Windows"
+optional = false
+python-versions = ">=3.5"
+files = [
+    {file = "win32_setctime-1.1.0-py3-none-any.whl", hash = "sha256:231db239e959c2fe7eb1d7dc129f11172354f98361c4fa2d6d2d7e278baa8aad"},
+    {file = "win32_setctime-1.1.0.tar.gz", hash = "sha256:15cf5750465118d6929ae4de4eb46e8edae9a5634350c01ba582df868e932cb2"},
+]
+
+[package.extras]
+dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"]
+
 [[package]]
 name = "xxhash"
 version = "3.4.1"
@@ -3978,4 +4010,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<3.13"
-content-hash = "2856534c176a36679a1d86e5fd77f1008b5084ee100b22102d64a7c71eff7448"
+content-hash = "82eec5aa54e741a26e88e96fb6a636dc0ec5dc91bf9a597801a70348de2fea5d"
@@ -28,6 +28,7 @@ tabulate = "^0.9.0"
 lightning = "^2.1.3"
 matplotlib = "^3.8.0"
 python-frontmatter = "^1.0.1"
+loguru = "^0.7.2"

 [[tool.poetry.source]]
 name = "pytorch"