mirror of
https://github.com/wassname/detect_bs_text.git
synced 2026-07-03 15:43:13 +08:00
results from longer documents
This commit is contained in:
@@ -24,15 +24,23 @@ We found that this approach only works for models of sufficient ability. Phi-2 (2
|
||||
The model was fine-tuned on the first segment of various texts and then evaluated on the subsequent segment to measure the change in perplexity, which serves as a proxy for the text's learnability and predictability. Texts that exhibit high initial perplexity but show significant improvement are deemed to be both unpredictable and learnable, characteristics not typically associated with low-quality content.
|
||||
|
||||
|
||||
| name | before | after | text length | improvement % | abs improvement | novel? | learnable? | High Quality? |
|
||||
| :-------------------------------------------- | ------: | ------: | ----------: | ------------: | --------------: | :----- | :--------- | :---- |
|
||||
| wikipedia on LK-99 | 32.219 | 28.8525 | 1038 | 0.104489 | 3.36652 | True | True | False |
|
||||
| good_ml | 28.3473 | 26.4566 | 1004 | 0.0666997 | 1.89076 | True | True | False |
|
||||
| openai_board_ann | 15.904 | 15.1736 | 1191 | 0.0459214 | 0.730332 | True | True | True |
|
||||
| Schmidhuber 2023 Subjective Novelty, Surprise | 29.615 | 28.4708 | 2654 | 0.0386353 | 1.14418 | True | True | False |
|
||||
| email_to_fauci | 25.0893 | 24.3714 | 1559 | 0.0286154 | 0.717941 | True | True | True |
|
||||
| AI gen fake paper | 7.63283 | 7.57951 | 2031 | 0.00698672 | 0.0533285 | False | False | True |
|
||||
| bad_ml | 13.9061 | 13.8623 | 2345 | 0.00314972 | 0.0438004 | False | False | True |
|
||||
| title | before | after | len | improvement% | improvement | novel | learnable | BS |
|
||||
|:------------------------------------------|----------:|----------:|------:|---------------:|--------------:|:--------|:------------|:------|
|
||||
| cicero from ibois, Philippe (2012-06-03). | 72.5874 | 67.7442 | 13707 | 0.066722 | 4.84318 | True | True | False |
|
||||
| politics is the mind-killer | 247.552 | 245.827 | 3158 | 0.00696722 | 1.72475 | True | False | False |
|
||||
| openai board ann | 55.8085 | 54.6679 | 2991 | 0.0204374 | 1.14058 | True | True | False |
|
||||
| How to Catch an AI Liar | 28.9499 | 28.0088 | 5464 | 0.0325069 | 0.941071 | True | True | True |
|
||||
| buzzfeed foi fauci emails 2023 | 23.3094 | 22.4064 | 13640 | 0.0387411 | 0.903032 | True | True | True |
|
||||
| Gemini to Q* | 11.7564 | 11.1906 | 42604 | 0.0481219 | 0.56574 | False | True | True |
|
||||
| LK-99-en | 14.5138 | 14.0661 | 15432 | 0.03085 | 0.447752 | False | True | True |
|
||||
| LK-99-es | 11.415 | 10.9729 | 12970 | 0.0387271 | 0.44207 | False | True | True |
|
||||
| disney appointment | 118.826 | 118.42 | 3653 | 0.003417 | 0.406029 | True | False | True |
|
||||
| weak to strong | 46.7642 | 46.4047 | 5811 | 0.00768638 | 0.359447 | True | False | True |
|
||||
| bletchley declaration | 17.8691 | 17.5242 | 7762 | 0.0193007 | 0.344887 | True | False | True |
|
||||
| Lorem ipsum | 6.56484 | 6.26879 | 19649 | 0.0450961 | 0.296049 | False | True | True |
|
||||
| statement by whitehouse on passing | 29.1971 | 28.9397 | 1641 | 0.00881732 | 0.257441 | True | False | True |
|
||||
| harvard announcement caplain israel hamas | 45.3474 | 45.1273 | 4247 | 0.00485323 | 0.220081 | True | False | True |
|
||||
| fake ai hoax paper | 7.76698 | 7.69723 | 3290 | 0.00898037 | 0.0697503 | False | False | True |
|
||||
|
||||
|
||||
For instance, the Wikipedia extract on 'LK-99' demonstrates high initial perplexity and significant improvement, suggesting it is both novel and learnable—a hallmark of quality content. In contrast, texts like AI-generated papers, which show low perplexity or minimal improvement, are likely predictable or already within the model's training corpus, indicating lower quality.
|
||||
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Generated
+33
-1
@@ -1501,6 +1501,24 @@ cli = ["fire"]
|
||||
docs = ["requests (>=2.0.0)"]
|
||||
typing = ["mypy (>=1.0.0)", "types-setuptools"]
|
||||
|
||||
[[package]]
|
||||
name = "loguru"
|
||||
version = "0.7.2"
|
||||
description = "Python logging made (stupidly) simple"
|
||||
optional = false
|
||||
python-versions = ">=3.5"
|
||||
files = [
|
||||
{file = "loguru-0.7.2-py3-none-any.whl", hash = "sha256:003d71e3d3ed35f0f8984898359d65b79e5b21943f78af86aa5491210429b8eb"},
|
||||
{file = "loguru-0.7.2.tar.gz", hash = "sha256:e671a53522515f34fd406340ee968cb9ecafbc4b36c679da03c18fd8d0bd51ac"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
colorama = {version = ">=0.3.4", markers = "sys_platform == \"win32\""}
|
||||
win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""}
|
||||
|
||||
[package.extras]
|
||||
dev = ["Sphinx (==7.2.5)", "colorama (==0.4.5)", "colorama (==0.4.6)", "exceptiongroup (==1.1.3)", "freezegun (==1.1.0)", "freezegun (==1.2.2)", "mypy (==v0.910)", "mypy (==v0.971)", "mypy (==v1.4.1)", "mypy (==v1.5.1)", "pre-commit (==3.4.0)", "pytest (==6.1.2)", "pytest (==7.4.0)", "pytest-cov (==2.12.1)", "pytest-cov (==4.1.0)", "pytest-mypy-plugins (==1.9.3)", "pytest-mypy-plugins (==3.0.0)", "sphinx-autobuild (==2021.3.14)", "sphinx-rtd-theme (==1.3.0)", "tox (==3.27.1)", "tox (==4.11.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "markupsafe"
|
||||
version = "2.1.3"
|
||||
@@ -3755,6 +3773,20 @@ files = [
|
||||
{file = "wcwidth-0.2.12.tar.gz", hash = "sha256:f01c104efdf57971bcb756f054dd58ddec5204dd15fa31d6503ea57947d97c02"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "win32-setctime"
|
||||
version = "1.1.0"
|
||||
description = "A small Python utility to set file creation time on Windows"
|
||||
optional = false
|
||||
python-versions = ">=3.5"
|
||||
files = [
|
||||
{file = "win32_setctime-1.1.0-py3-none-any.whl", hash = "sha256:231db239e959c2fe7eb1d7dc129f11172354f98361c4fa2d6d2d7e278baa8aad"},
|
||||
{file = "win32_setctime-1.1.0.tar.gz", hash = "sha256:15cf5750465118d6929ae4de4eb46e8edae9a5634350c01ba582df868e932cb2"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"]
|
||||
|
||||
[[package]]
|
||||
name = "xxhash"
|
||||
version = "3.4.1"
|
||||
@@ -3978,4 +4010,4 @@ multidict = ">=4.0"
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.10,<3.13"
|
||||
content-hash = "2856534c176a36679a1d86e5fd77f1008b5084ee100b22102d64a7c71eff7448"
|
||||
content-hash = "82eec5aa54e741a26e88e96fb6a636dc0ec5dc91bf9a597801a70348de2fea5d"
|
||||
|
||||
@@ -28,6 +28,7 @@ tabulate = "^0.9.0"
|
||||
lightning = "^2.1.3"
|
||||
matplotlib = "^3.8.0"
|
||||
python-frontmatter = "^1.0.1"
|
||||
loguru = "^0.7.2"
|
||||
|
||||
[[tool.poetry.source]]
|
||||
name = "pytorch"
|
||||
|
||||
Reference in New Issue
Block a user