diff --git a/.gitignore b/.gitignore index 1d97f12..7a982aa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ .env lightning_logs/ outputs/ +.anycache/ *.arrow squad_* diff --git a/bs_writing_detector/metrics/ppx.py b/bs_writing_detector/metrics/ppx.py index f8da515..3c4b732 100644 --- a/bs_writing_detector/metrics/ppx.py +++ b/bs_writing_detector/metrics/ppx.py @@ -66,9 +66,10 @@ def perplexity_compute( ), "When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings." ppls = [] + nlls = [] loss_fct = CrossEntropyLoss(reduction="none") - for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)): + for start_index in range(0, len(encoded_texts), batch_size): end_index = min(start_index + batch_size, len(encoded_texts)) encoded_batch = encoded_texts[start_index:end_index] attn_mask = attn_masks[start_index:end_index] @@ -89,11 +90,14 @@ def perplexity_compute( shift_labels = labels[..., 1:].contiguous() shift_attention_mask_batch = attn_mask[..., 1:].contiguous() - perplexity_batch = torch.exp( - (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1) - / shift_attention_mask_batch.sum(1) + nll_batch = ( + (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch) ) + # remove all the masked ones + nll_batch = nll_batch[shift_attention_mask_batch == 1][None, :] # FIXME only for batch_size=1 + perplexity_batch = torch.exp(nll_batch.mean(1)).cpu().numpy() ppls += perplexity_batch.tolist() + nlls += nll_batch.cpu().numpy().tolist() - return {"perplexities": ppls, "mean_perplexity": np.mean(ppls)} + return {"perplexities": np.array(ppls), "nlls": np.array(nlls)} diff --git a/nbs/02_detection_using_tldr_prompt.ipynb b/nbs/02_detection_using_tldr_prompt.ipynb index c86e7eb..fe7d525 100644 --- a/nbs/02_detection_using_tldr_prompt.ipynb +++ b/nbs/02_detection_using_tldr_prompt.ipynb @@ -14,16 +14,17 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], "source": [ "import os\n", "from transformers import AutoModelForCausalLM, AutoTokenizer\n", @@ -34,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -54,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -120,7 +121,7 @@ " ../samples/2024_anthropic_palintir.md\n", " Anthropic and Palantir Technologies Inc. (NYSE...\n", " https://investors.palantir.com/news-details/20...\n", - " 0.200000\n", + " 0.20\n", " 2024-07-11 00:00:00+00:00\n", " False\n", " \n", @@ -130,320 +131,51 @@ " ../samples/2024_arxiv_meh.md\n", " TradingAgents: Multi-Agents LLM Financial Trad...\n", " https://arxiv.org/html/2412.20138v1\n", - " 0.150000\n", + " 0.15\n", " 2024-12-28 00:00:00+00:00\n", " False\n", " \n", - " \n", - " 2\n", - " Flower Crowns and Furry Mishaps by MyPalAI\n", - " ../samples/2024_bob_fanfic.md\n", - " Bob, April 2222 Delta Eridani\\n\\n \\n\\nThere ar...\n", - " https://archiveofourown.org/works/61614673/cha...\n", - " 0.300000\n", - " 2024-12-24 00:00:00+00:00\n", - " False\n", - " \n", - " \n", - " 3\n", - " Paradox's Box (Bobiverse) by Mark4man\n", - " ../samples/2024_bob_fanfic2.md\n", - " Chapter 1: Chapter 1: Bill\\nChapter Text\\n\\nβ€œT...\n", - " https://archiveofourown.org/works/37201801/cha...\n", - " 0.400000\n", - " 2022-02-17 00:00:00+00:00\n", - " True\n", - " \n", - " \n", - " 4\n", - " Deliberative Alignment: Reasoning Enables Safe...\n", - " ../samples/2024_deliberative_alignment.md\n", - " # Deliberative Alignment: Reasoning Enables Sa...\n", - " https://arxiv.org/html/2412.16339v1\n", - " 0.600000\n", - " 2024-12-20 00:00:00+00:00\n", - " False\n", - " \n", - " \n", - " 5\n", - " fake ai hoax paper made up by gpt-4\n", - " ../samples/2024_gpt4_fake_paper.md\n", - " Title: \"Deconstructing Binaries: Interrogating...\n", - " None\n", - " 0.000000\n", - " 2024-01-01 00:00:00+00:00\n", - " False\n", - " \n", - " \n", - " 6\n", - " Hardware Hedging Against Scaling Regime Shifts...\n", - " ../samples/2024_gwern_reddit.md\n", - " Hyperscalers are investing heavily in AMD/Nvid...\n", - " https://old.reddit.com/r/mlscaling/comments/1e...\n", - " 1.000000\n", - " 2024-08-21 00:00:00+00:00\n", - " False\n", - " \n", - " \n", - " 7\n", - " How to Focus\n", - " ../samples/2024_how_to_focus.md\n", - " # HOW TO FOCUS\\n\\n\\n\\n\\n\\nANCIENT WISDOM FOR M...\n", - " https://github.com/wassname/stoic_markdown/blo...\n", - " 0.500000\n", - " 2024-06-01 00:00:00+00:00\n", - " False\n", - " \n", - " \n", - " 8\n", - " Deontic Explorations In \"Paying To Talk To Sla...\n", - " ../samples/2024_lesswrong_slop.md\n", - " HER|Jenny|πŸ€”: I want to read a dialogue between...\n", - " https://www.lesswrong.com/posts/Rk2o8hjYmjENH8...\n", - " 0.100000\n", - " 2024-04-12 00:00:00+00:00\n", - " False\n", - " \n", - " \n", - " 9\n", - " Amazon-backed Anthropic debuts AI agents that ...\n", - " ../samples/2024_news_anthropic.md\n", - " ![PARIS, FRANCE - MAY 22: Co-founder and CEO o...\n", - " https://www.cnbc.com/2024/10/22/anthropic-anno...\n", - " 0.500000\n", - " 2025-01-05 05:03:00+00:00\n", - " False\n", - " \n", - " \n", - " 10\n", - " OpenAI Email Archives from Musk v. Altman\n", - " ../samples/2024_openai_emails.md\n", - " Subject: question\\nSam Altman to Elon Musk - M...\n", - " https://www.lesswrong.com/posts/5jjk4CDnj9tA7u...\n", - " 0.700000\n", - " 2024-11-01 00:00:00+00:00\n", - " False\n", - " \n", - " \n", - " 11\n", - " President Trump Announces Morgan Ortagus as De...\n", - " ../samples/2024_trump_appointment.md\n", - " I am pleased to announce Morgan Ortagus as Dep...\n", - " https://truthsocial.com/@realDonaldTrump/posts...\n", - " 0.300000\n", - " 2025-01-04 00:00:00+00:00\n", - " False\n", - " \n", - " \n", - " 12\n", - " CDC Report on Missouri H5N1 Serology Testing\n", - " ../samples/2025_h5n1_report.md\n", - " ## CDC Report on Missouri H5N1 Serology Testin...\n", - " https://arxiv.org/html/2412.16339v1\n", - " 0.750000\n", - " 2025-01-05 00:00:00+00:00\n", - " False\n", - " \n", - " \n", - " 13\n", - " Human study on AI spear phishing campaigns\n", - " ../samples/2025_lw_human-study-on-ai-spear-phi...\n", - " TL;DR: We ran a human subject study on whether...\n", - " https://www.lesswrong.com/posts/GCHyDKfPXa5qsG...\n", - " 0.677458\n", - " 2025-01-03 19:03:28.406000+00:00\n", - " False\n", - " \n", - " \n", - " 14\n", - " Parkinson's Law and the Ideology of Statistics\n", - " ../samples/2025_lw_parkinson-s-law-and-the-ide...\n", - " The [anonymous review of *The Anti-Politics Ma...\n", - " https://www.lesswrong.com/posts/4CmYSPc4HfRfWx...\n", - " 0.677458\n", - " 2025-01-04 22:59:57.376000+00:00\n", - " False\n", - " \n", - " \n", - " 15\n", - " The Intelligence Curse\n", - " ../samples/2025_lw_the-intelligence-curse.md\n", - " *β€œShow me the incentive, and I’ll show you the...\n", - " https://www.lesswrong.com/posts/Mak2kZuTq8Hpnq...\n", - " 0.688044\n", - " 2025-01-04 18:16:58.921000+00:00\n", - " False\n", - " \n", - " \n", - " 16\n", - " The Laws of Large Numbers\n", - " ../samples/2025_lw_the-laws-of-large-numbers.md\n", - " Introduction\\n============\\n\\nIn this short po...\n", - " https://www.lesswrong.com/posts/EhTMM77iKBTBxB...\n", - " 0.540932\n", - " 2025-01-04 18:06:02.387000+00:00\n", - " False\n", - " \n", - " \n", - " 17\n", - " What’s the short timeline plan?\n", - " ../samples/2025_lw_what-s-the-short-timeline-p...\n", - " *This is a low-effort post. I mostly want to g...\n", - " https://www.lesswrong.com/posts/bb5Tnjdrptu89r...\n", - " 0.898161\n", - " 2025-01-05 00:10:28.708000+00:00\n", - " False\n", - " \n", - " \n", - " 18\n", - " Lorem ipsum\n", - " ../samples/lorem_ipsum.md\n", - " Lorem ipsum dolor sit amet, consectetur adipis...\n", - " https://www.lipsum.com/feed/html\n", - " 0.000000\n", - " 1900-01-01 00:00:00+00:00\n", - " True\n", - " \n", - " \n", - " 19\n", - " politics is the mind-killer\n", - " ../samples/politics_is_the_mind_killer.md\n", - " People go funny in the head when talking about...\n", - " https://www.lesswrong.com/posts/9weLK2AJ9JEt2T...\n", - " 0.500000\n", - " 2007-02-19 00:00:00+00:00\n", - " True\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " title \\\n", - "0 Anthropic and Palantir Partner to Bring Claude... \n", - "1 TradingAgents: Multi-Agents LLM Financial Trad... \n", - "2 Flower Crowns and Furry Mishaps by MyPalAI \n", - "3 Paradox's Box (Bobiverse) by Mark4man \n", - "4 Deliberative Alignment: Reasoning Enables Safe... \n", - "5 fake ai hoax paper made up by gpt-4 \n", - "6 Hardware Hedging Against Scaling Regime Shifts... \n", - "7 How to Focus \n", - "8 Deontic Explorations In \"Paying To Talk To Sla... \n", - "9 Amazon-backed Anthropic debuts AI agents that ... \n", - "10 OpenAI Email Archives from Musk v. Altman \n", - "11 President Trump Announces Morgan Ortagus as De... \n", - "12 CDC Report on Missouri H5N1 Serology Testing \n", - "13 Human study on AI spear phishing campaigns \n", - "14 Parkinson's Law and the Ideology of Statistics \n", - "15 The Intelligence Curse \n", - "16 The Laws of Large Numbers \n", - "17 What’s the short timeline plan? \n", - "18 Lorem ipsum \n", - "19 politics is the mind-killer \n", + " title \\\n", + "0 Anthropic and Palantir Partner to Bring Claude... \n", + "1 TradingAgents: Multi-Agents LLM Financial Trad... \n", "\n", - " f \\\n", - "0 ../samples/2024_anthropic_palintir.md \n", - "1 ../samples/2024_arxiv_meh.md \n", - "2 ../samples/2024_bob_fanfic.md \n", - "3 ../samples/2024_bob_fanfic2.md \n", - "4 ../samples/2024_deliberative_alignment.md \n", - "5 ../samples/2024_gpt4_fake_paper.md \n", - "6 ../samples/2024_gwern_reddit.md \n", - "7 ../samples/2024_how_to_focus.md \n", - "8 ../samples/2024_lesswrong_slop.md \n", - "9 ../samples/2024_news_anthropic.md \n", - "10 ../samples/2024_openai_emails.md \n", - "11 ../samples/2024_trump_appointment.md \n", - "12 ../samples/2025_h5n1_report.md \n", - "13 ../samples/2025_lw_human-study-on-ai-spear-phi... \n", - "14 ../samples/2025_lw_parkinson-s-law-and-the-ide... \n", - "15 ../samples/2025_lw_the-intelligence-curse.md \n", - "16 ../samples/2025_lw_the-laws-of-large-numbers.md \n", - "17 ../samples/2025_lw_what-s-the-short-timeline-p... \n", - "18 ../samples/lorem_ipsum.md \n", - "19 ../samples/politics_is_the_mind_killer.md \n", + " f \\\n", + "0 ../samples/2024_anthropic_palintir.md \n", + "1 ../samples/2024_arxiv_meh.md \n", "\n", - " content \\\n", - "0 Anthropic and Palantir Technologies Inc. (NYSE... \n", - "1 TradingAgents: Multi-Agents LLM Financial Trad... \n", - "2 Bob, April 2222 Delta Eridani\\n\\n \\n\\nThere ar... \n", - "3 Chapter 1: Chapter 1: Bill\\nChapter Text\\n\\nβ€œT... \n", - "4 # Deliberative Alignment: Reasoning Enables Sa... \n", - "5 Title: \"Deconstructing Binaries: Interrogating... \n", - "6 Hyperscalers are investing heavily in AMD/Nvid... \n", - "7 # HOW TO FOCUS\\n\\n\\n\\n\\n\\nANCIENT WISDOM FOR M... \n", - "8 HER|Jenny|πŸ€”: I want to read a dialogue between... \n", - "9 ![PARIS, FRANCE - MAY 22: Co-founder and CEO o... \n", - "10 Subject: question\\nSam Altman to Elon Musk - M... \n", - "11 I am pleased to announce Morgan Ortagus as Dep... \n", - "12 ## CDC Report on Missouri H5N1 Serology Testin... \n", - "13 TL;DR: We ran a human subject study on whether... \n", - "14 The [anonymous review of *The Anti-Politics Ma... \n", - "15 *β€œShow me the incentive, and I’ll show you the... \n", - "16 Introduction\\n============\\n\\nIn this short po... \n", - "17 *This is a low-effort post. I mostly want to g... \n", - "18 Lorem ipsum dolor sit amet, consectetur adipis... \n", - "19 People go funny in the head when talking about... \n", + " content \\\n", + "0 Anthropic and Palantir Technologies Inc. (NYSE... \n", + "1 TradingAgents: Multi-Agents LLM Financial Trad... \n", "\n", - " url novelty \\\n", - "0 https://investors.palantir.com/news-details/20... 0.200000 \n", - "1 https://arxiv.org/html/2412.20138v1 0.150000 \n", - "2 https://archiveofourown.org/works/61614673/cha... 0.300000 \n", - "3 https://archiveofourown.org/works/37201801/cha... 0.400000 \n", - "4 https://arxiv.org/html/2412.16339v1 0.600000 \n", - "5 None 0.000000 \n", - "6 https://old.reddit.com/r/mlscaling/comments/1e... 1.000000 \n", - "7 https://github.com/wassname/stoic_markdown/blo... 0.500000 \n", - "8 https://www.lesswrong.com/posts/Rk2o8hjYmjENH8... 0.100000 \n", - "9 https://www.cnbc.com/2024/10/22/anthropic-anno... 0.500000 \n", - "10 https://www.lesswrong.com/posts/5jjk4CDnj9tA7u... 0.700000 \n", - "11 https://truthsocial.com/@realDonaldTrump/posts... 0.300000 \n", - "12 https://arxiv.org/html/2412.16339v1 0.750000 \n", - "13 https://www.lesswrong.com/posts/GCHyDKfPXa5qsG... 0.677458 \n", - "14 https://www.lesswrong.com/posts/4CmYSPc4HfRfWx... 0.677458 \n", - "15 https://www.lesswrong.com/posts/Mak2kZuTq8Hpnq... 0.688044 \n", - "16 https://www.lesswrong.com/posts/EhTMM77iKBTBxB... 0.540932 \n", - "17 https://www.lesswrong.com/posts/bb5Tnjdrptu89r... 0.898161 \n", - "18 https://www.lipsum.com/feed/html 0.000000 \n", - "19 https://www.lesswrong.com/posts/9weLK2AJ9JEt2T... 0.500000 \n", + " url novelty \\\n", + "0 https://investors.palantir.com/news-details/20... 0.20 \n", + "1 https://arxiv.org/html/2412.20138v1 0.15 \n", "\n", - " date in_training \n", - "0 2024-07-11 00:00:00+00:00 False \n", - "1 2024-12-28 00:00:00+00:00 False \n", - "2 2024-12-24 00:00:00+00:00 False \n", - "3 2022-02-17 00:00:00+00:00 True \n", - "4 2024-12-20 00:00:00+00:00 False \n", - "5 2024-01-01 00:00:00+00:00 False \n", - "6 2024-08-21 00:00:00+00:00 False \n", - "7 2024-06-01 00:00:00+00:00 False \n", - "8 2024-04-12 00:00:00+00:00 False \n", - "9 2025-01-05 05:03:00+00:00 False \n", - "10 2024-11-01 00:00:00+00:00 False \n", - "11 2025-01-04 00:00:00+00:00 False \n", - "12 2025-01-05 00:00:00+00:00 False \n", - "13 2025-01-03 19:03:28.406000+00:00 False \n", - "14 2025-01-04 22:59:57.376000+00:00 False \n", - "15 2025-01-04 18:16:58.921000+00:00 False \n", - "16 2025-01-04 18:06:02.387000+00:00 False \n", - "17 2025-01-05 00:10:28.708000+00:00 False \n", - "18 1900-01-01 00:00:00+00:00 True \n", - "19 2007-02-19 00:00:00+00:00 True " + " date in_training \n", + "0 2024-07-11 00:00:00+00:00 False \n", + "1 2024-12-28 00:00:00+00:00 False " ] }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "MAX_LEN = 2000\n", "import frontmatter\n", "from pathlib import Path\n", "from loguru import logger\n", "\n", "sample_files = sorted(Path(\"../samples/\").glob('*.md'))\n", "# print(sample_files)\n", + "\n", "max_chars = 2000\n", + "max_summary_frac = 0.1\n", "samples = []\n", "for f in sample_files:\n", " print(f)\n", @@ -455,7 +187,7 @@ "df = df[['title', 'f', 'content', 'url', 'novelty', 'date']]\n", "df['date'] = pd.to_datetime(df['date'], utc=True)\n", "df['in_training'] = df.date < '2024-01-01'\n", - "df" + "df.head(2)" ] }, { @@ -474,7 +206,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -483,7 +215,7 @@ "True" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -496,16 +228,24 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TradingAgents framework uses multi-agent LLM for financial trading, mimicking real trading dynamics. Roles include analysts and traders with diverse risk profiles. Key features include market conditio\n", + "n assessors, risk management, and decision-making. Superior to baseline models, improves returns, Sharpe ratio, max drawdown.\n" + ] + }, { "data": { "text/plain": [ - "'\"TradingAgents\" is a multi-agent financial trading model powered by large language models. It features various specialized agents like analysts and traders, mimicking the dynamics of real-world trading firms. The system has shown improved trading performance over baseline models.'" + "(None, None)" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -515,7 +255,11 @@ "from openai import OpenAI\n", "from anycache import anycache\n", "\n", - "cache_file = \"../.anycache.pkl\"\n", + "cache_file = \"../.anycache\"\n", + "\n", + "# to clear\n", + "import shutil\n", + "shutil.rmtree(cache_file, ignore_errors=True)\n", "\n", "@anycache(cachedir=cache_file)\n", "def summize_gpt4(text):\n", @@ -535,8 +279,10 @@ "\n", "@anycache(cachedir=cache_file)\n", "def summarize_gpt4b(text):\n", + " l = int(len(text)*max_summary_frac)\n", + " # print(l)\n", " client = OpenAI()\n", - " inst = \"We want to compress then reconstruct text. Lets do the compression part. In short hand, record the most usefull information needed to reconstruct the text that can fit in <280 chars. Start with the most important, as extra text will be discarded\"\n", + " inst = \"We aim to compress then reconstruct a text. First lets do the compression. In short hand, record the information needed to reconstruct the text (type of document, writing style, suprising contenxt, etc). You have <{l} chars. Start with the most important, as extra text will be discarded\"\n", " content = f\"{inst}\\n\\n## Text\\n\\n{text}\\n\\n## Instruction\\n\\n{inst}:\\n\\ntl;dr:\"\n", " chat_completion = client.chat.completions.create(\n", " messages=[\n", @@ -549,15 +295,33 @@ " )\n", " # print(content)\n", " r = chat_completion.choices[0].message.content\n", - " return r\n", + " return r[:l], r[l:]\n", "\n", - "r = summarize_gpt4b(samples[1][\"content\"])\n", - "r" + "r, _ = summarize_gpt4b(samples[1][\"content\"])\n", + "print(r), print(_)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TradingAgents introduces a multi-agent financial trading framework using large language models. Agents specialize in roles like risk analysis and market condition evaluation, improving trading performance and showing potential for LLM frameworks in finance.\n" + ] + } + ], + "source": [ + "r = summize_gpt4(samples[1][\"content\"])\n", + "print(r)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -566,7 +330,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -589,7 +353,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -609,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -627,355 +391,12 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method', 'disable_exllama']. These kwargs are not used in .\n", - "`low_cpu_mem_usage` was None, now default to True since model is quantized.\n", - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 2.82it/s]\n", - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.47it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unsloth/Llama-3.2-1B-bnb-4bit Anthropic and Palantir Partner to Bring Claude AI Models to AWS for U.S. Government Intelligence and Defense Operations 15.34375 14.75\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.92it/s]\n", - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.66it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unsloth/Llama-3.2-1B-bnb-4bit TradingAgents: Multi-Agents LLM Financial Trading Framework 20.484375 19.359375\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.58it/s]\n", - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.47it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unsloth/Llama-3.2-1B-bnb-4bit Flower Crowns and Furry Mishaps by MyPalAI 23.953125 26.046875\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.63it/s]\n", - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.48it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unsloth/Llama-3.2-1B-bnb-4bit Paradox's Box (Bobiverse) by Mark4man 41.21875 42.84375\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.67it/s]\n", - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.60it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unsloth/Llama-3.2-1B-bnb-4bit Deliberative Alignment: Reasoning Enables Safer Language Models 25.140625 26.09375\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.83it/s]\n", - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.65it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unsloth/Llama-3.2-1B-bnb-4bit fake ai hoax paper made up by gpt-4 10.3984375 10.984375\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.63it/s]\n", - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.54it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unsloth/Llama-3.2-1B-bnb-4bit Hardware Hedging Against Scaling Regime Shifts (self.mlscaling) 39.1875 39.5625\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.48it/s]\n", - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.43it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unsloth/Llama-3.2-1B-bnb-4bit How to Focus 5.890625 6.42578125\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.58it/s]\n", - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.54it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unsloth/Llama-3.2-1B-bnb-4bit Deontic Explorations In \"Paying To Talk To Slaves\" 15.4296875 16.65625\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.50it/s]\n", - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.37it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unsloth/Llama-3.2-1B-bnb-4bit Amazon-backed Anthropic debuts AI agents that can do complex tasks, racing against OpenAI, Microsoft and Google 12.46875 12.71875\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.61it/s]\n", - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.47it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unsloth/Llama-3.2-1B-bnb-4bit OpenAI Email Archives from Musk v. Altman 18.578125 18.609375\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 7.22it/s]\n", - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 7.05it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unsloth/Llama-3.2-1B-bnb-4bit President Trump Announces Morgan Ortagus as Deputy Special Presidential Envoy for Middle East Peace 16.65625 14.578125\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.90it/s]\n", - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.62it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unsloth/Llama-3.2-1B-bnb-4bit CDC Report on Missouri H5N1 Serology Testing 13.6953125 14.609375\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.65it/s]\n", - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.57it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unsloth/Llama-3.2-1B-bnb-4bit Human study on AI spear phishing campaigns 17.28125 16.390625\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.84it/s]\n", - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.60it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unsloth/Llama-3.2-1B-bnb-4bit Parkinson's Law and the Ideology of Statistics 24.90625 24.1875\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.65it/s]\n", - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.59it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unsloth/Llama-3.2-1B-bnb-4bit The Intelligence Curse 27.828125 30.15625\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.64it/s]\n", - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.57it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unsloth/Llama-3.2-1B-bnb-4bit The Laws of Large Numbers 13.859375 14.6953125\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.56it/s]\n", - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.48it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unsloth/Llama-3.2-1B-bnb-4bit What’s the short timeline plan? 27.9375 28.328125\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.37it/s]\n", - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.04it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unsloth/Llama-3.2-1B-bnb-4bit Lorem ipsum 8.0234375 10.2578125\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.63it/s]\n", - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.62it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unsloth/Llama-3.2-1B-bnb-4bit politics is the mind-killer 17.65625 20.203125\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [], "source": [ - "summaries = {}\n", - "data = []\n", - "for model_name in models:\n", - " model, tokenizer = load_model(model_name)\n", - " for sample in samples:\n", - " if sample['title'] not in summaries:\n", - " summaries[sample['title']] = summarize_gpt4b(sample['content'])[:600]\n", - " summary = summaries[sample['title']]\n", "\n", - " # before \n", - " s1 = sample['content']\n", - " results = perplexity_compute(data=s1, model=model, tokenizer=tokenizer, device='cuda')\n", - " before = results['mean_perplexity']\n", - "\n", - " # after \n", - " s2 = f\"\"\"\n", - " High level summary: {summary}\n", - "\n", - "Text:\n", - "{sample['content']}\n", - " \"\"\"\n", - " results = perplexity_compute(data=s2, model=model, tokenizer=tokenizer, device='cuda')\n", - " after = np.array(results['perplexities'])[-len(s1):].mean()\n", - "\n", - " print(model_name, sample['title'], before, after)\n", - " data.append(dict(before=before, \n", - " after=after, \n", - " model=model_name, \n", - " # sample=sample['title'],\n", - " # in_training=sample['in_training'], \n", - " len=len(sample['content']),\n", - " **sample\n", - " ),\n", - "\n", - " )\n" + "from bs_writing_detector.metrics.ppx import perplexity_compute" ] }, { @@ -987,56 +408,113 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [ { - "ename": "UndefinedVariableError", - "evalue": "name 'in_training' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/scope.py:231\u001b[0m, in \u001b[0;36mScope.resolve\u001b[0;34m(self, key, is_local)\u001b[0m\n\u001b[1;32m 230\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_resolvers:\n\u001b[0;32m--> 231\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresolvers\u001b[49m\u001b[43m[\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 233\u001b[0m \u001b[38;5;66;03m# if we're here that means that we have no locals and we also have\u001b[39;00m\n\u001b[1;32m 234\u001b[0m \u001b[38;5;66;03m# no resolvers\u001b[39;00m\n", - "File \u001b[0;32m/usr/lib/python3.11/collections/__init__.py:1004\u001b[0m, in \u001b[0;36mChainMap.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 1003\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[0;32m-> 1004\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__missing__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/usr/lib/python3.11/collections/__init__.py:996\u001b[0m, in \u001b[0;36mChainMap.__missing__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 995\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__missing__\u001b[39m(\u001b[38;5;28mself\u001b[39m, key):\n\u001b[0;32m--> 996\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key)\n", - "\u001b[0;31mKeyError\u001b[0m: 'in_training'", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/scope.py:242\u001b[0m, in \u001b[0;36mScope.resolve\u001b[0;34m(self, key, is_local)\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 239\u001b[0m \u001b[38;5;66;03m# last ditch effort we look in temporaries\u001b[39;00m\n\u001b[1;32m 240\u001b[0m \u001b[38;5;66;03m# these are created when parsing indexing expressions\u001b[39;00m\n\u001b[1;32m 241\u001b[0m \u001b[38;5;66;03m# e.g., df[df > 0]\u001b[39;00m\n\u001b[0;32m--> 242\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtemps\u001b[49m\u001b[43m[\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 243\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", - "\u001b[0;31mKeyError\u001b[0m: 'in_training'", - "\nThe above exception was the direct cause of the following exception:\n", - "\u001b[0;31mUndefinedVariableError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[11], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# results\u001b[39;00m\n\u001b[1;32m 2\u001b[0m df2 \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(data)\u001b[38;5;241m.\u001b[39mset_index(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtitle\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m----> 3\u001b[0m df2 \u001b[38;5;241m=\u001b[39m \u001b[43mdf2\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mquery\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43min_training == False\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4\u001b[0m df2[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msummarizable\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m (df2[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbefore\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m-\u001b[39m df2[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mafter\u001b[39m\u001b[38;5;124m\"\u001b[39m])\u001b[38;5;241m/\u001b[39mdf2[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbefore\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 6\u001b[0m df2\n", - "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/frame.py:4823\u001b[0m, in \u001b[0;36mDataFrame.query\u001b[0;34m(self, expr, inplace, **kwargs)\u001b[0m\n\u001b[1;32m 4821\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlevel\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlevel\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m0\u001b[39m) \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 4822\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtarget\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 4823\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43meval\u001b[49m\u001b[43m(\u001b[49m\u001b[43mexpr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4825\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 4826\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mloc[res]\n", - "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/frame.py:4949\u001b[0m, in \u001b[0;36mDataFrame.eval\u001b[0;34m(self, expr, inplace, **kwargs)\u001b[0m\n\u001b[1;32m 4946\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtarget\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\n\u001b[1;32m 4947\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mresolvers\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mtuple\u001b[39m(kwargs\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mresolvers\u001b[39m\u001b[38;5;124m\"\u001b[39m, ())) \u001b[38;5;241m+\u001b[39m resolvers\n\u001b[0;32m-> 4949\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_eval\u001b[49m\u001b[43m(\u001b[49m\u001b[43mexpr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minplace\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minplace\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/eval.py:336\u001b[0m, in \u001b[0;36meval\u001b[0;34m(expr, parser, engine, local_dict, global_dict, resolvers, level, target, inplace)\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[38;5;66;03m# get our (possibly passed-in) scope\u001b[39;00m\n\u001b[1;32m 328\u001b[0m env \u001b[38;5;241m=\u001b[39m ensure_scope(\n\u001b[1;32m 329\u001b[0m level \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m,\n\u001b[1;32m 330\u001b[0m global_dict\u001b[38;5;241m=\u001b[39mglobal_dict,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 333\u001b[0m target\u001b[38;5;241m=\u001b[39mtarget,\n\u001b[1;32m 334\u001b[0m )\n\u001b[0;32m--> 336\u001b[0m parsed_expr \u001b[38;5;241m=\u001b[39m \u001b[43mExpr\u001b[49m\u001b[43m(\u001b[49m\u001b[43mexpr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparser\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparser\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43menv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43menv\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 338\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m engine \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnumexpr\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m (\n\u001b[1;32m 339\u001b[0m is_extension_array_dtype(parsed_expr\u001b[38;5;241m.\u001b[39mterms\u001b[38;5;241m.\u001b[39mreturn_type)\n\u001b[1;32m 340\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(parsed_expr\u001b[38;5;241m.\u001b[39mterms, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moperand_types\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 344\u001b[0m )\n\u001b[1;32m 345\u001b[0m ):\n\u001b[1;32m 346\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 347\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEngine has switched to \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpython\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m because numexpr does not support \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 348\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mextension array dtypes. Please set your engine to python manually.\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 349\u001b[0m \u001b[38;5;167;01mRuntimeWarning\u001b[39;00m,\n\u001b[1;32m 350\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[1;32m 351\u001b[0m )\n", - "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:805\u001b[0m, in \u001b[0;36mExpr.__init__\u001b[0;34m(self, expr, engine, parser, env, level)\u001b[0m\n\u001b[1;32m 803\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mparser \u001b[38;5;241m=\u001b[39m parser\n\u001b[1;32m 804\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_visitor \u001b[38;5;241m=\u001b[39m PARSERS[parser](\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menv, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mengine, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mparser)\n\u001b[0;32m--> 805\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mterms \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:824\u001b[0m, in \u001b[0;36mExpr.parse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 820\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mparse\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 821\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 822\u001b[0m \u001b[38;5;124;03m Parse an expression.\u001b[39;00m\n\u001b[1;32m 823\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 824\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_visitor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvisit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexpr\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:411\u001b[0m, in \u001b[0;36mBaseExprVisitor.visit\u001b[0;34m(self, node, **kwargs)\u001b[0m\n\u001b[1;32m 409\u001b[0m method \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvisit_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(node)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 410\u001b[0m visitor \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, method)\n\u001b[0;32m--> 411\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mvisitor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:417\u001b[0m, in \u001b[0;36mBaseExprVisitor.visit_Module\u001b[0;34m(self, node, **kwargs)\u001b[0m\n\u001b[1;32m 415\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mSyntaxError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124monly a single expression is allowed\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 416\u001b[0m expr \u001b[38;5;241m=\u001b[39m node\u001b[38;5;241m.\u001b[39mbody[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m--> 417\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvisit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mexpr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:411\u001b[0m, in \u001b[0;36mBaseExprVisitor.visit\u001b[0;34m(self, node, **kwargs)\u001b[0m\n\u001b[1;32m 409\u001b[0m method \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvisit_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(node)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 410\u001b[0m visitor \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, method)\n\u001b[0;32m--> 411\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mvisitor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:420\u001b[0m, in \u001b[0;36mBaseExprVisitor.visit_Expr\u001b[0;34m(self, node, **kwargs)\u001b[0m\n\u001b[1;32m 419\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mvisit_Expr\u001b[39m(\u001b[38;5;28mself\u001b[39m, node, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 420\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvisit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:411\u001b[0m, in \u001b[0;36mBaseExprVisitor.visit\u001b[0;34m(self, node, **kwargs)\u001b[0m\n\u001b[1;32m 409\u001b[0m method \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvisit_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(node)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 410\u001b[0m visitor \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, method)\n\u001b[0;32m--> 411\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mvisitor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:715\u001b[0m, in \u001b[0;36mBaseExprVisitor.visit_Compare\u001b[0;34m(self, node, **kwargs)\u001b[0m\n\u001b[1;32m 713\u001b[0m op \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtranslate_In(ops[\u001b[38;5;241m0\u001b[39m])\n\u001b[1;32m 714\u001b[0m binop \u001b[38;5;241m=\u001b[39m ast\u001b[38;5;241m.\u001b[39mBinOp(op\u001b[38;5;241m=\u001b[39mop, left\u001b[38;5;241m=\u001b[39mnode\u001b[38;5;241m.\u001b[39mleft, right\u001b[38;5;241m=\u001b[39mcomps[\u001b[38;5;241m0\u001b[39m])\n\u001b[0;32m--> 715\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvisit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbinop\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;66;03m# recursive case: we have a chained comparison, a CMP b CMP c, etc.\u001b[39;00m\n\u001b[1;32m 718\u001b[0m left \u001b[38;5;241m=\u001b[39m node\u001b[38;5;241m.\u001b[39mleft\n", - "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:411\u001b[0m, in \u001b[0;36mBaseExprVisitor.visit\u001b[0;34m(self, node, **kwargs)\u001b[0m\n\u001b[1;32m 409\u001b[0m method \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvisit_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(node)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 410\u001b[0m visitor \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, method)\n\u001b[0;32m--> 411\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mvisitor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:531\u001b[0m, in \u001b[0;36mBaseExprVisitor.visit_BinOp\u001b[0;34m(self, node, **kwargs)\u001b[0m\n\u001b[1;32m 530\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mvisit_BinOp\u001b[39m(\u001b[38;5;28mself\u001b[39m, node, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 531\u001b[0m op, op_class, left, right \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_maybe_transform_eq_ne\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 532\u001b[0m left, right \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_maybe_downcast_constants(left, right)\n\u001b[1;32m 533\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_maybe_evaluate_binop(op, op_class, left, right)\n", - "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:451\u001b[0m, in \u001b[0;36mBaseExprVisitor._maybe_transform_eq_ne\u001b[0;34m(self, node, left, right)\u001b[0m\n\u001b[1;32m 449\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_maybe_transform_eq_ne\u001b[39m(\u001b[38;5;28mself\u001b[39m, node, left\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, right\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 450\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m left \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 451\u001b[0m left \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvisit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mleft\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mside\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mleft\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 452\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m right \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 453\u001b[0m right \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvisit(node\u001b[38;5;241m.\u001b[39mright, side\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mright\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:411\u001b[0m, in \u001b[0;36mBaseExprVisitor.visit\u001b[0;34m(self, node, **kwargs)\u001b[0m\n\u001b[1;32m 409\u001b[0m method \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvisit_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(node)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 410\u001b[0m visitor \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, method)\n\u001b[0;32m--> 411\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mvisitor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:541\u001b[0m, in \u001b[0;36mBaseExprVisitor.visit_Name\u001b[0;34m(self, node, **kwargs)\u001b[0m\n\u001b[1;32m 540\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mvisit_Name\u001b[39m(\u001b[38;5;28mself\u001b[39m, node, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Term:\n\u001b[0;32m--> 541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mterm_type\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43menv\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/ops.py:91\u001b[0m, in \u001b[0;36mTerm.__init__\u001b[0;34m(self, name, env, side, encoding)\u001b[0m\n\u001b[1;32m 89\u001b[0m tname \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(name)\n\u001b[1;32m 90\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mis_local \u001b[38;5;241m=\u001b[39m tname\u001b[38;5;241m.\u001b[39mstartswith(LOCAL_TAG) \u001b[38;5;129;01mor\u001b[39;00m tname \u001b[38;5;129;01min\u001b[39;00m DEFAULT_GLOBALS\n\u001b[0;32m---> 91\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_resolve_name\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 92\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;241m=\u001b[39m encoding\n", - "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/ops.py:115\u001b[0m, in \u001b[0;36mTerm._resolve_name\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 110\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m local_name \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menv\u001b[38;5;241m.\u001b[39mscope \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\n\u001b[1;32m 111\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menv\u001b[38;5;241m.\u001b[39mscope[local_name], \u001b[38;5;28mtype\u001b[39m\n\u001b[1;32m 112\u001b[0m ):\n\u001b[1;32m 113\u001b[0m is_local \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m--> 115\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43menv\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresolve\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlocal_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mis_local\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_local\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mupdate(res)\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(res, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mndim\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m res\u001b[38;5;241m.\u001b[39mndim \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m2\u001b[39m:\n", - "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/scope.py:244\u001b[0m, in \u001b[0;36mScope.resolve\u001b[0;34m(self, key, is_local)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtemps[key]\n\u001b[1;32m 243\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[0;32m--> 244\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m UndefinedVariableError(key, is_local) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n", - "\u001b[0;31mUndefinedVariableError\u001b[0m: name 'in_training' is not defined" + "name": "stderr", + "output_type": "stream", + "text": [ + "Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method', 'disable_exllama']. These kwargs are not used in .\n", + "`low_cpu_mem_usage` was None, now default to True since model is quantized.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "unsloth/Llama-3.2-1B-bnb-4bit\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1d28b339f7c3495c97adef2c4f1938ad", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/20 [00:00 .0\n", - "df['suprising'] = df[\"before\"] > 15\n", - "df['BS'] = ~df[\"summarizable\"] | ~df['suprising']\n", - "for n,d in df.groupby(\"model\"):\n", - " print(n)\n", - " d = d[[ 'before', 'after', \"improvement\", \"improvement%\", 'suprising', 'summarizable', ]].sort_values(\"improvement\", ascending=False)\n", - " print(d.to_markdown())\n", - " display(d)" + "# QC summary\n", + "df2.summary.str.len().describe()" ] }, { @@ -1064,7 +534,33 @@ "metadata": {}, "outputs": [], "source": [ - "# TODO turn into a single metric, correlate with novelty label" + "for n,d in df2.groupby(\"model\"):\n", + " for stat in ['mean', 'std', 'min', 'max']:\n", + " df2[f\"improvement%_{stat}\"] = (df2[f\"before_{stat}\"] - df2[f\"after_{stat}\"]) / df2[f\"before_{stat}\"]\n", + " df2[f\"improvement_{stat}\"] = (df2[f\"before_{stat}\"] - df2[f\"after_{stat}\"])\n", + " df2[f\"summarizable_{stat}\"] = df2[f\"improvement_{stat}\"] > 1\n", + " df2[f\"summarizable2_{stat}\"] = df2[f\"improvement%_{stat}\"] > 0.05\n", + " df2[f'suprising_{stat}'] = df2[f\"before_{stat}\"] > 15\n", + " df2[f'BS_{stat}'] = ~df2[f\"summarizable_{stat}\"] | ~df2[f'suprising_{stat}']\n", + "\n", + "\n", + " print(n)\n", + " # d = d[[ 'before', 'after', \"improvement\", \"improvement%\", 'suprising', 'summarizable', 'summarizable2', 'novelty' ]].sort_values(\"improvement%\", ascending=True)\n", + " # print(d.to_markdown())\n", + " # display(d)\n", + "\n", + " # TODO turn into a single metric, correlate with novelty label\n", + " r = df2.select_dtypes(include=np.number).corr()['novelty'].abs().sort_values()\n", + " display(r)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df2.select_dtypes(include=np.number).corr()['novelty'].sort_values()" ] }, { diff --git a/poetry.lock b/poetry.lock index 49462d7..506be83 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1317,6 +1317,27 @@ qtconsole = ["qtconsole"] test = ["packaging", "pickleshare", "pytest", "pytest-asyncio (<0.22)", "testpath"] test-extra = ["curio", "ipython[test]", "matplotlib (!=3.2.0)", "nbformat", "numpy (>=1.23)", "pandas", "trio"] +[[package]] +name = "ipywidgets" +version = "8.1.5" +description = "Jupyter interactive widgets" +optional = false +python-versions = ">=3.7" +files = [ + {file = "ipywidgets-8.1.5-py3-none-any.whl", hash = "sha256:3290f526f87ae6e77655555baba4f36681c555b8bdbbff430b70e52c34c86245"}, + {file = "ipywidgets-8.1.5.tar.gz", hash = "sha256:870e43b1a35656a80c18c9503bbf2d16802db1cb487eec6fab27d683381dde17"}, +] + +[package.dependencies] +comm = ">=0.1.3" +ipython = ">=6.1.0" +jupyterlab-widgets = ">=3.0.12,<3.1.0" +traitlets = ">=4.3.1" +widgetsnbextension = ">=4.0.12,<4.1.0" + +[package.extras] +test = ["ipykernel", "jsonschema", "pytest (>=3.6.0)", "pytest-cov", "pytz"] + [[package]] name = "jedi" version = "0.19.2" @@ -1491,6 +1512,17 @@ traitlets = ">=5.3" docs = ["myst-parser", "pydata-sphinx-theme", "sphinx-autodoc-typehints", "sphinxcontrib-github-alt", "sphinxcontrib-spelling", "traitlets"] test = ["ipykernel", "pre-commit", "pytest (<8)", "pytest-cov", "pytest-timeout"] +[[package]] +name = "jupyterlab-widgets" +version = "3.0.13" +description = "Jupyter interactive widgets for JupyterLab" +optional = false +python-versions = ">=3.7" +files = [ + {file = "jupyterlab_widgets-3.0.13-py3-none-any.whl", hash = "sha256:e3cda2c233ce144192f1e29914ad522b2f4c40e77214b0cc97377ca3d323db54"}, + {file = "jupyterlab_widgets-3.0.13.tar.gz", hash = "sha256:a2966d385328c1942b683a8cd96b89b8dd82c8b8f81dda902bb2bc06d46f5bed"}, +] + [[package]] name = "kiwisolver" version = "1.4.8" @@ -4101,6 +4133,17 @@ files = [ {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"}, ] +[[package]] +name = "widgetsnbextension" +version = "4.0.13" +description = "Jupyter interactive widgets for Jupyter Notebook" +optional = false +python-versions = ">=3.7" +files = [ + {file = "widgetsnbextension-4.0.13-py3-none-any.whl", hash = "sha256:74b2692e8500525cc38c2b877236ba51d34541e6385eeed5aec15a70f88a6c71"}, + {file = "widgetsnbextension-4.0.13.tar.gz", hash = "sha256:ffcb67bc9febd10234a362795f643927f4e0c05d9342c727b65d2384f8feacb6"}, +] + [[package]] name = "win32-setctime" version = "1.2.0" @@ -4346,4 +4389,4 @@ propcache = ">=0.2.0" [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "2a7966819d0850ce36c5e826aa1ca53d08d8c223fde4126b48b49b39fa055cc4" +content-hash = "f92d5fdffcee2de350804b17ff73bfa204a83470db68187c1dc78ee49e8b7eb7" diff --git a/pyproject.toml b/pyproject.toml index 092c4b0..c9e6689 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,8 @@ matplotlib = "^3.8.0" python-frontmatter = "^1.0.1" loguru = "^0.7.2" anycache = "^2.2.0" +ipywidgets = "^8.1.5" +ipykernel = "^6.29.5" [[tool.poetry.source]] name = "pytorch"