diff --git a/.gitignore b/.gitignore
index 1d97f12..7a982aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
.env
lightning_logs/
outputs/
+.anycache/
*.arrow
squad_*
diff --git a/bs_writing_detector/metrics/ppx.py b/bs_writing_detector/metrics/ppx.py
index f8da515..3c4b732 100644
--- a/bs_writing_detector/metrics/ppx.py
+++ b/bs_writing_detector/metrics/ppx.py
@@ -66,9 +66,10 @@ def perplexity_compute(
), "When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings."
ppls = []
+ nlls = []
loss_fct = CrossEntropyLoss(reduction="none")
- for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):
+ for start_index in range(0, len(encoded_texts), batch_size):
end_index = min(start_index + batch_size, len(encoded_texts))
encoded_batch = encoded_texts[start_index:end_index]
attn_mask = attn_masks[start_index:end_index]
@@ -89,11 +90,14 @@ def perplexity_compute(
shift_labels = labels[..., 1:].contiguous()
shift_attention_mask_batch = attn_mask[..., 1:].contiguous()
- perplexity_batch = torch.exp(
- (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)
- / shift_attention_mask_batch.sum(1)
+ nll_batch = (
+ (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch)
)
+ # remove all the masked ones
+ nll_batch = nll_batch[shift_attention_mask_batch == 1][None, :] # FIXME only for batch_size=1
+ perplexity_batch = torch.exp(nll_batch.mean(1)).cpu().numpy()
ppls += perplexity_batch.tolist()
+ nlls += nll_batch.cpu().numpy().tolist()
- return {"perplexities": ppls, "mean_perplexity": np.mean(ppls)}
+ return {"perplexities": np.array(ppls), "nlls": np.array(nlls)}
diff --git a/nbs/02_detection_using_tldr_prompt.ipynb b/nbs/02_detection_using_tldr_prompt.ipynb
index c86e7eb..fe7d525 100644
--- a/nbs/02_detection_using_tldr_prompt.ipynb
+++ b/nbs/02_detection_using_tldr_prompt.ipynb
@@ -14,16 +14,17 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
- " from .autonotebook import tqdm as notebook_tqdm\n"
- ]
- }
- ],
+ "outputs": [],
+ "source": [
+ "%reload_ext autoreload\n",
+ "%autoreload 2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
"source": [
"import os\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
@@ -34,7 +35,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -54,7 +55,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -120,7 +121,7 @@
"
../samples/2024_anthropic_palintir.md | \n",
" Anthropic and Palantir Technologies Inc. (NYSE... | \n",
" https://investors.palantir.com/news-details/20... | \n",
- " 0.200000 | \n",
+ " 0.20 | \n",
" 2024-07-11 00:00:00+00:00 | \n",
" False | \n",
" \n",
@@ -130,320 +131,51 @@
" ../samples/2024_arxiv_meh.md | \n",
" TradingAgents: Multi-Agents LLM Financial Trad... | \n",
" https://arxiv.org/html/2412.20138v1 | \n",
- " 0.150000 | \n",
+ " 0.15 | \n",
" 2024-12-28 00:00:00+00:00 | \n",
" False | \n",
" \n",
- " \n",
- " | 2 | \n",
- " Flower Crowns and Furry Mishaps by MyPalAI | \n",
- " ../samples/2024_bob_fanfic.md | \n",
- " Bob, April 2222 Delta Eridani\\n\\n \\n\\nThere ar... | \n",
- " https://archiveofourown.org/works/61614673/cha... | \n",
- " 0.300000 | \n",
- " 2024-12-24 00:00:00+00:00 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " Paradox's Box (Bobiverse) by Mark4man | \n",
- " ../samples/2024_bob_fanfic2.md | \n",
- " Chapter 1: Chapter 1: Bill\\nChapter Text\\n\\nβT... | \n",
- " https://archiveofourown.org/works/37201801/cha... | \n",
- " 0.400000 | \n",
- " 2022-02-17 00:00:00+00:00 | \n",
- " True | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " Deliberative Alignment: Reasoning Enables Safe... | \n",
- " ../samples/2024_deliberative_alignment.md | \n",
- " # Deliberative Alignment: Reasoning Enables Sa... | \n",
- " https://arxiv.org/html/2412.16339v1 | \n",
- " 0.600000 | \n",
- " 2024-12-20 00:00:00+00:00 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 5 | \n",
- " fake ai hoax paper made up by gpt-4 | \n",
- " ../samples/2024_gpt4_fake_paper.md | \n",
- " Title: \"Deconstructing Binaries: Interrogating... | \n",
- " None | \n",
- " 0.000000 | \n",
- " 2024-01-01 00:00:00+00:00 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 6 | \n",
- " Hardware Hedging Against Scaling Regime Shifts... | \n",
- " ../samples/2024_gwern_reddit.md | \n",
- " Hyperscalers are investing heavily in AMD/Nvid... | \n",
- " https://old.reddit.com/r/mlscaling/comments/1e... | \n",
- " 1.000000 | \n",
- " 2024-08-21 00:00:00+00:00 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 7 | \n",
- " How to Focus | \n",
- " ../samples/2024_how_to_focus.md | \n",
- " # HOW TO FOCUS\\n\\n\\n\\n\\n\\nANCIENT WISDOM FOR M... | \n",
- " https://github.com/wassname/stoic_markdown/blo... | \n",
- " 0.500000 | \n",
- " 2024-06-01 00:00:00+00:00 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 8 | \n",
- " Deontic Explorations In \"Paying To Talk To Sla... | \n",
- " ../samples/2024_lesswrong_slop.md | \n",
- " HER|Jenny|π€: I want to read a dialogue between... | \n",
- " https://www.lesswrong.com/posts/Rk2o8hjYmjENH8... | \n",
- " 0.100000 | \n",
- " 2024-04-12 00:00:00+00:00 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 9 | \n",
- " Amazon-backed Anthropic debuts AI agents that ... | \n",
- " ../samples/2024_news_anthropic.md | \n",
- " ![PARIS, FRANCE - MAY 22: Co-founder and CEO o... | \n",
- " https://www.cnbc.com/2024/10/22/anthropic-anno... | \n",
- " 0.500000 | \n",
- " 2025-01-05 05:03:00+00:00 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 10 | \n",
- " OpenAI Email Archives from Musk v. Altman | \n",
- " ../samples/2024_openai_emails.md | \n",
- " Subject: question\\nSam Altman to Elon Musk - M... | \n",
- " https://www.lesswrong.com/posts/5jjk4CDnj9tA7u... | \n",
- " 0.700000 | \n",
- " 2024-11-01 00:00:00+00:00 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 11 | \n",
- " President Trump Announces Morgan Ortagus as De... | \n",
- " ../samples/2024_trump_appointment.md | \n",
- " I am pleased to announce Morgan Ortagus as Dep... | \n",
- " https://truthsocial.com/@realDonaldTrump/posts... | \n",
- " 0.300000 | \n",
- " 2025-01-04 00:00:00+00:00 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 12 | \n",
- " CDC Report on Missouri H5N1 Serology Testing | \n",
- " ../samples/2025_h5n1_report.md | \n",
- " ## CDC Report on Missouri H5N1 Serology Testin... | \n",
- " https://arxiv.org/html/2412.16339v1 | \n",
- " 0.750000 | \n",
- " 2025-01-05 00:00:00+00:00 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 13 | \n",
- " Human study on AI spear phishing campaigns | \n",
- " ../samples/2025_lw_human-study-on-ai-spear-phi... | \n",
- " TL;DR: We ran a human subject study on whether... | \n",
- " https://www.lesswrong.com/posts/GCHyDKfPXa5qsG... | \n",
- " 0.677458 | \n",
- " 2025-01-03 19:03:28.406000+00:00 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 14 | \n",
- " Parkinson's Law and the Ideology of Statistics | \n",
- " ../samples/2025_lw_parkinson-s-law-and-the-ide... | \n",
- " The [anonymous review of *The Anti-Politics Ma... | \n",
- " https://www.lesswrong.com/posts/4CmYSPc4HfRfWx... | \n",
- " 0.677458 | \n",
- " 2025-01-04 22:59:57.376000+00:00 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 15 | \n",
- " The Intelligence Curse | \n",
- " ../samples/2025_lw_the-intelligence-curse.md | \n",
- " *βShow me the incentive, and Iβll show you the... | \n",
- " https://www.lesswrong.com/posts/Mak2kZuTq8Hpnq... | \n",
- " 0.688044 | \n",
- " 2025-01-04 18:16:58.921000+00:00 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 16 | \n",
- " The Laws of Large Numbers | \n",
- " ../samples/2025_lw_the-laws-of-large-numbers.md | \n",
- " Introduction\\n============\\n\\nIn this short po... | \n",
- " https://www.lesswrong.com/posts/EhTMM77iKBTBxB... | \n",
- " 0.540932 | \n",
- " 2025-01-04 18:06:02.387000+00:00 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 17 | \n",
- " Whatβs the short timeline plan? | \n",
- " ../samples/2025_lw_what-s-the-short-timeline-p... | \n",
- " *This is a low-effort post. I mostly want to g... | \n",
- " https://www.lesswrong.com/posts/bb5Tnjdrptu89r... | \n",
- " 0.898161 | \n",
- " 2025-01-05 00:10:28.708000+00:00 | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 18 | \n",
- " Lorem ipsum | \n",
- " ../samples/lorem_ipsum.md | \n",
- " Lorem ipsum dolor sit amet, consectetur adipis... | \n",
- " https://www.lipsum.com/feed/html | \n",
- " 0.000000 | \n",
- " 1900-01-01 00:00:00+00:00 | \n",
- " True | \n",
- "
\n",
- " \n",
- " | 19 | \n",
- " politics is the mind-killer | \n",
- " ../samples/politics_is_the_mind_killer.md | \n",
- " People go funny in the head when talking about... | \n",
- " https://www.lesswrong.com/posts/9weLK2AJ9JEt2T... | \n",
- " 0.500000 | \n",
- " 2007-02-19 00:00:00+00:00 | \n",
- " True | \n",
- "
\n",
" \n",
"\n",
""
],
"text/plain": [
- " title \\\n",
- "0 Anthropic and Palantir Partner to Bring Claude... \n",
- "1 TradingAgents: Multi-Agents LLM Financial Trad... \n",
- "2 Flower Crowns and Furry Mishaps by MyPalAI \n",
- "3 Paradox's Box (Bobiverse) by Mark4man \n",
- "4 Deliberative Alignment: Reasoning Enables Safe... \n",
- "5 fake ai hoax paper made up by gpt-4 \n",
- "6 Hardware Hedging Against Scaling Regime Shifts... \n",
- "7 How to Focus \n",
- "8 Deontic Explorations In \"Paying To Talk To Sla... \n",
- "9 Amazon-backed Anthropic debuts AI agents that ... \n",
- "10 OpenAI Email Archives from Musk v. Altman \n",
- "11 President Trump Announces Morgan Ortagus as De... \n",
- "12 CDC Report on Missouri H5N1 Serology Testing \n",
- "13 Human study on AI spear phishing campaigns \n",
- "14 Parkinson's Law and the Ideology of Statistics \n",
- "15 The Intelligence Curse \n",
- "16 The Laws of Large Numbers \n",
- "17 Whatβs the short timeline plan? \n",
- "18 Lorem ipsum \n",
- "19 politics is the mind-killer \n",
+ " title \\\n",
+ "0 Anthropic and Palantir Partner to Bring Claude... \n",
+ "1 TradingAgents: Multi-Agents LLM Financial Trad... \n",
"\n",
- " f \\\n",
- "0 ../samples/2024_anthropic_palintir.md \n",
- "1 ../samples/2024_arxiv_meh.md \n",
- "2 ../samples/2024_bob_fanfic.md \n",
- "3 ../samples/2024_bob_fanfic2.md \n",
- "4 ../samples/2024_deliberative_alignment.md \n",
- "5 ../samples/2024_gpt4_fake_paper.md \n",
- "6 ../samples/2024_gwern_reddit.md \n",
- "7 ../samples/2024_how_to_focus.md \n",
- "8 ../samples/2024_lesswrong_slop.md \n",
- "9 ../samples/2024_news_anthropic.md \n",
- "10 ../samples/2024_openai_emails.md \n",
- "11 ../samples/2024_trump_appointment.md \n",
- "12 ../samples/2025_h5n1_report.md \n",
- "13 ../samples/2025_lw_human-study-on-ai-spear-phi... \n",
- "14 ../samples/2025_lw_parkinson-s-law-and-the-ide... \n",
- "15 ../samples/2025_lw_the-intelligence-curse.md \n",
- "16 ../samples/2025_lw_the-laws-of-large-numbers.md \n",
- "17 ../samples/2025_lw_what-s-the-short-timeline-p... \n",
- "18 ../samples/lorem_ipsum.md \n",
- "19 ../samples/politics_is_the_mind_killer.md \n",
+ " f \\\n",
+ "0 ../samples/2024_anthropic_palintir.md \n",
+ "1 ../samples/2024_arxiv_meh.md \n",
"\n",
- " content \\\n",
- "0 Anthropic and Palantir Technologies Inc. (NYSE... \n",
- "1 TradingAgents: Multi-Agents LLM Financial Trad... \n",
- "2 Bob, April 2222 Delta Eridani\\n\\n \\n\\nThere ar... \n",
- "3 Chapter 1: Chapter 1: Bill\\nChapter Text\\n\\nβT... \n",
- "4 # Deliberative Alignment: Reasoning Enables Sa... \n",
- "5 Title: \"Deconstructing Binaries: Interrogating... \n",
- "6 Hyperscalers are investing heavily in AMD/Nvid... \n",
- "7 # HOW TO FOCUS\\n\\n\\n\\n\\n\\nANCIENT WISDOM FOR M... \n",
- "8 HER|Jenny|π€: I want to read a dialogue between... \n",
- "9 ![PARIS, FRANCE - MAY 22: Co-founder and CEO o... \n",
- "10 Subject: question\\nSam Altman to Elon Musk - M... \n",
- "11 I am pleased to announce Morgan Ortagus as Dep... \n",
- "12 ## CDC Report on Missouri H5N1 Serology Testin... \n",
- "13 TL;DR: We ran a human subject study on whether... \n",
- "14 The [anonymous review of *The Anti-Politics Ma... \n",
- "15 *βShow me the incentive, and Iβll show you the... \n",
- "16 Introduction\\n============\\n\\nIn this short po... \n",
- "17 *This is a low-effort post. I mostly want to g... \n",
- "18 Lorem ipsum dolor sit amet, consectetur adipis... \n",
- "19 People go funny in the head when talking about... \n",
+ " content \\\n",
+ "0 Anthropic and Palantir Technologies Inc. (NYSE... \n",
+ "1 TradingAgents: Multi-Agents LLM Financial Trad... \n",
"\n",
- " url novelty \\\n",
- "0 https://investors.palantir.com/news-details/20... 0.200000 \n",
- "1 https://arxiv.org/html/2412.20138v1 0.150000 \n",
- "2 https://archiveofourown.org/works/61614673/cha... 0.300000 \n",
- "3 https://archiveofourown.org/works/37201801/cha... 0.400000 \n",
- "4 https://arxiv.org/html/2412.16339v1 0.600000 \n",
- "5 None 0.000000 \n",
- "6 https://old.reddit.com/r/mlscaling/comments/1e... 1.000000 \n",
- "7 https://github.com/wassname/stoic_markdown/blo... 0.500000 \n",
- "8 https://www.lesswrong.com/posts/Rk2o8hjYmjENH8... 0.100000 \n",
- "9 https://www.cnbc.com/2024/10/22/anthropic-anno... 0.500000 \n",
- "10 https://www.lesswrong.com/posts/5jjk4CDnj9tA7u... 0.700000 \n",
- "11 https://truthsocial.com/@realDonaldTrump/posts... 0.300000 \n",
- "12 https://arxiv.org/html/2412.16339v1 0.750000 \n",
- "13 https://www.lesswrong.com/posts/GCHyDKfPXa5qsG... 0.677458 \n",
- "14 https://www.lesswrong.com/posts/4CmYSPc4HfRfWx... 0.677458 \n",
- "15 https://www.lesswrong.com/posts/Mak2kZuTq8Hpnq... 0.688044 \n",
- "16 https://www.lesswrong.com/posts/EhTMM77iKBTBxB... 0.540932 \n",
- "17 https://www.lesswrong.com/posts/bb5Tnjdrptu89r... 0.898161 \n",
- "18 https://www.lipsum.com/feed/html 0.000000 \n",
- "19 https://www.lesswrong.com/posts/9weLK2AJ9JEt2T... 0.500000 \n",
+ " url novelty \\\n",
+ "0 https://investors.palantir.com/news-details/20... 0.20 \n",
+ "1 https://arxiv.org/html/2412.20138v1 0.15 \n",
"\n",
- " date in_training \n",
- "0 2024-07-11 00:00:00+00:00 False \n",
- "1 2024-12-28 00:00:00+00:00 False \n",
- "2 2024-12-24 00:00:00+00:00 False \n",
- "3 2022-02-17 00:00:00+00:00 True \n",
- "4 2024-12-20 00:00:00+00:00 False \n",
- "5 2024-01-01 00:00:00+00:00 False \n",
- "6 2024-08-21 00:00:00+00:00 False \n",
- "7 2024-06-01 00:00:00+00:00 False \n",
- "8 2024-04-12 00:00:00+00:00 False \n",
- "9 2025-01-05 05:03:00+00:00 False \n",
- "10 2024-11-01 00:00:00+00:00 False \n",
- "11 2025-01-04 00:00:00+00:00 False \n",
- "12 2025-01-05 00:00:00+00:00 False \n",
- "13 2025-01-03 19:03:28.406000+00:00 False \n",
- "14 2025-01-04 22:59:57.376000+00:00 False \n",
- "15 2025-01-04 18:16:58.921000+00:00 False \n",
- "16 2025-01-04 18:06:02.387000+00:00 False \n",
- "17 2025-01-05 00:10:28.708000+00:00 False \n",
- "18 1900-01-01 00:00:00+00:00 True \n",
- "19 2007-02-19 00:00:00+00:00 True "
+ " date in_training \n",
+ "0 2024-07-11 00:00:00+00:00 False \n",
+ "1 2024-12-28 00:00:00+00:00 False "
]
},
- "execution_count": 3,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "MAX_LEN = 2000\n",
"import frontmatter\n",
"from pathlib import Path\n",
"from loguru import logger\n",
"\n",
"sample_files = sorted(Path(\"../samples/\").glob('*.md'))\n",
"# print(sample_files)\n",
+ "\n",
"max_chars = 2000\n",
+ "max_summary_frac = 0.1\n",
"samples = []\n",
"for f in sample_files:\n",
" print(f)\n",
@@ -455,7 +187,7 @@
"df = df[['title', 'f', 'content', 'url', 'novelty', 'date']]\n",
"df['date'] = pd.to_datetime(df['date'], utc=True)\n",
"df['in_training'] = df.date < '2024-01-01'\n",
- "df"
+ "df.head(2)"
]
},
{
@@ -474,7 +206,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -483,7 +215,7 @@
"True"
]
},
- "execution_count": 4,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -496,16 +228,24 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 6,
"metadata": {},
"outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "TradingAgents framework uses multi-agent LLM for financial trading, mimicking real trading dynamics. Roles include analysts and traders with diverse risk profiles. Key features include market conditio\n",
+ "n assessors, risk management, and decision-making. Superior to baseline models, improves returns, Sharpe ratio, max drawdown.\n"
+ ]
+ },
{
"data": {
"text/plain": [
- "'\"TradingAgents\" is a multi-agent financial trading model powered by large language models. It features various specialized agents like analysts and traders, mimicking the dynamics of real-world trading firms. The system has shown improved trading performance over baseline models.'"
+ "(None, None)"
]
},
- "execution_count": 5,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -515,7 +255,11 @@
"from openai import OpenAI\n",
"from anycache import anycache\n",
"\n",
- "cache_file = \"../.anycache.pkl\"\n",
+ "cache_file = \"../.anycache\"\n",
+ "\n",
+ "# to clear\n",
+ "import shutil\n",
+ "shutil.rmtree(cache_file, ignore_errors=True)\n",
"\n",
"@anycache(cachedir=cache_file)\n",
"def summize_gpt4(text):\n",
@@ -535,8 +279,10 @@
"\n",
"@anycache(cachedir=cache_file)\n",
"def summarize_gpt4b(text):\n",
+ " l = int(len(text)*max_summary_frac)\n",
+ " # print(l)\n",
" client = OpenAI()\n",
- " inst = \"We want to compress then reconstruct text. Lets do the compression part. In short hand, record the most usefull information needed to reconstruct the text that can fit in <280 chars. Start with the most important, as extra text will be discarded\"\n",
+ " inst = \"We aim to compress then reconstruct a text. First lets do the compression. In short hand, record the information needed to reconstruct the text (type of document, writing style, suprising contenxt, etc). You have <{l} chars. Start with the most important, as extra text will be discarded\"\n",
" content = f\"{inst}\\n\\n## Text\\n\\n{text}\\n\\n## Instruction\\n\\n{inst}:\\n\\ntl;dr:\"\n",
" chat_completion = client.chat.completions.create(\n",
" messages=[\n",
@@ -549,15 +295,33 @@
" )\n",
" # print(content)\n",
" r = chat_completion.choices[0].message.content\n",
- " return r\n",
+ " return r[:l], r[l:]\n",
"\n",
- "r = summarize_gpt4b(samples[1][\"content\"])\n",
- "r"
+ "r, _ = summarize_gpt4b(samples[1][\"content\"])\n",
+ "print(r), print(_)"
]
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "TradingAgents introduces a multi-agent financial trading framework using large language models. Agents specialize in roles like risk analysis and market condition evaluation, improving trading performance and showing potential for LLM frameworks in finance.\n"
+ ]
+ }
+ ],
+ "source": [
+ "r = summize_gpt4(samples[1][\"content\"])\n",
+ "print(r)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -566,7 +330,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -589,7 +353,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@@ -609,7 +373,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -627,355 +391,12 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 12,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method', 'disable_exllama']. These kwargs are not used in .\n",
- "`low_cpu_mem_usage` was None, now default to True since model is quantized.\n",
- "100%|ββββββββββ| 1/1 [00:00<00:00, 2.82it/s]\n",
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.47it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "unsloth/Llama-3.2-1B-bnb-4bit Anthropic and Palantir Partner to Bring Claude AI Models to AWS for U.S. Government Intelligence and Defense Operations 15.34375 14.75\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.92it/s]\n",
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.66it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "unsloth/Llama-3.2-1B-bnb-4bit TradingAgents: Multi-Agents LLM Financial Trading Framework 20.484375 19.359375\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.58it/s]\n",
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.47it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "unsloth/Llama-3.2-1B-bnb-4bit Flower Crowns and Furry Mishaps by MyPalAI 23.953125 26.046875\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.63it/s]\n",
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.48it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "unsloth/Llama-3.2-1B-bnb-4bit Paradox's Box (Bobiverse) by Mark4man 41.21875 42.84375\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.67it/s]\n",
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.60it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "unsloth/Llama-3.2-1B-bnb-4bit Deliberative Alignment: Reasoning Enables Safer Language Models 25.140625 26.09375\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.83it/s]\n",
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.65it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "unsloth/Llama-3.2-1B-bnb-4bit fake ai hoax paper made up by gpt-4 10.3984375 10.984375\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.63it/s]\n",
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.54it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "unsloth/Llama-3.2-1B-bnb-4bit Hardware Hedging Against Scaling Regime Shifts (self.mlscaling) 39.1875 39.5625\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.48it/s]\n",
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.43it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "unsloth/Llama-3.2-1B-bnb-4bit How to Focus 5.890625 6.42578125\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.58it/s]\n",
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.54it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "unsloth/Llama-3.2-1B-bnb-4bit Deontic Explorations In \"Paying To Talk To Slaves\" 15.4296875 16.65625\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.50it/s]\n",
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.37it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "unsloth/Llama-3.2-1B-bnb-4bit Amazon-backed Anthropic debuts AI agents that can do complex tasks, racing against OpenAI, Microsoft and Google 12.46875 12.71875\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.61it/s]\n",
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.47it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "unsloth/Llama-3.2-1B-bnb-4bit OpenAI Email Archives from Musk v. Altman 18.578125 18.609375\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|ββββββββββ| 1/1 [00:00<00:00, 7.22it/s]\n",
- "100%|ββββββββββ| 1/1 [00:00<00:00, 7.05it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "unsloth/Llama-3.2-1B-bnb-4bit President Trump Announces Morgan Ortagus as Deputy Special Presidential Envoy for Middle East Peace 16.65625 14.578125\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.90it/s]\n",
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.62it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "unsloth/Llama-3.2-1B-bnb-4bit CDC Report on Missouri H5N1 Serology Testing 13.6953125 14.609375\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.65it/s]\n",
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.57it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "unsloth/Llama-3.2-1B-bnb-4bit Human study on AI spear phishing campaigns 17.28125 16.390625\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.84it/s]\n",
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.60it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "unsloth/Llama-3.2-1B-bnb-4bit Parkinson's Law and the Ideology of Statistics 24.90625 24.1875\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.65it/s]\n",
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.59it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "unsloth/Llama-3.2-1B-bnb-4bit The Intelligence Curse 27.828125 30.15625\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.64it/s]\n",
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.57it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "unsloth/Llama-3.2-1B-bnb-4bit The Laws of Large Numbers 13.859375 14.6953125\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.56it/s]\n",
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.48it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "unsloth/Llama-3.2-1B-bnb-4bit Whatβs the short timeline plan? 27.9375 28.328125\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.37it/s]\n",
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.04it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "unsloth/Llama-3.2-1B-bnb-4bit Lorem ipsum 8.0234375 10.2578125\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.63it/s]\n",
- "100%|ββββββββββ| 1/1 [00:00<00:00, 6.62it/s]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "unsloth/Llama-3.2-1B-bnb-4bit politics is the mind-killer 17.65625 20.203125\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "summaries = {}\n",
- "data = []\n",
- "for model_name in models:\n",
- " model, tokenizer = load_model(model_name)\n",
- " for sample in samples:\n",
- " if sample['title'] not in summaries:\n",
- " summaries[sample['title']] = summarize_gpt4b(sample['content'])[:600]\n",
- " summary = summaries[sample['title']]\n",
"\n",
- " # before \n",
- " s1 = sample['content']\n",
- " results = perplexity_compute(data=s1, model=model, tokenizer=tokenizer, device='cuda')\n",
- " before = results['mean_perplexity']\n",
- "\n",
- " # after \n",
- " s2 = f\"\"\"\n",
- " High level summary: {summary}\n",
- "\n",
- "Text:\n",
- "{sample['content']}\n",
- " \"\"\"\n",
- " results = perplexity_compute(data=s2, model=model, tokenizer=tokenizer, device='cuda')\n",
- " after = np.array(results['perplexities'])[-len(s1):].mean()\n",
- "\n",
- " print(model_name, sample['title'], before, after)\n",
- " data.append(dict(before=before, \n",
- " after=after, \n",
- " model=model_name, \n",
- " # sample=sample['title'],\n",
- " # in_training=sample['in_training'], \n",
- " len=len(sample['content']),\n",
- " **sample\n",
- " ),\n",
- "\n",
- " )\n"
+ "from bs_writing_detector.metrics.ppx import perplexity_compute"
]
},
{
@@ -987,56 +408,113 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": null,
"metadata": {},
"outputs": [
{
- "ename": "UndefinedVariableError",
- "evalue": "name 'in_training' is not defined",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
- "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/scope.py:231\u001b[0m, in \u001b[0;36mScope.resolve\u001b[0;34m(self, key, is_local)\u001b[0m\n\u001b[1;32m 230\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_resolvers:\n\u001b[0;32m--> 231\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresolvers\u001b[49m\u001b[43m[\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 233\u001b[0m \u001b[38;5;66;03m# if we're here that means that we have no locals and we also have\u001b[39;00m\n\u001b[1;32m 234\u001b[0m \u001b[38;5;66;03m# no resolvers\u001b[39;00m\n",
- "File \u001b[0;32m/usr/lib/python3.11/collections/__init__.py:1004\u001b[0m, in \u001b[0;36mChainMap.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 1003\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[0;32m-> 1004\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__missing__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n",
- "File \u001b[0;32m/usr/lib/python3.11/collections/__init__.py:996\u001b[0m, in \u001b[0;36mChainMap.__missing__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 995\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__missing__\u001b[39m(\u001b[38;5;28mself\u001b[39m, key):\n\u001b[0;32m--> 996\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key)\n",
- "\u001b[0;31mKeyError\u001b[0m: 'in_training'",
- "\nDuring handling of the above exception, another exception occurred:\n",
- "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
- "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/scope.py:242\u001b[0m, in \u001b[0;36mScope.resolve\u001b[0;34m(self, key, is_local)\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 239\u001b[0m \u001b[38;5;66;03m# last ditch effort we look in temporaries\u001b[39;00m\n\u001b[1;32m 240\u001b[0m \u001b[38;5;66;03m# these are created when parsing indexing expressions\u001b[39;00m\n\u001b[1;32m 241\u001b[0m \u001b[38;5;66;03m# e.g., df[df > 0]\u001b[39;00m\n\u001b[0;32m--> 242\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtemps\u001b[49m\u001b[43m[\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 243\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n",
- "\u001b[0;31mKeyError\u001b[0m: 'in_training'",
- "\nThe above exception was the direct cause of the following exception:\n",
- "\u001b[0;31mUndefinedVariableError\u001b[0m Traceback (most recent call last)",
- "Cell \u001b[0;32mIn[11], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# results\u001b[39;00m\n\u001b[1;32m 2\u001b[0m df2 \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(data)\u001b[38;5;241m.\u001b[39mset_index(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtitle\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m----> 3\u001b[0m df2 \u001b[38;5;241m=\u001b[39m \u001b[43mdf2\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mquery\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43min_training == False\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4\u001b[0m df2[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msummarizable\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m (df2[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbefore\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m-\u001b[39m df2[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mafter\u001b[39m\u001b[38;5;124m\"\u001b[39m])\u001b[38;5;241m/\u001b[39mdf2[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbefore\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 6\u001b[0m df2\n",
- "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/frame.py:4823\u001b[0m, in \u001b[0;36mDataFrame.query\u001b[0;34m(self, expr, inplace, **kwargs)\u001b[0m\n\u001b[1;32m 4821\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlevel\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlevel\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m0\u001b[39m) \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 4822\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtarget\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 4823\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43meval\u001b[49m\u001b[43m(\u001b[49m\u001b[43mexpr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4825\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 4826\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mloc[res]\n",
- "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/frame.py:4949\u001b[0m, in \u001b[0;36mDataFrame.eval\u001b[0;34m(self, expr, inplace, **kwargs)\u001b[0m\n\u001b[1;32m 4946\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtarget\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\n\u001b[1;32m 4947\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mresolvers\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mtuple\u001b[39m(kwargs\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mresolvers\u001b[39m\u001b[38;5;124m\"\u001b[39m, ())) \u001b[38;5;241m+\u001b[39m resolvers\n\u001b[0;32m-> 4949\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_eval\u001b[49m\u001b[43m(\u001b[49m\u001b[43mexpr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minplace\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minplace\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
- "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/eval.py:336\u001b[0m, in \u001b[0;36meval\u001b[0;34m(expr, parser, engine, local_dict, global_dict, resolvers, level, target, inplace)\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[38;5;66;03m# get our (possibly passed-in) scope\u001b[39;00m\n\u001b[1;32m 328\u001b[0m env \u001b[38;5;241m=\u001b[39m ensure_scope(\n\u001b[1;32m 329\u001b[0m level \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m,\n\u001b[1;32m 330\u001b[0m global_dict\u001b[38;5;241m=\u001b[39mglobal_dict,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 333\u001b[0m target\u001b[38;5;241m=\u001b[39mtarget,\n\u001b[1;32m 334\u001b[0m )\n\u001b[0;32m--> 336\u001b[0m parsed_expr \u001b[38;5;241m=\u001b[39m \u001b[43mExpr\u001b[49m\u001b[43m(\u001b[49m\u001b[43mexpr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparser\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparser\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43menv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43menv\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 338\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m engine \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnumexpr\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m (\n\u001b[1;32m 339\u001b[0m is_extension_array_dtype(parsed_expr\u001b[38;5;241m.\u001b[39mterms\u001b[38;5;241m.\u001b[39mreturn_type)\n\u001b[1;32m 340\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(parsed_expr\u001b[38;5;241m.\u001b[39mterms, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moperand_types\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m 
\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 344\u001b[0m )\n\u001b[1;32m 345\u001b[0m ):\n\u001b[1;32m 346\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 347\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEngine has switched to \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpython\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m because numexpr does not support \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 348\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mextension array dtypes. Please set your engine to python manually.\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 349\u001b[0m \u001b[38;5;167;01mRuntimeWarning\u001b[39;00m,\n\u001b[1;32m 350\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[1;32m 351\u001b[0m )\n",
- "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:805\u001b[0m, in \u001b[0;36mExpr.__init__\u001b[0;34m(self, expr, engine, parser, env, level)\u001b[0m\n\u001b[1;32m 803\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mparser \u001b[38;5;241m=\u001b[39m parser\n\u001b[1;32m 804\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_visitor \u001b[38;5;241m=\u001b[39m PARSERS[parser](\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menv, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mengine, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mparser)\n\u001b[0;32m--> 805\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mterms \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
- "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:824\u001b[0m, in \u001b[0;36mExpr.parse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 820\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mparse\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 821\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 822\u001b[0m \u001b[38;5;124;03m Parse an expression.\u001b[39;00m\n\u001b[1;32m 823\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 824\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_visitor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvisit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexpr\u001b[49m\u001b[43m)\u001b[49m\n",
- "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:411\u001b[0m, in \u001b[0;36mBaseExprVisitor.visit\u001b[0;34m(self, node, **kwargs)\u001b[0m\n\u001b[1;32m 409\u001b[0m method \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvisit_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(node)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 410\u001b[0m visitor \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, method)\n\u001b[0;32m--> 411\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mvisitor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
- "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:417\u001b[0m, in \u001b[0;36mBaseExprVisitor.visit_Module\u001b[0;34m(self, node, **kwargs)\u001b[0m\n\u001b[1;32m 415\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mSyntaxError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124monly a single expression is allowed\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 416\u001b[0m expr \u001b[38;5;241m=\u001b[39m node\u001b[38;5;241m.\u001b[39mbody[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m--> 417\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvisit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mexpr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
- "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:411\u001b[0m, in \u001b[0;36mBaseExprVisitor.visit\u001b[0;34m(self, node, **kwargs)\u001b[0m\n\u001b[1;32m 409\u001b[0m method \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvisit_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(node)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 410\u001b[0m visitor \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, method)\n\u001b[0;32m--> 411\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mvisitor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
- "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:420\u001b[0m, in \u001b[0;36mBaseExprVisitor.visit_Expr\u001b[0;34m(self, node, **kwargs)\u001b[0m\n\u001b[1;32m 419\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mvisit_Expr\u001b[39m(\u001b[38;5;28mself\u001b[39m, node, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 420\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvisit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
- "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:411\u001b[0m, in \u001b[0;36mBaseExprVisitor.visit\u001b[0;34m(self, node, **kwargs)\u001b[0m\n\u001b[1;32m 409\u001b[0m method \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvisit_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(node)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 410\u001b[0m visitor \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, method)\n\u001b[0;32m--> 411\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mvisitor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
- "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:715\u001b[0m, in \u001b[0;36mBaseExprVisitor.visit_Compare\u001b[0;34m(self, node, **kwargs)\u001b[0m\n\u001b[1;32m 713\u001b[0m op \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtranslate_In(ops[\u001b[38;5;241m0\u001b[39m])\n\u001b[1;32m 714\u001b[0m binop \u001b[38;5;241m=\u001b[39m ast\u001b[38;5;241m.\u001b[39mBinOp(op\u001b[38;5;241m=\u001b[39mop, left\u001b[38;5;241m=\u001b[39mnode\u001b[38;5;241m.\u001b[39mleft, right\u001b[38;5;241m=\u001b[39mcomps[\u001b[38;5;241m0\u001b[39m])\n\u001b[0;32m--> 715\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvisit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbinop\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;66;03m# recursive case: we have a chained comparison, a CMP b CMP c, etc.\u001b[39;00m\n\u001b[1;32m 718\u001b[0m left \u001b[38;5;241m=\u001b[39m node\u001b[38;5;241m.\u001b[39mleft\n",
- "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:411\u001b[0m, in \u001b[0;36mBaseExprVisitor.visit\u001b[0;34m(self, node, **kwargs)\u001b[0m\n\u001b[1;32m 409\u001b[0m method \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvisit_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(node)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 410\u001b[0m visitor \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, method)\n\u001b[0;32m--> 411\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mvisitor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
- "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:531\u001b[0m, in \u001b[0;36mBaseExprVisitor.visit_BinOp\u001b[0;34m(self, node, **kwargs)\u001b[0m\n\u001b[1;32m 530\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mvisit_BinOp\u001b[39m(\u001b[38;5;28mself\u001b[39m, node, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 531\u001b[0m op, op_class, left, right \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_maybe_transform_eq_ne\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 532\u001b[0m left, right \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_maybe_downcast_constants(left, right)\n\u001b[1;32m 533\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_maybe_evaluate_binop(op, op_class, left, right)\n",
- "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:451\u001b[0m, in \u001b[0;36mBaseExprVisitor._maybe_transform_eq_ne\u001b[0;34m(self, node, left, right)\u001b[0m\n\u001b[1;32m 449\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_maybe_transform_eq_ne\u001b[39m(\u001b[38;5;28mself\u001b[39m, node, left\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, right\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 450\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m left \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 451\u001b[0m left \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvisit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mleft\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mside\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mleft\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 452\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m right \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 453\u001b[0m right \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvisit(node\u001b[38;5;241m.\u001b[39mright, side\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mright\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
- "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:411\u001b[0m, in \u001b[0;36mBaseExprVisitor.visit\u001b[0;34m(self, node, **kwargs)\u001b[0m\n\u001b[1;32m 409\u001b[0m method \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvisit_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(node)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 410\u001b[0m visitor \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, method)\n\u001b[0;32m--> 411\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mvisitor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
- "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/expr.py:541\u001b[0m, in \u001b[0;36mBaseExprVisitor.visit_Name\u001b[0;34m(self, node, **kwargs)\u001b[0m\n\u001b[1;32m 540\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mvisit_Name\u001b[39m(\u001b[38;5;28mself\u001b[39m, node, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Term:\n\u001b[0;32m--> 541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mterm_type\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43menv\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
- "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/ops.py:91\u001b[0m, in \u001b[0;36mTerm.__init__\u001b[0;34m(self, name, env, side, encoding)\u001b[0m\n\u001b[1;32m 89\u001b[0m tname \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(name)\n\u001b[1;32m 90\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mis_local \u001b[38;5;241m=\u001b[39m tname\u001b[38;5;241m.\u001b[39mstartswith(LOCAL_TAG) \u001b[38;5;129;01mor\u001b[39;00m tname \u001b[38;5;129;01min\u001b[39;00m DEFAULT_GLOBALS\n\u001b[0;32m---> 91\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_resolve_name\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 92\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;241m=\u001b[39m encoding\n",
- "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/ops.py:115\u001b[0m, in \u001b[0;36mTerm._resolve_name\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 110\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m local_name \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menv\u001b[38;5;241m.\u001b[39mscope \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\n\u001b[1;32m 111\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menv\u001b[38;5;241m.\u001b[39mscope[local_name], \u001b[38;5;28mtype\u001b[39m\n\u001b[1;32m 112\u001b[0m ):\n\u001b[1;32m 113\u001b[0m is_local \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m--> 115\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43menv\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresolve\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlocal_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mis_local\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_local\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mupdate(res)\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(res, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mndim\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m res\u001b[38;5;241m.\u001b[39mndim \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m2\u001b[39m:\n",
- "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/computation/scope.py:244\u001b[0m, in \u001b[0;36mScope.resolve\u001b[0;34m(self, key, is_local)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtemps[key]\n\u001b[1;32m 243\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[0;32m--> 244\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m UndefinedVariableError(key, is_local) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n",
- "\u001b[0;31mUndefinedVariableError\u001b[0m: name 'in_training' is not defined"
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method', 'disable_exllama']. These kwargs are not used in .\n",
+ "`low_cpu_mem_usage` was None, now default to True since model is quantized.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "unsloth/Llama-3.2-1B-bnb-4bit\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "1d28b339f7c3495c97adef2c4f1938ad",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/20 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2.731894616981088 2.464229501783848 | Anthropic and Palantir Partner to Bring Claude AI Models to AWS for U.S. Government Intelligence and Defense Operations\n",
+ "3.018628165803172 2.8031317269160456 | TradingAgents: Multi-Agents LLM Financial Trading Framework\n"
]
}
],
+ "source": [
+ "from tqdm.auto import tqdm\n",
+ "\n",
+ "summaries = {}\n",
+ "data = []\n",
+ "for model_name in models:\n",
+ " model, tokenizer = load_model(model_name)\n",
+ " print(model_name)\n",
+ " for i in tqdm(range(len(df))):\n",
+ " sample = df.iloc[i]\n",
+ " if sample['title'] not in summaries:\n",
+ " summaries[sample['title']] = summarize_gpt4b(sample['content'])[0]\n",
+ " summary = summaries[sample['title']]\n",
+ "\n",
+ " # before \n",
+ " s1 = sample['content']\n",
+ " before = perplexity_compute(data=s1, model=model, tokenizer=tokenizer, device='cuda')['nlls'][0]\n",
+ "\n",
+ " n_tokens = len(tokenizer(s1)['input_ids'])\n",
+ "\n",
+ " # after \n",
+ " s2 = f\"\"\"\n",
+ " High level summary: {summary}\n",
+ "\n",
+ "Text:\n",
+ "{sample['content']}\n",
+ " \"\"\"\n",
+ "\n",
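+ "        # TODO add attn mask: s2 ends with a newline and indentation after the content, so the last n_tokens NLLs only approximate the content span\n",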
+ " after = perplexity_compute(data=s2, model=model, tokenizer=tokenizer, device='cuda')['nlls'][0][-n_tokens:]\n",
+ "\n",
+ " print(f\"{before.mean()} {after.mean()} | {sample['title']}\")\n",
+ " data.append(dict(\n",
+ " before_mean=before.mean(), \n",
+ " before_sum=before.sum().item(),\n",
+ " before_std=before.std(),\n",
+ " before_min=before.min(),\n",
+ " before_max=before.max(),\n",
+ " before_ppx=np.exp(before.mean().item()),\n",
+ " after_mean=after.mean(),\n",
+ " after_sum=after.sum().item(),\n",
+ " after_std=after.std(),\n",
+ " after_min=after.min(),\n",
+ " after_max=after.max(), \n",
+ " after_ppx=np.exp(after.mean().item()),\n",
+ " model=model_name, \n",
+ " summary=summary,\n",
+ " # sample=sample['title'],\n",
+ " # in_training=sample['in_training'], \n",
+ " len=len(sample['content']),\n",
+ " n_tokens=n_tokens,\n",
+ " **sample\n",
+ " ),\n",
+ "\n",
+ " )\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
"source": [
"# results\n",
"df2 = pd.DataFrame(data).set_index('title')\n",
"df2 = df2.query('in_training == False')\n",
- "df2[\"summarizable%\"] = (df2[\"before\"] - df2[\"after\"])/df2[\"before\"]\n",
- "\n",
"df2"
]
},
@@ -1046,16 +524,8 @@
"metadata": {},
"outputs": [],
"source": [
- "df[\"improvement%\"] = (df[\"before\"] - df[\"after\"]) / df[\"before\"] \n",
- "df[\"improvement\"] = (df[\"before\"] - df[\"after\"]) \n",
- "df[\"summarizable\"] = df[\"improvement\"] > .0\n",
- "df['suprising'] = df[\"before\"] > 15\n",
- "df['BS'] = ~df[\"summarizable\"] | ~df['suprising']\n",
- "for n,d in df.groupby(\"model\"):\n",
- " print(n)\n",
- " d = d[[ 'before', 'after', \"improvement\", \"improvement%\", 'suprising', 'summarizable', ]].sort_values(\"improvement\", ascending=False)\n",
- " print(d.to_markdown())\n",
- " display(d)"
+ "# QC: distribution of summary lengths (in characters)\n",
+ "df2.summary.str.len().describe()"
]
},
{
@@ -1064,7 +534,33 @@
"metadata": {},
"outputs": [],
"source": [
- "# TODO turn into a single metric, correlate with novelty label"
+ "for stat in ['mean', 'std', 'min', 'max']:\n",
+ "    df2[f\"improvement%_{stat}\"] = (df2[f\"before_{stat}\"] - df2[f\"after_{stat}\"]) / df2[f\"before_{stat}\"]\n",
+ "    df2[f\"improvement_{stat}\"] = (df2[f\"before_{stat}\"] - df2[f\"after_{stat}\"])\n",
+ "    df2[f\"summarizable_{stat}\"] = df2[f\"improvement_{stat}\"] > 1\n",
+ "    df2[f\"summarizable2_{stat}\"] = df2[f\"improvement%_{stat}\"] > 0.05\n",
+ "    df2[f'suprising_{stat}'] = df2[f\"before_{stat}\"] > 15\n",
+ "    df2[f'BS_{stat}'] = ~df2[f\"summarizable_{stat}\"] | ~df2[f'suprising_{stat}']\n",
+ "\n",
+ "# The columns above are row-wise, so they are built once; correlate within each model's group `d`, not the whole frame.\n",
+ "for n, d in df2.groupby(\"model\"):\n",
+ "    print(n)\n",
+ "    # d = d[[ 'before', 'after', \"improvement\", \"improvement%\", 'suprising', 'summarizable', 'summarizable2', 'novelty' ]].sort_values(\"improvement%\", ascending=True)\n",
+ "    # print(d.to_markdown())\n",
+ "    # display(d)\n",
+ "\n",
+ "    # TODO turn into a single metric, correlate with novelty label\n",
+ "    r = d.select_dtypes(include=np.number).corr()['novelty'].abs().sort_values()\n",
+ "    display(r)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2.select_dtypes(include=np.number).corr()['novelty'].sort_values()"
]
},
{
diff --git a/poetry.lock b/poetry.lock
index 49462d7..506be83 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1317,6 +1317,27 @@ qtconsole = ["qtconsole"]
test = ["packaging", "pickleshare", "pytest", "pytest-asyncio (<0.22)", "testpath"]
test-extra = ["curio", "ipython[test]", "matplotlib (!=3.2.0)", "nbformat", "numpy (>=1.23)", "pandas", "trio"]
+[[package]]
+name = "ipywidgets"
+version = "8.1.5"
+description = "Jupyter interactive widgets"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "ipywidgets-8.1.5-py3-none-any.whl", hash = "sha256:3290f526f87ae6e77655555baba4f36681c555b8bdbbff430b70e52c34c86245"},
+ {file = "ipywidgets-8.1.5.tar.gz", hash = "sha256:870e43b1a35656a80c18c9503bbf2d16802db1cb487eec6fab27d683381dde17"},
+]
+
+[package.dependencies]
+comm = ">=0.1.3"
+ipython = ">=6.1.0"
+jupyterlab-widgets = ">=3.0.12,<3.1.0"
+traitlets = ">=4.3.1"
+widgetsnbextension = ">=4.0.12,<4.1.0"
+
+[package.extras]
+test = ["ipykernel", "jsonschema", "pytest (>=3.6.0)", "pytest-cov", "pytz"]
+
[[package]]
name = "jedi"
version = "0.19.2"
@@ -1491,6 +1512,17 @@ traitlets = ">=5.3"
docs = ["myst-parser", "pydata-sphinx-theme", "sphinx-autodoc-typehints", "sphinxcontrib-github-alt", "sphinxcontrib-spelling", "traitlets"]
test = ["ipykernel", "pre-commit", "pytest (<8)", "pytest-cov", "pytest-timeout"]
+[[package]]
+name = "jupyterlab-widgets"
+version = "3.0.13"
+description = "Jupyter interactive widgets for JupyterLab"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "jupyterlab_widgets-3.0.13-py3-none-any.whl", hash = "sha256:e3cda2c233ce144192f1e29914ad522b2f4c40e77214b0cc97377ca3d323db54"},
+ {file = "jupyterlab_widgets-3.0.13.tar.gz", hash = "sha256:a2966d385328c1942b683a8cd96b89b8dd82c8b8f81dda902bb2bc06d46f5bed"},
+]
+
[[package]]
name = "kiwisolver"
version = "1.4.8"
@@ -4101,6 +4133,17 @@ files = [
{file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"},
]
+[[package]]
+name = "widgetsnbextension"
+version = "4.0.13"
+description = "Jupyter interactive widgets for Jupyter Notebook"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "widgetsnbextension-4.0.13-py3-none-any.whl", hash = "sha256:74b2692e8500525cc38c2b877236ba51d34541e6385eeed5aec15a70f88a6c71"},
+ {file = "widgetsnbextension-4.0.13.tar.gz", hash = "sha256:ffcb67bc9febd10234a362795f643927f4e0c05d9342c727b65d2384f8feacb6"},
+]
+
[[package]]
name = "win32-setctime"
version = "1.2.0"
@@ -4346,4 +4389,4 @@ propcache = ">=0.2.0"
[metadata]
lock-version = "2.0"
python-versions = ">=3.10,<3.13"
-content-hash = "2a7966819d0850ce36c5e826aa1ca53d08d8c223fde4126b48b49b39fa055cc4"
+content-hash = "f92d5fdffcee2de350804b17ff73bfa204a83470db68187c1dc78ee49e8b7eb7"
diff --git a/pyproject.toml b/pyproject.toml
index 092c4b0..c9e6689 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,6 +30,8 @@ matplotlib = "^3.8.0"
python-frontmatter = "^1.0.1"
loguru = "^0.7.2"
anycache = "^2.2.0"
+ipywidgets = "^8.1.5"
+ipykernel = "^6.29.5"
[[tool.poetry.source]]
name = "pytorch"