mirror of
https://github.com/wassname/detect_bs_text.git
synced 2026-07-02 11:35:59 +08:00
606 lines
42 KiB
Plaintext
606 lines
42 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# getting lesswrong data with novelty proxy\n",
|
|
"\n",
|
|
"maybe we can use score or baseScore as a proxy for quality"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import json\n",
|
|
"from pathlib import Path\n",
|
|
"\n",
|
|
"last_date = '2024-01-01'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## with vanilla requests\n",
|
|
"\n",
|
|
"\n",
|
|
"pip install markdownify"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import requests\n",
|
|
"from loguru import logger\n",
|
|
"import time\n",
|
|
"from dataclasses import dataclass\n",
|
|
"from markdownify import markdownify\n",
|
|
"\n",
|
|
"\n",
|
|
"\n",
|
|
"@dataclass\n",
|
|
"class GreaterWrong:\n",
|
|
"\n",
|
|
" \"\"\"\n",
|
|
" This class allows you to scrape posts and comments from GreaterWrong.\n",
|
|
" GreaterWrong contains all the posts from LessWrong (which contains the Alignment Forum) and the EA Forum.\n",
|
|
" from https://github.com/StampyAI/alignment-research-dataset/blob/main/align_data/sources/greaterwrong/greaterwrong.py#L156\n",
|
|
" \"\"\"\n",
|
|
"\n",
|
|
" base_url: str = 'https://www.lesswrong.com'\n",
|
|
" start_year: int = 2000\n",
|
|
" min_karma: int = -10000\n",
|
|
" \"\"\"Posts must have at least this much karma to be returned.\"\"\"\n",
|
|
" af: bool = False\n",
|
|
" \"\"\"Whether alignment forum posts should be returned\"\"\"\n",
|
|
"\n",
|
|
" limit = 50\n",
|
|
" COOLDOWN = 0.5\n",
|
|
" done_key = \"url\"\n",
|
|
" lazy_eval = True\n",
|
|
" source_type = 'GreaterWrong'\n",
|
|
" _outputted_items = (set(), set())\n",
|
|
" \n",
|
|
"\n",
|
|
" def make_query(self, after: str):\n",
|
|
" return f'''\n",
|
|
" {{\n",
|
|
" posts(input: {{\n",
|
|
" terms: {{\n",
|
|
" excludeEvents: true\n",
|
|
" view: \"old\"\n",
|
|
" af: {str(self.af).lower()}\n",
|
|
" limit: {self.limit}\n",
|
|
" karmaThreshold: {self.min_karma}\n",
|
|
" after: \"{after}\"\n",
|
|
" filter: \"tagged\"\n",
|
|
" }}\n",
|
|
" }}) {{\n",
|
|
" totalCount\n",
|
|
" results {{\n",
|
|
" _id\n",
|
|
" title\n",
|
|
" slug\n",
|
|
" pageUrl\n",
|
|
" postedAt\n",
|
|
" modifiedAt\n",
|
|
" emojiReactors\n",
|
|
" score\n",
|
|
" extendedScore\n",
|
|
" baseScore\n",
|
|
" voteCount\n",
|
|
" commentCount\n",
|
|
" wordCount\n",
|
|
" tags {{\n",
|
|
" name\n",
|
|
" }}\n",
|
|
" user {{\n",
|
|
" displayName\n",
|
|
" }}\n",
|
|
" coauthors {{\n",
|
|
" displayName\n",
|
|
" }}\n",
|
|
" af\n",
|
|
" htmlBody\n",
|
|
" allVotes {{\n",
|
|
" authorId\n",
|
|
" _id\n",
|
|
" power\n",
|
|
" afPower\n",
|
|
" isUnvote\n",
|
|
" votedAt\n",
|
|
" }}\n",
|
|
" }}\n",
|
|
" }}\n",
|
|
" }}\n",
|
|
" '''\n",
|
|
"\n",
|
|
" def fetch_posts(self, query: str):\n",
|
|
" res = requests.post(\n",
|
|
" f\"{self.base_url}/graphql\",\n",
|
|
" # The GraphQL endpoint returns a 403 if the user agent isn't set... Makes sense, but is annoying\n",
|
|
" headers={\n",
|
|
" \"User-Agent\": \"Mozilla /5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/113.0\"\n",
|
|
" },\n",
|
|
" json={\"query\": query},\n",
|
|
" )\n",
|
|
" try:\n",
|
|
" res.raise_for_status()\n",
|
|
" except requests.exceptions.HTTPError:\n",
|
|
" logger.error(f\"Failed to fetch posts: {res.text}\")\n",
|
|
" raise\n",
|
|
"\n",
|
|
" try:\n",
|
|
" return res.json()[\"data\"][\"posts\"]\n",
|
|
" except KeyError:\n",
|
|
" raise ValueError(f\"Could not parse response: {res.text}\")\n",
|
|
"\n",
|
|
"\n",
|
|
" @property\n",
|
|
" def items_list(self):\n",
|
|
" next_date = self.last_date_published\n",
|
|
" logger.info(f\"Starting from {next_date}\")\n",
|
|
" last_item = None\n",
|
|
" while next_date:\n",
|
|
" logger.info(f\"Fetching posts after {next_date}\")\n",
|
|
" posts = self.fetch_posts(self.make_query(next_date))\n",
|
|
" if not posts[\"results\"]:\n",
|
|
" return\n",
|
|
"\n",
|
|
" # If the only item we find was the one we advanced our iterator to, we're done\n",
|
|
" if len(posts[\"results\"]) == 1 and last_item and posts[\"results\"][0][\"pageUrl\"] == last_item[\"pageUrl\"]:\n",
|
|
" return\n",
|
|
"\n",
|
|
" for post in posts[\"results\"]:\n",
|
|
" if post[\"htmlBody\"]:\n",
|
|
" yield post\n",
|
|
"\n",
|
|
" last_item = posts[\"results\"][-1]\n",
|
|
" new_next_date = posts[\"results\"][-1][\"postedAt\"]\n",
|
|
" if next_date == new_next_date:\n",
|
|
" raise ValueError(f'could not advance through dataset, next date did not advance after {next_date}')\n",
|
|
"\n",
|
|
" next_date = new_next_date\n",
|
|
" time.sleep(self.COOLDOWN)\n",
|
|
"\n",
|
|
" def process_entry(self, item):\n",
|
|
" return self.make_data_entry(\n",
|
|
" {\n",
|
|
" \"title\": item[\"title\"],\n",
|
|
" \"text\": markdownify(item[\"htmlBody\"]).strip(),\n",
|
|
" \"url\": item[\"pageUrl\"],\n",
|
|
" \"date_published\": self._get_published_date(item),\n",
|
|
" \"modified_at\": item[\"modifiedAt\"],\n",
|
|
" \"source\": self.name,\n",
|
|
" \"source_type\": self.source_type,\n",
|
|
" \"votes\": item[\"voteCount\"],\n",
|
|
" \"karma\": item[\"baseScore\"],\n",
|
|
" \"tags\": [t[\"name\"] for t in item[\"tags\"]],\n",
|
|
" \"words\": item[\"wordCount\"],\n",
|
|
" \"comment_count\": item[\"commentCount\"],\n",
|
|
" \"authors\": self.extract_authors(item),\n",
|
|
" }\n",
|
|
" )"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"gw = GreaterWrong()\n",
|
|
"gw.last_date_published = '2023-01-01'\n",
|
|
"\n",
|
|
"import pandas as pd\n",
|
|
"from tqdm.auto import tqdm\n",
|
|
"\n",
|
|
"cache_file = Path('output/01greaterwrong.json')\n",
|
|
"cache_file.parent.mkdir(parents=True, exist_ok=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"https://www.lesswrong.com/graphiql"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
"model_id": "fe4e30aeb2614c759ca799709a89f5c3",
|
|
"version_major": 2,
|
|
"version_minor": 0
|
|
},
|
|
"text/plain": [
|
|
"0it [00:00, ?it/s]"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[32m2025-07-26 11:17:04.797\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36mitems_list\u001b[0m:\u001b[36m110\u001b[0m - \u001b[1mStarting from {next_date}\u001b[0m\n",
|
|
"\u001b[32m2025-07-26 11:17:04.798\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36mitems_list\u001b[0m:\u001b[36m113\u001b[0m - \u001b[1mFetching posts after 2023-01-01\u001b[0m\n",
|
|
"\u001b[32m2025-07-26 11:17:05.927\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36mfetch_posts\u001b[0m:\u001b[36m98\u001b[0m - \u001b[31m\u001b[1mFailed to fetch posts: {\"errors\":[{\"message\":\"Expected value of type \\\"JSON\\\", found {excludeEvents: true, view: \\\"old\\\", af: False, limit: 50, karmaThreshold: -10000, after: \\\"2023-01-01\\\", filter: \\\"tagged\\\"}; JSON cannot represent value: False\",\"locations\":[{\"line\":4,\"column\":24}],\"extensions\":{\"code\":\"GRAPHQL_VALIDATION_FAILED\"}},{\"message\":\"Cannot query field \\\"allVotes\\\" on type \\\"Post\\\".\",\"locations\":[{\"line\":40,\"column\":21}],\"extensions\":{\"code\":\"GRAPHQL_VALIDATION_FAILED\"}}]}\n",
|
|
"\u001b[0m\n"
|
|
]
|
|
},
|
|
{
|
|
"ename": "HTTPError",
|
|
"evalue": "400 Client Error: Bad Request for url: https://www.lesswrong.com/graphql",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
|
"\u001b[31mHTTPError\u001b[39m Traceback (most recent call last)",
|
|
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 8\u001b[39m\n\u001b[32m 5\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 7\u001b[39m posts = []\n\u001b[32m----> \u001b[39m\u001b[32m8\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpost\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtqdm\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgw\u001b[49m\u001b[43m.\u001b[49m\u001b[43mitems_list\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 9\u001b[39m \u001b[43m \u001b[49m\u001b[43mposts\u001b[49m\u001b[43m.\u001b[49m\u001b[43mappend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpost\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 11\u001b[39m cache_file.write_text(json.dumps(posts, indent=\u001b[32m2\u001b[39m))\n",
|
|
"\u001b[36mFile \u001b[39m\u001b[32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/tqdm/notebook.py:250\u001b[39m, in \u001b[36mtqdm_notebook.__iter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 248\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 249\u001b[39m it = \u001b[38;5;28msuper\u001b[39m().\u001b[34m__iter__\u001b[39m()\n\u001b[32m--> \u001b[39m\u001b[32m250\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mit\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 251\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# return super(tqdm...) will not catch exception\u001b[39;49;00m\n\u001b[32m 252\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01myield\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\n\u001b[32m 253\u001b[39m \u001b[38;5;66;03m# NB: except ... [ as ...] breaks IPython async KeyboardInterrupt\u001b[39;00m\n",
|
|
"\u001b[36mFile \u001b[39m\u001b[32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/tqdm/std.py:1181\u001b[39m, in \u001b[36mtqdm.__iter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 1178\u001b[39m time = \u001b[38;5;28mself\u001b[39m._time\n\u001b[32m 1180\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1181\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43miterable\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 1182\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01myield\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\n\u001b[32m 1183\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Update and possibly print the progressbar.\u001b[39;49;00m\n\u001b[32m 1184\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Note: does not call self.update(1) for speed optimisation.\u001b[39;49;00m\n",
|
|
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 114\u001b[39m, in \u001b[36mGreaterWrong.items_list\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 112\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m next_date:\n\u001b[32m 113\u001b[39m logger.info(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mFetching posts after \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnext_date\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m114\u001b[39m posts = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mfetch_posts\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mmake_query\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnext_date\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 115\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m posts[\u001b[33m\"\u001b[39m\u001b[33mresults\u001b[39m\u001b[33m\"\u001b[39m]:\n\u001b[32m 116\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m\n",
|
|
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 96\u001b[39m, in \u001b[36mGreaterWrong.fetch_posts\u001b[39m\u001b[34m(self, query)\u001b[39m\n\u001b[32m 87\u001b[39m res = requests.post(\n\u001b[32m 88\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m.base_url\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m/graphql\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 89\u001b[39m \u001b[38;5;66;03m# The GraphQL endpoint returns a 403 if the user agent isn't set... Makes sense, but is annoying\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 93\u001b[39m json={\u001b[33m\"\u001b[39m\u001b[33mquery\u001b[39m\u001b[33m\"\u001b[39m: query},\n\u001b[32m 94\u001b[39m )\n\u001b[32m 95\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m96\u001b[39m \u001b[43mres\u001b[49m\u001b[43m.\u001b[49m\u001b[43mraise_for_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 97\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m requests.exceptions.HTTPError:\n\u001b[32m 98\u001b[39m logger.error(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mFailed to fetch posts: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mres.text\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n",
|
|
"\u001b[36mFile \u001b[39m\u001b[32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/requests/models.py:1026\u001b[39m, in \u001b[36mResponse.raise_for_status\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 1021\u001b[39m http_error_msg = (\n\u001b[32m 1022\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m.status_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m Server Error: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mreason\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m for url: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m.url\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 1023\u001b[39m )\n\u001b[32m 1025\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m http_error_msg:\n\u001b[32m-> \u001b[39m\u001b[32m1026\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m HTTPError(http_error_msg, response=\u001b[38;5;28mself\u001b[39m)\n",
|
|
"\u001b[31mHTTPError\u001b[39m: 400 Client Error: Bad Request for url: https://www.lesswrong.com/graphql"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"if cache_file.exists():\n",
|
|
" with cache_file.open() as f:\n",
|
|
" posts = json.load(f)\n",
|
|
" print(f'Loaded {len(posts)} posts from cache')\n",
|
|
"else:\n",
|
|
" \n",
|
|
" posts = []\n",
|
|
" for post in tqdm(gw.items_list):\n",
|
|
" posts.append(post)\n",
|
|
"\n",
|
|
" cache_file.write_text(json.dumps(posts, indent=2))\n",
|
|
"len(posts)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|
"RangeIndex: 9346 entries, 0 to 9345\n",
|
|
"Data columns (total 18 columns):\n",
|
|
" # Column Non-Null Count Dtype \n",
|
|
"--- ------ -------------- ----- \n",
|
|
" 0 _id 9346 non-null object \n",
|
|
" 1 title 9346 non-null object \n",
|
|
" 2 slug 9346 non-null object \n",
|
|
" 3 pageUrl 9346 non-null object \n",
|
|
" 4 postedAt 9346 non-null datetime64[ns, UTC]\n",
|
|
" 5 modifiedAt 9346 non-null datetime64[ns, UTC]\n",
|
|
" 6 score 9346 non-null float64 \n",
|
|
" 7 extendedScore 7034 non-null object \n",
|
|
" 8 baseScore 9346 non-null int64 \n",
|
|
" 9 voteCount 9346 non-null int64 \n",
|
|
" 10 commentCount 9346 non-null int64 \n",
|
|
" 11 wordCount 9346 non-null int64 \n",
|
|
" 12 tags 9346 non-null object \n",
|
|
" 13 user 9270 non-null object \n",
|
|
" 14 coauthors 9346 non-null object \n",
|
|
" 15 af 9346 non-null bool \n",
|
|
" 16 htmlBody 9346 non-null object \n",
|
|
" 17 allVotes 9346 non-null object \n",
|
|
"dtypes: bool(1), datetime64[ns, UTC](2), float64(1), int64(4), object(10)\n",
|
|
"memory usage: 1.2+ MB\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"df = pd.DataFrame(posts)\n",
|
|
"df.drop(columns=['emojiReactors'], inplace=True)\n",
|
|
"for col in ['postedAt', 'modifiedAt']:\n",
|
|
" df[col] = pd.to_datetime(df[col])\n",
|
|
"p_file = Path('output/01greaterwrong.parquet')\n",
|
|
"df.to_parquet(p_file)\n",
|
|
"df.info()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df = df[['title', 'pageUrl', 'modifiedAt', 'htmlBody', 'score', 'baseScore', 'voteCount', 'wordCount', 'slug']]\n",
|
|
"df = df[\n",
|
|
" (df['modifiedAt'] > last_date)\n",
|
|
" & (df['voteCount'] > 10)\n",
|
|
" ].sort_values('score', ascending=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>score</th>\n",
|
|
" <th>baseScore</th>\n",
|
|
" <th>voteCount</th>\n",
|
|
" <th>wordCount</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>2385.000000</td>\n",
|
|
" <td>2385.000000</td>\n",
|
|
" <td>2385.000000</td>\n",
|
|
" <td>2385.000000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>0.018153</td>\n",
|
|
" <td>71.339203</td>\n",
|
|
" <td>36.330398</td>\n",
|
|
" <td>2963.753040</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>0.104511</td>\n",
|
|
" <td>68.800261</td>\n",
|
|
" <td>38.117311</td>\n",
|
|
" <td>3937.558236</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>-0.017480</td>\n",
|
|
" <td>-50.000000</td>\n",
|
|
" <td>11.000000</td>\n",
|
|
" <td>0.000000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>0.001787</td>\n",
|
|
" <td>32.000000</td>\n",
|
|
" <td>16.000000</td>\n",
|
|
" <td>730.000000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>0.003472</td>\n",
|
|
" <td>50.000000</td>\n",
|
|
" <td>24.000000</td>\n",
|
|
" <td>1660.000000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>0.007957</td>\n",
|
|
" <td>86.000000</td>\n",
|
|
" <td>40.000000</td>\n",
|
|
" <td>3445.000000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>3.236718</td>\n",
|
|
" <td>677.000000</td>\n",
|
|
" <td>499.000000</td>\n",
|
|
" <td>57468.000000</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" score baseScore voteCount wordCount\n",
|
|
"count 2385.000000 2385.000000 2385.000000 2385.000000\n",
|
|
"mean 0.018153 71.339203 36.330398 2963.753040\n",
|
|
"std 0.104511 68.800261 38.117311 3937.558236\n",
|
|
"min -0.017480 -50.000000 11.000000 0.000000\n",
|
|
"25% 0.001787 32.000000 16.000000 730.000000\n",
|
|
"50% 0.003472 50.000000 24.000000 1660.000000\n",
|
|
"75% 0.007957 86.000000 40.000000 3445.000000\n",
|
|
"max 3.236718 677.000000 499.000000 57468.000000"
|
|
]
|
|
},
|
|
"execution_count": 111,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"df.describe()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"novelty is baseScore normalised to [0, 1]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/core/arraylike.py:396: RuntimeWarning: invalid value encountered in log\n",
|
|
" result = getattr(ufunc, method)(*inputs, **kwargs)\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<Axes: >"
|
|
]
|
|
},
|
|
"execution_count": 141,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAigAAAGdCAYAAAA44ojeAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAraklEQVR4nO3de3BUZZ7/8U8SOg1BOjEwSSdLQMQLhIuwINCj4yCEBEihjqkaGViIFgsrE6wasosYBQmgwFKWOmNFWGcZcGvIMIMlzgqRJMACiwRRhhTXZQdE0YGEVZYEyNJ0kvP7Y35pbBMunfTl6eT9quoK5/TTz/meryedj6dPd0dZlmUJAADAINHhLgAAAOD7CCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAON0CncBrdHY2KizZ8+qW7duioqKCnc5AADgNliWpUuXLik1NVXR0Tc/RxKRAeXs2bNKS0sLdxkAAKAVvvrqK/Xs2fOmYyIyoHTr1k3SX3fQ4XAEdG6Px6OysjJlZmbKZrMFdG5cR59Dgz6HBn0ODfocOsHqdW1trdLS0rx/x28mIgNK08s6DocjKAElLi5ODoeDX4Agos+hQZ9Dgz6HBn0OnWD3+nYuz+AiWQAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjdAp3AQDQ3t31wpY2z/HFiuwAVAJEDs6gAAAA4xBQAACAcfwKKKtWrdLgwYPlcDjkcDjkcrn00Ucfee8fPXq0oqKifG7PPvuszxxnzpxRdna24uLilJSUpHnz5qm+vj4wewMAANoFv65B6dmzp1asWKF7771XlmXp3Xff1eOPP66DBw9qwIABkqSZM2dqyZIl3sfExcV5/93Q0KDs7Gw5nU7t3btX586d0/Tp02Wz2bRs2bIA7RIAAIh0fgWUSZMm+Sy/+uqrWrVqlfbt2+cNKHFxcXI6nS0+vqysTMeOHdO2bduUnJysIUOGaOnSpZo/f74KCwsVGxvbyt0AAADtSavfxdPQ0KCNGzfqypUrcrlc3vXr16/Xb3/7WzmdTk2aNEkLFy70nkWpqKjQoEGDlJyc7B2flZWl2bNn6+jRoxo6dGiL23K73XK73d7l2tpaSZLH45HH42ntLrSoab5Azwtf9Dk06HNo3KrP9hgrYNvoyDieQydYvfZnvijLsvz6zTl8+LBcLpeuXr2qO+64Q8XFxZo4caIk6Z133lHv3r2VmpqqQ4cOaf78+RoxYoTef/99SdKsWbP05ZdfqrS01DtfXV2dunbtqpKSEk2YMKHFbRYWFmrx4sXN1hcXF/u8hAQAAMxVV1enKVOmqKamRg6H46Zj/T6Dcv/996uyslI1NTV67733lJubq127dik9PV2zZs3yjhs0aJBSUlI0duxYnTp1Sn379vV/T/6/goIC5efne5dra2uVlpamzMzMW+6gvzwej8rLyzVu3DjZbLaAzo3r6HNo0OfQuFWfBxaWtvAo/xwpzGrzHJGO4zl0gtXrpldAboffASU2Nlb33HOPJGnYsGH69NNP9ctf/lL/8i//0mzsyJEjJUknT55U37595XQ6tX//fp8x1dXVknTD61YkyW63y263N1tvs9mCdpAGc25cR59Dgz6Hxo367G6ICsjc+CuO59AJdK/9mavNn4PS2Njoc33Id1VWVkqSUlJSJEkul0uHDx/W+fPnvWPKy8vlcDiUnp7e1lIAAEA74dcZlIKCAk2YMEG9evXSpUuXVFxcrJ07d6q0tFSnTp3yXo/SvXt3HTp0SHPnztUjjzyiwYMHS5IyMzOVnp6uadOmaeXKlaqqqtKCBQuUl5fX4hkSAADQMfkVUM6fP6/p06fr3Llzio+P1+DBg1VaWqpx48bpq6++0rZt2/Tmm2/qypUrSktLU05Ojh
YsWOB9fExMjDZv3qzZs2fL5XKpa9euys3N9fncFAAAAL8Cypo1a254X1pamnbt2nXLOXr37q2SkhJ/NgsAADoYvosHAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDh+BZRVq1Zp8ODBcjgccjgccrlc+uijj7z3X716VXl5eerevbvuuOMO5eTkqLq62meOM2fOKDs7W3FxcUpKStK8efNUX18fmL0BAADtQid/Bvfs2VMrVqzQvffeK8uy9O677+rxxx/XwYMHNWDAAM2dO1dbtmzRxo0bFR8frzlz5ujJJ5/Uxx9/LElqaGhQdna2nE6n9u7dq3Pnzmn69Omy2WxatmxZUHYQANqDu17YEpB5vliRHZB5gGDzK6BMmjTJZ/nVV1/VqlWrtG/fPvXs2VNr1qxRcXGxxowZI0lau3at+vfvr3379mnUqFEqKyvTsWPHtG3bNiUnJ2vIkCFaunSp5s+fr8LCQsXGxgZuzwAAQMTyK6B8V0NDgzZu3KgrV67I5XLpwIED8ng8ysjI8I7p16+fevXqpYqKCo0aNUoVFRUaNGiQkpOTvWOysrI0e/ZsHT16VEOHDm1xW263W26327tcW1srSfJ4PPJ4PK3dhRY1zRfoeeGLPocGfQ6NW/XZHmOFspybiuRjgeM5dILVa3/m8zugHD58WC6XS1evXtUdd9yhTZs2KT09XZWVlYqNjVVCQoLP+OTkZFVVVUmSqqqqfMJJ0/1N993I8uXLtXjx4mbry8rKFBcX5+8u3Jby8vKgzAtf9Dk06HNo3KjPK0eEuJCbKCkpCXcJbcbxHDqB7nVdXd1tj/U7oNx///2qrKxUTU2N3nvvPeXm5mrXrl3+TuOXgoIC5efne5dra2uVlpamzMxMORyOgG7L4/GovLxc48aNk81mC+jcuI4+hwZ9Do1b9XlgYWkYqmrZkcKscJfQahzPoROsXje9AnI7/A4osbGxuueeeyRJw4YN06effqpf/vKXeuqpp3Tt2jVdvHjR5yxKdXW1nE6nJMnpdGr//v0+8zW9y6dpTEvsdrvsdnuz9TabLWgHaTDnxnX0OTToc2jcqM/uhqgwVNOy9nAccDyHTqB77c9cbf4clMbGRrndbg0bNkw2m03bt2/33nfixAmdOXNGLpdLkuRyuXT48GGdP3/eO6a8vFwOh0Pp6eltLQUAALQTfp1BKSgo0IQJE9SrVy9dunRJxcXF2rlzp0pLSxUfH68ZM2YoPz9fiYmJcjgceu655+RyuTRq1ChJUmZmptLT0zVt2jStXLlSVVVVWrBggfLy8lo8QwIAADomvwLK+fPnNX36dJ07d07x8fEaPHiwSktLNW7cOEnSG2+8oejoaOXk5MjtdisrK0tvv/229/ExMTHavHmzZs+eLZfLpa5duyo3N1dLliwJ7F4BAICI5ldAWbNmzU3v79y5s4qKilRUVHTDMb17924XV5EDAIDg4bt4AACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxukU7gIAwGR3vbDllmPsMZZWjpAGFpbK3RAVgqpa73b251a+WJEdgEqAm+MMCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAA
AA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4fgWU5cuX68EHH1S3bt2UlJSkJ554QidOnPAZM3r0aEVFRfncnn32WZ8xZ86cUXZ2tuLi4pSUlKR58+apvr6+7XsDAADahU7+DN61a5fy8vL04IMPqr6+Xi+++KIyMzN17Ngxde3a1Ttu5syZWrJkiXc5Li7O+++GhgZlZ2fL6XRq7969OnfunKZPny6bzaZly5YFYJcAAECk8yugbN261Wd53bp1SkpK0oEDB/TII49418fFxcnpdLY4R1lZmY4dO6Zt27YpOTlZQ4YM0dKlSzV//nwVFhYqNja2FbsBAADaE78CyvfV1NRIkhITE33Wr1+/Xr/97W/ldDo1adIkLVy40HsWpaKiQoMGDVJycrJ3fFZWlmbPnq2jR49q6NChzbbjdrvldru9y7W1tZIkj8cjj8fTll1opmm+QM8LX/Q5NOhz29ljrFuPibZ8frZ34TqeOJ5DJ1i99me+KMuyWvUb1djYqMcee0wXL17Unj17vOvfeecd9e7dW6mpqTp06JDmz5+vESNG6P3335ckzZo1S19++aVKS0u9j6mrq1PXrl1VUlKiCRMmNNtWYWGhFi9e3Gx9cXGxz8tHAADAXHV1dZoyZYpqamrkcDhuOrbVZ1Dy8vJ05MgRn3Ai/TWANBk0aJBSUlI0duxYnTp1Sn379m3VtgoKCpSfn+9drq2tVVpamjIzM2+5g/7yeDwqLy/XuHHjZLPZAjo3rqPPoUGf225gYektx9ijLS0d3qiFn0XL3RgVgqrC60hhVli2y/EcOsHqddMrILejVQFlzpw52rx5s3bv3q2ePXvedOzIkSMlSSdPnlTfvn3ldDq1f/9+nzHV1dWSdMPrVux2u+x2e7P1NpstaAdpMOfGdfQ5NOhz67kbbj9wuBuj/BofqcJ9LHE8h06ge+3PXH69zdiyLM2ZM0ebNm3Sjh071KdPn1s+prKyUpKUkpIiSXK5XDp8+LDOnz/vHVNeXi6Hw6H09HR/ygEAAO2UX2dQ8vLyVFxcrD/+8Y/q1q2bqqqqJEnx8fHq0qWLTp06peLiYk2cOFHdu3fXoUOHNHfuXD3yyCMaPHiwJCkzM1Pp6emaNm2aVq5cqaqqKi1YsEB5eXktniUBAAAdj19nUFatWqWamhqNHj1aKSkp3tvvf/97SVJsbKy2bdumzMxM9evXT//4j/+onJwcffjhh945YmJitHnzZsXExMjlcunv/u7vNH36dJ/PTQEAAB2bX2dQbvWGn7S0NO3ateuW8/Tu3VslJSX+bBoAAHQgfBcPAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcTqFuwAAQGS564UtbZ7jixXZAagE7RlnUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGId38QBolwLxThMA4cMZFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHL8CyvLly/Xggw+qW7duSkpK0hNPPKETJ074jLl69ary8vLUvXt33XHHHcrJyVF1dbXPmDNnzig7O1txcXFKSkrSvHnzVF9f3/a9AQAA7YJfAWXXrl3Ky8vTvn37VF5eLo/Ho8zMTF25csU7Zu7cufrwww+1ceNG7dq1S2fPntWTTz7pvb+hoUHZ2dm6du2a9u7dq3fffVfr1q3Tyy+/HLi9AgAAEc2v7+LZunWrz/K6deuUlJSkAwcO6JFHHlFNTY3WrFmj4uJijRkzRpK0du1a9e/fX/v27dOoUaNUVlamY8eOadu2bUpOTtaQIU
O0dOlSzZ8/X4WFhYqNjQ3c3gEAgIjUpi8LrKmpkSQlJiZKkg4cOCCPx6OMjAzvmH79+qlXr16qqKjQqFGjVFFRoUGDBik5Odk7JisrS7Nnz9bRo0c1dOjQZttxu91yu93e5draWkmSx+ORx+Npyy400zRfoOeFL/ocGh25z/YYK3TbirZ8fuLWWnNMduTjOdSC1Wt/5mt1QGlsbNQvfvELPfTQQxo4cKAkqaqqSrGxsUpISPAZm5ycrKqqKu+Y74aTpvub7mvJ8uXLtXjx4mbry8rKFBcX19pduKny8vKgzAtf9Dk0OmKfV44I/TaXDm8M/UYjVElJSasf2xGP53AJdK/r6upue2yrA0peXp6OHDmiPXv2tHaK21ZQUKD8/Hzvcm1trdLS0pSZmSmHwxHQbXk8HpWXl2vcuHGy2WwBnRvX0efQ6Mh9HlhYGrJt2aMtLR3eqIWfRcvdGBWy7UayI4VZfj+mIx/PoRasXje9AnI7WhVQ5syZo82bN2v37t3q2bOnd73T6dS1a9d08eJFn7Mo1dXVcjqd3jH79+/3ma/pXT5NY77PbrfLbrc3W2+z2YJ2kAZzblxHn0OjI/bZ3RD6oOBujArLdiNRW47Hjng8h0uge+3PXH69i8eyLM2ZM0ebNm3Sjh071KdPH5/7hw0bJpvNpu3bt3vXnThxQmfOnJHL5ZIkuVwuHT58WOfPn/eOKS8vl8PhUHp6uj/lAACAdsqvMyh5eXkqLi7WH//4R3Xr1s17zUh8fLy6dOmi+Ph4zZgxQ/n5+UpMTJTD4dBzzz0nl8ulUaNGSZIyMzOVnp6uadOmaeXKlaqqqtKCBQuUl5fX4lkSAADQ8fgVUFatWiVJGj16tM/6tWvX6umnn5YkvfHGG4qOjlZOTo7cbreysrL09ttve8fGxMRo8+bNmj17tlwul7p27arc3FwtWbKkbXsCAADaDb8CimXd+i10nTt3VlFRkYqKim44pnfv3m26ghsAALRvfBcPAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHH8Dii7d+/WpEmTlJqaqqioKH3wwQc+9z/99NOKioryuY0fP95nzIULFzR16lQ5HA4lJCRoxowZunz5cpt2BAAAtB9+B5QrV67ogQceUFFR0Q3HjB8/XufOnfPefve73/ncP3XqVB09elTl5eXavHmzdu/erVmzZvlfPQAAaJc6+fuACRMmaMKECTcdY7fb5XQ6W7zv+PHj2rp1qz799FMNHz5ckvTWW29p4sSJeu2115SamupvSQAAoJ3xO6Dcjp07dyopKUl33nmnxowZo1deeUXdu3eXJFVUVCghIcEbTiQpIyND0dHR+uSTT/STn/yk2Xxut1tut9u7XFtbK0nyeDzyeDwBrb1pvkDPC1/0OTQ6cp/tMVbothVt+fzErbXmmOzIx3OoBavX/swX8IAyfvx4Pfnkk+rTp49OnTqlF198URMmTFBFRYViYmJUVVWlpKQk3yI6dVJiYqKqqqpanHP58uVavHhxs/VlZWWKi4sL9C5IksrLy4MyL3zR59DoiH1eOSL021w6vDH0G41QJSUlrX5sRzyewyXQva6rq7vtsQEPKJMnT/b+e9CgQRo8eLD69u2rnTt3auzYsa2as6CgQPn5+d7l2tpapaWlKTMzUw6Ho801f5fH41F5ebnGjRsnm80W0LlxHX0OjY7c54GFpSHblj3a0tLhjVr4WbTcjVEh224kO1KY5fdjOv
LxHGrB6nXTKyC3Iygv8XzX3XffrR49eujkyZMaO3asnE6nzp8/7zOmvr5eFy5cuOF1K3a7XXa7vdl6m80WtIM0mHPjOvocGh2xz+6G0AcFd2NUWLYbie5dWOb3Y+wxllaOkIa+usPb5y9WZAe6NHxHoJ87/Jkr6J+D8vXXX+vbb79VSkqKJMnlcunixYs6cOCAd8yOHTvU2NiokSNHBrscAAAQAfw+g3L58mWdPHnSu3z69GlVVlYqMTFRiYmJWrx4sXJycuR0OnXq1Ck9//zzuueee5SV9dfTef3799f48eM1c+ZMrV69Wh6PR3PmzNHkyZN5Bw8AAJDUijMon332mYYOHaqhQ4dKkvLz8zV06FC9/PLLiomJ0aFDh/TYY4/pvvvu04wZMzRs2DD953/+p89LNOvXr1e/fv00duxYTZw4UQ8//LDeeeedwO0VAACIaH6fQRk9erQs68ZvpSstvfWFaYmJiSouLvZ30wAAoIPgu3gAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADBOp3AXAABAa931wpY2z/HFiuwAVIJA4wwKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4fA4KAOME4rMtAEQ2zqAAAADjEFAAAIBx/A4ou3fv1qRJk5SamqqoqCh98MEHPvdblqWXX35ZKSkp6tKlizIyMvTnP//ZZ8yFCxc0depUORwOJSQkaMaMGbp8+XKbdgQAALQffgeUK1eu6IEHHlBRUVGL969cuVK/+tWvtHr1an3yySfq2rWrsrKydPXqVe+YqVOn6ujRoyovL9fmzZu1e/duzZo1q/V7AQAA2hW/L5KdMGGCJkyY0OJ9lmXpzTff1IIFC/T4449Lkv7t3/5NycnJ+uCDDzR58mQdP35cW7du1aeffqrhw4dLkt566y1NnDhRr732mlJTU9uwOwAAoD0I6Lt4Tp8+raqqKmVkZHjXxcfHa+TIkaqoqNDkyZNVUVGhhIQEbziRpIyMDEVHR+uTTz7RT37yk2bzut1uud1u73Jtba0kyePxyOPxBHIXvPMFel74os+hEal9tsdY4S7BL/Zoy+cngiNYfY60349QCNZzhz/zBTSgVFVVSZKSk5N91icnJ3vvq6qqUlJSkm8RnTopMTHRO+b7li9frsWLFzdbX1ZWpri4uECU3kx5eXlQ5oUv+hwakdbnlSPCXUHrLB3eGO4SOoRA97mkpCSg87UngX7uqKuru+2xEfE5KAUFBcrPz/cu19bWKi0tTZmZmXI4HAHdlsfjUXl5ucaNGyebzRbQuXEdfQ6NSO3zwMLScJfgF3u0paXDG7Xws2i5G6PCXU67Faw+HynMCthc7UWwnjuaXgG5HQENKE6nU5JUXV2tlJQU7/rq6moNGTLEO+b8+fM+j6uvr9eFCxe8j/8+u90uu93ebL3NZgvak24w58Z19Dk0Iq3P7obI/CPvboyK2NojSaD7HEm/G6EW6OcOf+YK6Oeg9OnTR06nU9u3b/euq62t1SeffCKXyyVJcrlcunjxog4cOOAds2PHDjU2NmrkyJGBLAcAAEQov8+gXL58WSdPnvQunz59WpWVlUpMTFSvXr30i1/8Qq+88oruvfde9enTRwsXLlRqaqqeeOIJSVL//v01fvx4zZw5U6tXr5bH49GcOXM0efJk3sEDAAAktSKgfPbZZ3r00Ue9y03XhuTm5mrdunV6/vnndeXKFc2aNUsXL17Uww8/rK1bt6pz587ex6xfv15z5szR2LFjFR0drZycHP3qV78KwO4AAID2wO+AMnr0aFnWjd/iFR
UVpSVLlmjJkiU3HJOYmKji4mJ/Nw0AADoIvosHAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIzj90fdAwDQntz1wpaAzPPFiuyAzIO/4gwKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDidwl0AgPblrhe2hLsEAO0AZ1AAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYJyAB5TCwkJFRUX53Pr16+e9/+rVq8rLy1P37t11xx13KCcnR9XV1YEuAwAARLCgnEEZMGCAzp07573t2bPHe9/cuXP14YcfauPGjdq1a5fOnj2rJ598MhhlAACACBWUbzPu1KmTnE5ns/U1NTVas2aNiouLNWbMGEnS2rVr1b9/f+3bt0+jRo0KRjkAACDCBCWg/PnPf1Zqaqo6d+4sl8ul5cuXq1evXjpw4IA8Ho8yMjK8Y/v166devXqpoqLihgHF7XbL7XZ7l2trayVJHo9HHo8noLU3zRfoeeGLPodGOPpsj7FCti1T2KMtn58IDtP73J6ez4L13OHPfFGWZQX0v/RHH32ky5cv6/7779e5c+e0ePFi/eUvf9GRI0f04Ycf6plnnvEJG5I0YsQIPfroo/rnf/7nFucsLCzU4sWLm60vLi5WXFxcIMsHAABBUldXpylTpqimpkYOh+OmYwMeUL7v4sWL6t27t15//XV16dKlVQGlpTMoaWlp+uabb265g/7yeDwqLy/XuHHjZLPZAjo3rqPPoRGOPg8sLA3Jdkxij7a0dHijFn4WLXdjVLjLabc6Qp+PFGaFuwRJwXvuqK2tVY8ePW4roATlJZ7vSkhI0H333aeTJ09q3Lhxunbtmi5evKiEhATvmOrq6havWWlit9tlt9ubrbfZbEF70g3m3LiOPodGKPvsbmiffzhuh7sxqkPvf6i05z6b9nwY6OcOf+YK+uegXL58WadOnVJKSoqGDRsmm82m7du3e+8/ceKEzpw5I5fLFexSAABAhAj4GZR/+qd/0qRJk9S7d2+dPXtWixYtUkxMjH72s58pPj5eM2bMUH5+vhITE+VwOPTcc8/J5XLxDh4AAOAV8IDy9ddf62c/+5m+/fZb/eAHP9DDDz+sffv26Qc/+IEk6Y033lB0dLRycnLkdruVlZWlt99+O9BlAACACBbwgLJhw4ab3t+5c2cVFRWpqKgo0JsGAADtBN/FAwAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4Af82YwAA0Dp3vbClzXN8sSI7AJWEH2dQAACAcTiDAkBSYP7PDQAChTMoAADAOAQUAABgHAIKAAAwDgEFAAAYh4tkAQBoRwJxwbs9xtLKEQEopg04gwIAAIxDQAEAAMYhoAAAAONwDQrQDtzoNeem15EHFpbK3RAV4qoAoPU4gwIAAIxDQAEAAMbhJR4gzPgOHABojjMoAADAOAQUAABgHAIKAAAwDgEFAAAYh4tk0SEF4sLUL1ZkB6ASAEBLOIMCAACMQ0ABAADG4SUeoJX4/BIACB7OoAAAAONwBuUG2vrlau3tAspAny1oy5fYtbfeAgCaC+sZlKKiIt11113q3LmzRo4cqf3794ezHAAAYIiwBZTf//73ys/P16JFi/SnP/1JDzzwgLKysnT+/PlwlQQAAAwRtoDy+uuva+bMmXrmmWeUnp6u1atXKy4uTr/5zW/CVRIAADBEWK5BuXbtmg4cOKCCggLvuujoaGVkZKiioqLZeLfbLbfb7V2uqamRJF24cEEejyegtXk8HtXV1amTJ1
oNja2/BuWef/pDm2v5pGBsm+cIlE71VwI7X6OlurrGVvX522+/bfv2A7w/pmpLn3H76HNo0OfQaer1t99+K5vNFrB5L126JEmyLOvWg60w+Mtf/mJJsvbu3euzft68edaIESOajV+0aJEliRs3bty4cePWDm5fffXVLbNCRLyLp6CgQPn5+d7lxsZGXbhwQd27d1dUVGBTdG1trdLS0vTVV1/J4XAEdG5cR59Dgz6HBn0ODfocOsHqtWVZunTpklJTU285NiwBpUePHoqJiVF1dbXP+urqajmdzmbj7Xa77Ha7z7qEhIRgliiHw8EvQAjQ59Cgz6FBn0ODPodOMHodHx9/W+PCcpFsbGyshg0bpu3bt3vXNTY2avv27XK5XOEoCQAAGCRsL/Hk5+crNzdXw4cP14gRI/Tmm2/qypUreuaZZ8JVEgAAMETYAspTTz2l//mf/9HLL7+sqqoqDRkyRFu3blVycnK4SpL015eTFi1a1OwlJQQWfQ4N+hwa9Dk06HPomNDrKMu6nff6AAAAhA5fFggAAIxDQAEAAMYhoAAAAOMQUAAAgHE6ZEApKirSXXfdpc6dO2vkyJHav3//Tcdv3LhR/fr1U+fOnTVo0CCVlJSEqNLI5k+ff/3rX+tHP/qR7rzzTt15553KyMi45X8X/JW/x3OTDRs2KCoqSk888URwC2wn/O3zxYsXlZeXp5SUFNntdt133308d9wGf/v85ptv6v7771eXLl2UlpamuXPn6urVqyGqNjLt3r1bkyZNUmpqqqKiovTBBx/c8jE7d+7U3/7t38put+uee+7RunXrgl5nWL6LJ5w2bNhgxcbGWr/5zW+so0ePWjNnzrQSEhKs6urqFsd//PHHVkxMjLVy5Urr2LFj1oIFCyybzWYdPnw4xJVHFn/7PGXKFKuoqMg6ePCgdfz4cevpp5+24uPjra+//jrElUcWf/vc5PTp09bf/M3fWD/60Y+sxx9/PDTFRjB/++x2u63hw4dbEydOtPbs2WOdPn3a2rlzp1VZWRniyiOLv31ev369ZbfbrfXr11unT5+2SktLrZSUFGvu3LkhrjyylJSUWC+99JL1/vvvW5KsTZs23XT8559/bsXFxVn5+fnWsWPHrLfeesuKiYmxtm7dGtQ6O1xAGTFihJWXl+ddbmhosFJTU63ly5e3OP6nP/2plZ2d7bNu5MiR1j/8wz8Etc5I52+fv6++vt7q1q2b9e677warxHahNX2ur6+3fvjDH1r/+q//auXm5hJQboO/fV61apV19913W9euXQtVie2Cv33Oy8uzxowZ47MuPz/feuihh4JaZ3tyOwHl+eeftwYMGOCz7qmnnrKysrKCWJlldaiXeK5du6YDBw4oIyPDuy46OloZGRmqqKho8TEVFRU+4yUpKyvrhuPRuj5/X11dnTwejxITE4NVZsRrbZ+XLFmipKQkzZgxIxRlRrzW9Pnf//3f5XK5lJeXp+TkZA0cOFDLli1TQ0NDqMqOOK3p8w9/+EMdOHDA+zLQ559/rpKSEk2cODEkNXcU4fo7GBHfZhwo33zzjRoaGpp9Wm1ycrL+67/+q8XHVFVVtTi+qqoqaHVGutb0+fvmz5+v1NTUZr8UuK41fd6zZ4/WrFmjysrKEFTYPrSmz59//rl27NihqVOnqqSkRCdPntTPf/5zeTweLVq0KBRlR5zW9HnKlCn65ptv9PDDD8uyLNXX1+vZZ5/Viy++GIqSO4wb/R2sra3V//3f/6lLly5B2W6HOoOCyLBixQpt2LBBmzZtUufOncNdTrtx6dIlTZs2Tb/+9a/Vo0ePcJfTrjU2NiopKUnvvPOOhg0bpqeeekovvfSSVq9eHe7S2pWdO3dq2bJlevvtt/WnP/1J77//vrZs2aKlS5eGuzQEQIc6g9KjRw/FxMSourraZ311dbWcTmeLj3E6nX6NR+v63OS1117TihUrtG3bNg0ePDiYZUY8f/t86tQpffHFF5o0aZJ3XW
NjoySpU6dOOnHihPr27RvcoiNQa47nlJQU2Ww2xcTEeNf1799fVVVVunbtmmJjY4NacyRqTZ8XLlyoadOm6e///u8lSYMGDdKVK1c0a9YsvfTSS4qO5v/BA+FGfwcdDkfQzp5IHewMSmxsrIYNG6bt27d71zU2Nmr79u1yuVwtPsblcvmMl6Ty8vIbjkfr+ixJK1eu1NKlS7V161YNHz48FKVGNH/73K9fPx0+fFiVlZXe22OPPaZHH31UlZWVSktLC2X5EaM1x/NDDz2kkydPegOgJP33f/+3UlJSCCc30Jo+19XVNQshTaHQ4mvmAiZsfweDegmugTZs2GDZ7XZr3bp11rFjx6xZs2ZZCQkJVlVVlWVZljVt2jTrhRde8I7/+OOPrU6dOlmvvfaadfz4cWvRokW8zfg2+NvnFStWWLGxsdZ7771nnTt3znu7dOlSuHYhIvjb5+/jXTy3x98+nzlzxurWrZs1Z84c68SJE9bmzZutpKQk65VXXgnXLkQEf/u8aNEiq1u3btbvfvc76/PPP7fKysqsvn37Wj/96U/DtQsR4dKlS9bBgwetgwcPWpKs119/3Tp48KD15ZdfWpZlWS+88II1bdo07/imtxnPmzfPOn78uFVUVMTbjIPlrbfesnr16mXFxsZaI0aMsPbt2+e978c//rGVm5vrM/4Pf/iDdd9991mxsbHWgAEDrC1btoS44sjkT5979+5tSWp2W7RoUegLjzD+Hs/fRUC5ff72ee/evdbIkSMtu91u3X333darr75q1dfXh7jqyONPnz0ej1VYWGj17dvX6ty5s5WWlmb9/Oc/t/73f/839IVHkP/4j/9o8fm2qbe5ubnWj3/842aPGTJkiBUbG2vdfffd1tq1a4NeZ5RlcR4MAACYpUNdgwIAACIDAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxvl/aJDLMtUUp7YAAAAASUVORK5CYII=",
|
|
"text/plain": [
|
|
"<Figure size 640x480 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"v = np.log(df['baseScore']+0.001)\n",
|
|
"v = (v - v.min())/v.max() - 1 \n",
|
|
"v = np.clip(v, 0, 1)\n",
|
|
"df['novelty'] = v\n",
|
|
"df['novelty'].hist(bins=26)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"../samples/2025_lw_parkinson-s-law-and-the-ideology-of-statistics-1.md 3.236717700958252\n",
|
|
"../samples/2025_lw_parkinson-s-law-and-the-ideology-of-statistics-1.md 3.236717700958252\n",
|
|
"../samples/2025_lw_what-s-the-short-timeline-plan.md 2.114389657974243\n",
|
|
"../samples/2025_lw_what-s-the-short-timeline-plan.md 2.114389657974243\n",
|
|
"../samples/2025_lw_the-laws-of-large-numbers.md 1.2245203256607056\n",
|
|
"../samples/2025_lw_the-laws-of-large-numbers.md 1.2245203256607056\n",
|
|
"../samples/2025_lw_the-intelligence-curse.md 1.2061121463775635\n",
|
|
"../samples/2025_lw_the-intelligence-curse.md 1.2061121463775635\n",
|
|
"../samples/2025_lw_human-study-on-ai-spear-phishing-campaigns.md 0.9995136260986328\n",
|
|
"../samples/2025_lw_human-study-on-ai-spear-phishing-campaigns.md 0.9995136260986328\n",
|
|
"../samples/2025_lw_the-subset-parity-learning-problem-much-more-than-you-wanted.md 0.9548193216323853\n",
|
|
"../samples/2025_lw_the-subset-parity-learning-problem-much-more-than-you-wanted.md 0.9548193216323853\n",
|
|
"../samples/2025_lw_2024-in-ai-predictions.md 0.8065339922904968\n",
|
|
"../samples/2025_lw_2024-in-ai-predictions.md 0.8065339922904968\n",
|
|
"../samples/2025_lw_debating-buying-nvda-in-2019.md 0.7926478385925293\n",
|
|
"../samples/2025_lw_debating-buying-nvda-in-2019.md 0.7926478385925293\n",
|
|
"../samples/2025_lw_review-planecrash.md 0.689734160900116\n",
|
|
"../samples/2025_lw_review-planecrash.md 0.689734160900116\n",
|
|
"../samples/2024_lw_by-default-capital-will-matter-more-than-ever-after-agi.md 0.6629015207290649\n",
|
|
"../samples/2024_lw_by-default-capital-will-matter-more-than-ever-after-agi.md 0.6629015207290649\n",
|
|
"../samples/2025_lw_the-field-of-ai-alignment-a-postmortem-and-what-to-do-about.md 0.5714353919029236\n",
|
|
"../samples/2025_lw_the-field-of-ai-alignment-a-postmortem-and-what-to-do-about.md 0.5714353919029236\n",
|
|
"../samples/2024_lw_the-plan-2024-update.md 0.542655885219574\n",
|
|
"../samples/2024_lw_the-plan-2024-update.md 0.542655885219574\n",
|
|
"../samples/2025_lw_comment-on-death-and-the-gorgon.md 0.5308915376663208\n",
|
|
"../samples/2025_lw_comment-on-death-and-the-gorgon.md 0.5308915376663208\n",
|
|
"../samples/2025_lw_my-agi-safety-research-2024-review-25-plans.md 0.49594494700431824\n",
|
|
"../samples/2025_lw_my-agi-safety-research-2024-review-25-plans.md 0.49594494700431824\n",
|
|
"../samples/2025_lw_preference-inversion.md 0.48199906945228577\n",
|
|
"../samples/2025_lw_preference-inversion.md 0.48199906945228577\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"def to_markdown(row: dict) -> str:\n",
|
|
" md = markdownify(row[\"htmlBody\"]).strip()\n",
|
|
"\n",
|
|
" return f\"\"\"---\n",
|
|
"title: \"{row['title'].replace('\"', \"'\")}\"\n",
|
|
"date: {row['modifiedAt']}\n",
|
|
"url: {row['pageUrl']}\n",
|
|
"novelty: {row['novelty']}\n",
|
|
"score: {row['score']}\n",
|
|
"baseScore: {row['baseScore']}\n",
|
|
"voteCount: {row['voteCount']}\n",
|
|
"---\n",
|
|
"{md}\n",
|
|
"\"\"\"\n",
|
|
"\n",
|
|
"\n",
|
|
"for i in range(15):\n",
|
|
" for ii in [i, -i-1]:\n",
|
|
" row = df.iloc[ii]\n",
|
|
" s = to_markdown(row)\n",
|
|
" f = Path(f'../samples/{row[\"modifiedAt\"].year}_lw_{row[\"slug\"]}.md')\n",
|
|
" f.write_text(s)\n",
|
|
" print(f\"{f} {row['score']:>4}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.0rc1"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|