diff --git a/notebooks/gutenberg/README.md b/notebooks/gutenberg/README.md new file mode 100644 index 00000000..d30a5943 --- /dev/null +++ b/notebooks/gutenberg/README.md @@ -0,0 +1,92 @@ +--- +dataset_info: + features: + - name: Text# + dtype: int64 + - name: Issued + dtype: timestamp[us] + - name: Title + dtype: string + - name: Authors + dtype: string + - name: Subjects + dtype: string + - name: LoCC + dtype: string + - name: Bookshelves + dtype: string + - name: Body + dtype: string + splits: + - name: train + num_bytes: 193811664 + num_examples: 495 + download_size: 125691249 + dataset_size: 193811664 +license: mit +task_categories: + - text-generation + - conversational +language: + - hu +tags: + - project gutenberg + - ebook + - gutenberg.org +pretty_name: Hungarian langauge eBooks from Project Gutenberg +size_categories: + - n<1K +--- + +# Dataset Card for "gutenberg_hu" + +# Dataset Card for Project Gutenberg - Hungarian eBooks + +## Dataset Description + +- **Repository:** + [Code](https://github.com/LAION-AI/Open-Assistant/openassistant/datasets/gutenberg/) + +## Source data + +Please **READ** the site's TOS before running the crawler Notebook and follow +these instructions: + +- The website will IP ban crawlers for going through each book's metadata page + separately. Instead use `catalog()` to access the list of available E-books. + For more information, visit: https://www.gutenberg.org/ebooks/feeds.html +- You can avoid running the crawler by mirroring the entire database of Project + Gutenberg or use one of their FTPs instead, and then call the `parse()` + function on each text +- For more on robot access see: + https://www.gutenberg.org/policy/robot_access.html + +How does it work? + +- The crawler downloads the raw HTML code for each E-book based on **Text#** id + (if available) +- The metadata and the body of text are not clearly separated so the parser will + try to split them, then remove transcriber's notes and e-book related + information from the body of text (text marked as copyrighted or malformed + will be skipped) +- If there is text both the metadata and the cleared body of text are saved, the + latter is then added to a filtered parquet file (will contain only the catalog + information and body of text for the books that were successfully retrieved) + +Copyright notice: + +- Some of the books are copyrighted! The crawler (parser) will ignore all books + with an english copyright header by utilizing a regex expression, but make + sure to check out the metadata for each book manually to ensure they are okay + to use in your country! More information on copyright: + https://www.gutenberg.org/help/copyright.html and + https://www.gutenberg.org/policy/permission.html +- Project Gutenberg has the following requests when using books without + metadata: _Books obtianed from the Project Gutenberg site should have the + following legal note next to them: "This eBook is for the use of anyone + anywhere in the United States and most other parts of the world at no cost and + with almost" no restrictions whatsoever. You may copy it, give it away or + re-use it under the terms of the Project Gutenberg License included with this + eBook or online at www.gutenberg.org. If you are not located in the United + States, you will have to check the laws of the country where you are located + before using this eBook."_ diff --git a/notebooks/gutenberg/__init__.py b/notebooks/gutenberg/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/notebooks/gutenberg/hub.py b/notebooks/gutenberg/hub.py new file mode 100644 index 00000000..e7d38166 --- /dev/null +++ b/notebooks/gutenberg/hub.py @@ -0,0 +1,16 @@ +from datasets import concatenate_datasets, load_dataset + + +def load(languages: list = ["en", "de", "fr", "es", "it", "pt", "nl", "hu"]): + ds = None + for lang in languages: + if ds is None: + ds = load_dataset(f"sedthh/gutenberg_{lang}") + else: + ds = concatenate_datasets([ds, f"sedthh/gutenberg_{lang}"]) + return ds + + +if __name__ == "__main__": + ds = load() + print(ds["train"]) diff --git a/notebooks/gutenberg/project_gutenberg_crawler.ipynb b/notebooks/gutenberg/project_gutenberg_crawler.ipynb new file mode 100644 index 00000000..76a19bb2 --- /dev/null +++ b/notebooks/gutenberg/project_gutenberg_crawler.ipynb @@ -0,0 +1,644 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Project Gutenber Crawler\n", + "\n", + "Make sure you read the site's TOS and the notebook's README.md on how to use the crawler." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/notebooks/gutenberg/project_gutenberg_crawler.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# uncomment and run below lines to set up if running in colab\n", + "# !git clone https://github.com/LAION-AI/Open-Assistant.git\n", + "# %cd Open-Assistant/notebooks/gutenberg\n", + "# !pip install -r requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# global settings\n", + "\n", + "LANG = (\n", + " \"en\" # crawl english language books, NOTE: there are a few houndred books with multiple languages such as 'en; es'\n", + ")\n", + "FOLDER = \"text\" # save metadata and body of text to this folder\n", + "CHUNKS = 1 # optionally divide the dataset into this many compressed parquet files if you have less memory\n", + "STATUS = \"crawled.csv\" # save the list of downloaded files and their status into this csv" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# import required packages\n", + "import os\n", + "import io\n", + "import re\n", + "import requests\n", + "import time\n", + "import warnings\n", + "\n", + "try:\n", + " from BeautifulSoup import BeautifulSoup\n", + "except ImportError:\n", + " from bs4 import BeautifulSoup\n", + "from tqdm import tqdm\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from typing import Tuple, Optional, Any" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Code for crawler" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "class GutenbergCrawler:\n", + "\n", + " HEADER = {\n", + " \"User-Agent\": \"Mozilla/5.0 (compatible; GutenbergCrawler/0.1)\",\n", + " }\n", + " TIMER = 600 # wait ms between calls\n", + " MIRRORS = [\n", + " \"http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/\",\n", + " \"https://www.gutenberg.org/dirs/\",\n", + " \"http://mirrors.xmission.com/gutenberg/\",\n", + " ] # see https://www.gutenberg.org/MIRRORS.ALL for available mirrors\n", + "\n", + " def __init__(self, folder: Optional[str] = None) -> None:\n", + " self.folder = folder\n", + " if self.folder is not None:\n", + " os.makedirs(self.folder, exist_ok=True)\n", + " self.calls = 0\n", + " self.last_call = 0\n", + "\n", + " def _get(self, url: str) -> str:\n", + " self.calls += 1\n", + " diff = max(0.0, self.TIMER - (time.time() - self.last_call))\n", + " if diff:\n", + " time.sleep(diff / 1000.0)\n", + " data = requests.get(url, headers=self.HEADER)\n", + " self.last_call = time.time()\n", + " if data.status_code == 404:\n", + " return None\n", + " try:\n", + " return data.content.decode(\"utf-8\")\n", + " except UnicodeDecodeError:\n", + " try:\n", + " return data.content.decode(\"ISO-8859-1\") # latin-1\n", + " except UnicodeDecodeError:\n", + " return data.content.decode(\"utf-8\", \"backslashreplace\")\n", + "\n", + " def catalog(self) -> pd.DataFrame:\n", + " try:\n", + " csv = pd.read_csv(\"https://www.gutenberg.org/cache/epub/feeds/pg_catalog.csv.gz\", sep=\",\")\n", + " except Exception:\n", + " raw = self._get(\"https://www.gutenberg.org/cache/epub/feeds/pg_catalog.csv\")\n", + " if raw is None:\n", + " raise ValueError(\"Catalog CSV file does not exist!\")\n", + " csv = pd.read_csv(io.StringIO(raw), sep=\",\")\n", + " return csv.loc[csv[\"Type\"] == \"Text\"].reset_index(drop=True)\n", + "\n", + " def search(self, url: str) -> dict:\n", + " \"\"\"Use catalog() instead! Returns dict with book_id: 'book title' pairs for gutenberg.org pages\"\"\"\n", + " assert \"/www.gutenberg.org\" in url, \"The URL must be a page at https://www.gutenberg.org/\"\n", + " html = self._get(url)\n", + " if html is None:\n", + " return {}\n", + " dom = BeautifulSoup(html, \"html.parser\")\n", + " results = {}\n", + " for a in dom.find_all(\"a\"):\n", + " for elem in re.findall(r\"(.+?)\", str(a)):\n", + " ebook, title = elem\n", + " results[int(ebook)] = title.replace(\"\\r
\", \"\\r\\n\")\n", + " return results\n", + "\n", + " def download(self, book: int) -> Optional[str]:\n", + " book = int(book)\n", + " assert book > 0\n", + " mirror = np.random.choice(self.MIRRORS)\n", + " if book < 10:\n", + " page = f\"0/{book}/\"\n", + " else:\n", + " page = \"/\".join([char for char in str(book)[:-1]]) + f\"/{book}/\"\n", + " url = f\"{mirror}{page}{book}-h/{book}-h.htm\"\n", + " return self._get(url)\n", + "\n", + " def parse(self, book: int, html: str) -> Tuple[Optional[str], Optional[str]]:\n", + " book = int(book)\n", + " assert book > 0\n", + " if html is None:\n", + " return None, None\n", + " dom = BeautifulSoup(html, \"html.parser\")\n", + " if dom is None or dom.title is None or dom.title.string is None or \"404\" in dom.title.string:\n", + " return None, None\n", + "\n", + " meta = \"\"\n", + " for pre in dom.select(\"title, pre\"):\n", + " meta += str(pre.get_text()).strip()\n", + " # remove metadata from dom afterwards\n", + " pre.extract()\n", + " if re.findall(r\"(?i)\\*{2,}[^\\n]+?(?:please.+?copyright|copyrighted.+?project)[^\\n]+?\\*{2,}\\r?\\n\", meta):\n", + " warnings.warn(f\"Book {book} is copyrighted.\")\n", + " return None, None\n", + " for img in dom.select(\"img\"):\n", + " # add image alt attributes as text\n", + " try:\n", + " img.insert(0, img[\"alt\"])\n", + " except KeyError:\n", + " pass\n", + " text = str(dom.get_text()).strip()\n", + " if re.findall(r\"(?i)\\*{2,}[^\\n]+?(?:please.+?copyright|copyrighted.+?project)[^\\n]+?\\*{2,}\\r?\\n\", text):\n", + " warnings.warn(f\"Book {book} is copyrighted.\")\n", + " return None, None\n", + "\n", + " s = re.split(r\"(?i)\\*{2,}[^\\n]+?project gutenberg[^\\n]+?\\*{2,}\\s*[\\r\\n]+\", text) # 49843\n", + " if len(s) > 1:\n", + " if len(s) > 3:\n", + " warnings.warn(f\"Book {book} is malformed.\")\n", + " return None, None\n", + " meta += s[0]\n", + " return meta, s[1]\n", + " return meta, text\n", + "\n", + " @staticmethod\n", + " def pretty(text: Optional[str]) -> str:\n", + " if not text:\n", + " return \"\"\n", + " # attempt to remove transcriber's notes\n", + " text = re.sub(r\"(?i)(?:\\[|\\b)transcriber[\\'’]?s? notes?\\s*(?:[^\\xa0\\n].*?\\]?(?:\\r?\\n){1,2})+\", \"\", text)\n", + " # attempt to remove e-text info\n", + " text = re.sub(\n", + " r\"(?i)e-text prepared(?:[^\\xa0]\\(?.+\\)?\\r?\\n{1,3})+(?:\\xa0*\\s*note\\:\\s*(?:.+\\s*\\r{0,2}\\n{1,2}){1,5}\\xa0\\s+)?\",\n", + " \"\",\n", + " text,\n", + " )\n", + " # standardize line endings\n", + " text = \"\\r\\n\".join(text.splitlines())\n", + " text = re.sub(r\"(\\r\\n){3,}\", \"\\r\\n\\r\\n\\r\\n\", text).strip()\n", + " return text\n", + "\n", + " def _write(self, file: str, content: str) -> None:\n", + " path = os.path.join(self.folder, file) if self.folder is not None else file\n", + " with open(path, \"w+\", encoding=\"utf-8\") as f:\n", + " f.write(content)\n", + "\n", + " def save(self, book: int) -> bool:\n", + " html = self.download(book)\n", + " meta, text = self.parse(book, html)\n", + " if meta:\n", + " self._write(f\"{book}_meta.txt\", meta)\n", + " if text:\n", + " self._write(f\"{book}_text.txt\", text)\n", + " return bool(text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Start crawling" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "gc = GutenbergCrawler(FOLDER) # use text/ folder to save files" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Text#TypeIssuedTitleLanguageAuthorsSubjectsLoCCBookshelves
993610631Text2004-01-01Halleck's New English LiteratureenHalleck, Reuben Post, 1859-1936English literature -- History and criticismPRNaN
2045621443Text2007-05-15Vesty of the BasinsenGreene, Sarah Pratt McLean, 1856-1935Maine -- FictionPSNaN
3539536710Text2011-07-12The Black OpalenPrichard, Katharine Susannah, 1884-1969Opal mines and mining -- Australia -- FictionPRNaN
2783229139Text2009-06-17No Pets AllowedenCummings, Monette, 1914-1999Science fiction; Short storiesPSScience Fiction
6701068338Text2022-06-17Nick Carter Stories No. 160, October 2, 1915: ...enCarter, Nicholas (House name); Lebhar, Bertram...Popular literature -- Periodicals; Detective a...PSNaN
\n", + "
" + ], + "text/plain": [ + " Text# Type Issued \\\n", + "9936 10631 Text 2004-01-01 \n", + "20456 21443 Text 2007-05-15 \n", + "35395 36710 Text 2011-07-12 \n", + "27832 29139 Text 2009-06-17 \n", + "67010 68338 Text 2022-06-17 \n", + "\n", + " Title Language \\\n", + "9936 Halleck's New English Literature en \n", + "20456 Vesty of the Basins en \n", + "35395 The Black Opal en \n", + "27832 No Pets Allowed en \n", + "67010 Nick Carter Stories No. 160, October 2, 1915: ... en \n", + "\n", + " Authors \\\n", + "9936 Halleck, Reuben Post, 1859-1936 \n", + "20456 Greene, Sarah Pratt McLean, 1856-1935 \n", + "35395 Prichard, Katharine Susannah, 1884-1969 \n", + "27832 Cummings, Monette, 1914-1999 \n", + "67010 Carter, Nicholas (House name); Lebhar, Bertram... \n", + "\n", + " Subjects LoCC Bookshelves \n", + "9936 English literature -- History and criticism PR NaN \n", + "20456 Maine -- Fiction PS NaN \n", + "35395 Opal mines and mining -- Australia -- Fiction PR NaN \n", + "27832 Science fiction; Short stories PS Science Fiction \n", + "67010 Popular literature -- Periodicals; Detective a... PS NaN " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get the catalog of ebooks (only text types will be returned)\n", + "df = gc.catalog()\n", + "df = df.loc[df[\"Language\"] == LANG]\n", + "assert len(df), \"No matching items in catalog!\"\n", + "df = df.sample(frac=1) # random shuffle\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 out of 55128 items done.\n" + ] + } + ], + "source": [ + "if os.path.exists(STATUS):\n", + " crawled = pd.read_csv(STATUS)\n", + "else:\n", + " crawled = pd.DataFrame({\"book\": [], \"success\": []})\n", + "print(f\"{len(crawled)} out of {len(df)} items done.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#39750 To Your Dog and to My Dog (en) ✔️ - 1.626s\n", + "#22585 —And Devious the Line of Duty (en) ✔️ - 1.634s\n", + "#40234 The Early Introduction of Bogus Freemasonry in the United States of America and Texas Among Colored Masons (en) ✔️ - 1.634s\n", + "#64470 Biblical Revision, its duties and conditions\n", + "A sermon preached in St. Paul's Cathedral at the special evening service, on Sunday, March 13, 1870 (en) ✔️ - 1.402s\n", + "#66291 The Plagiarist From Rigel IV (en) ✔️ - 1.479s\n", + "#46746 The Chronicles of Newgate, vol. 2/2 (en) ✔️ - 1.933s\n", + "#39744 Arne; Early Tales and Sketches\n", + "Patriots Edition (en) ✔️ - 1.917s\n", + "#30681 Nanny Merry\n", + "or, What Made the Difference? (en) ✔️ - 1.184s\n", + "#68752 Lives and exploits of the most noted highwaymen, robbers and murderers of all nations\n", + "Drawn from the most authentic sources and brought down to the present time (en) ✔️ - 1.300s\n", + "#21364 The Rajah of Dah (en) ✔️ - 2.184s\n", + "#9942 Lectures on Ten British Mathematicians of the Nineteenth Century (en) ❌ - 1.151s\n", + "#69725 Elementary woodworking (en) ✔️ - 1.421s\n", + "#51011 The Ghost Camp; or, the Avengers (en) ✔️ - 2.279s\n", + "#15175 A Century of Wrong (en) ✔️ - 1.587s\n", + "#11058 Jack Archer: A Tale of the Crimea (en) ✔️ - 1.530s\n", + "#25017 A Son of the Immortals (en) ✔️ - 2.391s\n", + "#15476 The Mahabharata of Krishna-Dwaipayana Vyasa, Volume 3\n", + "Books 8, 9, 10, 11 and 12 (en) ❌ - 1.151s\n", + "#39870 Bert Wilson, Marathon Winner (en) ✔️ - 3.770s\n", + "#8066 The Bible, King James version, Book 66: Revelation (en) ❌ - 1.200s\n", + "#28034 Scenes in the Hawaiian Islands and California (en) ✔️ - 1.033s\n", + "#61513 The Phantom Death, etc. (en) ✔️ - 1.884s\n", + "#6180 A Romany of the Snows, vol. 1\n", + "Being a Continuation of the Personal Histories of \"Pierre and His People\" and the Last Existing Records of Pretty Pierre (en) ❌ - 0.718s\n", + "#47252 Mother Earth's Children: The Frolics of the Fruits and Vegetables (en) ✔️ - 1.301s\n", + "#48566 Barracks, Bivouacs and Battles (en) ✔️ - 3.749s\n", + "#33425 The Crimson Sweater (en) ✔️ - 1.958s\n", + "▶▶▶ 25 done (21 successful) out of 55128 ◀◀◀\n", + "Done.\n" + ] + } + ], + "source": [ + "# NOTE: this will take really long depending on the number of ebooks selected\n", + "for index, row in df.iterrows():\n", + " book = row[\"Text#\"]\n", + " if book not in crawled[\"book\"].values:\n", + " t = time.time()\n", + " print(f\"#{book} {row['Title']} ({row['Language']})\", end=\" \")\n", + " if gc.save(book):\n", + " print(\"✔️\", end=\" \")\n", + " crawled = crawled.append({\"book\": book, \"success\": True}, ignore_index=True)\n", + " else:\n", + " print(\"❌\", end=\" \")\n", + " crawled = crawled.append({\"book\": book, \"success\": False}, ignore_index=True)\n", + " print(f\"- {(time.time() - t):.3f}s\")\n", + " crawled.to_csv(STATUS, index=False)\n", + " if len(crawled) % 25 == 0:\n", + " print(f\"▶▶▶ {len(crawled)} done ({int(crawled['success'].sum()) } successful) out of {len(df)} ◀◀◀\")\n", + "\n", + "print(\"Done.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Add the crawled text files into parquet datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "21 out of 55128 (0.04%) available.\n" + ] + } + ], + "source": [ + "crawled = pd.read_csv(STATUS)\n", + "crawled = crawled.loc[crawled[\"success\"] == True]\n", + "crawled.rename(columns={\"book\": \"Text#\"}, inplace=True)\n", + "\n", + "gc = GutenbergCrawler(FOLDER)\n", + "df = gc.catalog()\n", + "df = df.loc[df[\"Language\"] == LANG]\n", + "\n", + "print(f\"{len(crawled)} out of {len(df)} ({len(crawled) / len(df) * 100.:.2f}%) available.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "21" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.drop_duplicates(subset=[\"Text#\"], inplace=True)\n", + "df = pd.merge(df, crawled, on=[\"Text#\"], how=\"inner\")\n", + "assert not len(df.loc[df[\"success\"] == False])\n", + "del crawled\n", + "df.drop(columns=[\"Type\", \"Language\", \"success\"], inplace=True)\n", + "df.sort_values(by=\"Text#\", ascending=True, inplace=True)\n", + "len(df) # number of items after merging with metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 19.45it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 24.30it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 33.12it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 15.04it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 27.61it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "def read(file: str) -> Optional[str]:\n", + " result = None\n", + " if os.path.exists(file):\n", + " with open(file, \"r\", encoding=\"utf-8\") as f:\n", + " result = f.read()\n", + " return result\n", + "\n", + "\n", + "def strip(value: Any) -> str:\n", + " return str(value).strip() if value and pd.notna(value) else \"\"\n", + "\n", + "\n", + "for chunk in range(CHUNKS):\n", + " n = len(df) // CHUNKS\n", + " start, end = chunk * n, (chunk + 1) * n if chunk < CHUNKS - 1 else len(df)\n", + "\n", + " updated = {col: [] for col in list(df.columns) + [\"Body\"]}\n", + " books = df[\"Text#\"].values[start:end]\n", + " for book in tqdm(books):\n", + " text = read(os.path.join(FOLDER, f\"{book}_text.txt\"))\n", + " text = gc.pretty(text)\n", + " if not text:\n", + " continue\n", + "\n", + " df_row = df.loc[df[\"Text#\"] == book]\n", + " updated[\"Text#\"].append(book)\n", + " updated[\"Issued\"].append(pd.to_datetime(df_row[\"Issued\"].values[0], format=\"%Y-%m-%d\", errors=\"coerce\"))\n", + " updated[\"Title\"].append(strip(df_row[\"Title\"].values[0]))\n", + " updated[\"Authors\"].append(strip(df_row[\"Authors\"].values[0]))\n", + " updated[\"Subjects\"].append(strip(df_row[\"Subjects\"].values[0]))\n", + " updated[\"LoCC\"].append(strip(df_row[\"LoCC\"].values[0]))\n", + " updated[\"Bookshelves\"].append(strip(df_row[\"Bookshelves\"].values[0]))\n", + " updated[\"Body\"].append(text)\n", + "\n", + " updated = pd.DataFrame(updated)\n", + " if CHUNKS == 1:\n", + " updated.to_parquet(f\"gutenberg_{LANG}_all.pq\", index=False, engine=\"pyarrow\", compression=\"gzip\")\n", + " else:\n", + " updated.to_parquet(\n", + " f\"gutenberg_{LANG}_{chunk + 1}_of_{CHUNKS}.pq\", index=False, engine=\"pyarrow\", compression=\"gzip\"\n", + " )\n", + " del updated\n", + "\n", + "print(\"Done.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.2" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/gutenberg/requirements.txt b/notebooks/gutenberg/requirements.txt new file mode 100644 index 00000000..757d63a3 --- /dev/null +++ b/notebooks/gutenberg/requirements.txt @@ -0,0 +1,5 @@ +beautifulsoup4 +numpy +pandas +pyarrow +requests