mirror of
https://github.com/wassname/Open-Assistant.git
synced 2026-07-02 17:00:28 +08:00
Gutenberg eBook Crawler + Datasets (#1463)
* Added Gutenberg Crawler Notebook - added configrable Jupyter Notebook with crawler + ability to save to parquet - added README.md - added requirements.txt for running in venv / collab
This commit is contained in:
@@ -0,0 +1,92 @@
|
||||
---
|
||||
dataset_info:
|
||||
features:
|
||||
- name: Text#
|
||||
dtype: int64
|
||||
- name: Issued
|
||||
dtype: timestamp[us]
|
||||
- name: Title
|
||||
dtype: string
|
||||
- name: Authors
|
||||
dtype: string
|
||||
- name: Subjects
|
||||
dtype: string
|
||||
- name: LoCC
|
||||
dtype: string
|
||||
- name: Bookshelves
|
||||
dtype: string
|
||||
- name: Body
|
||||
dtype: string
|
||||
splits:
|
||||
- name: train
|
||||
num_bytes: 193811664
|
||||
num_examples: 495
|
||||
download_size: 125691249
|
||||
dataset_size: 193811664
|
||||
license: mit
|
||||
task_categories:
|
||||
- text-generation
|
||||
- conversational
|
||||
language:
|
||||
- hu
|
||||
tags:
|
||||
- project gutenberg
|
||||
- ebook
|
||||
- gutenberg.org
|
||||
pretty_name: Hungarian langauge eBooks from Project Gutenberg
|
||||
size_categories:
|
||||
- n<1K
|
||||
---
|
||||
|
||||
# Dataset Card for "gutenberg_hu"
|
||||
|
||||
# Dataset Card for Project Gutenberg - Hungarian eBooks
|
||||
|
||||
## Dataset Description
|
||||
|
||||
- **Repository:**
|
||||
[Code](https://github.com/LAION-AI/Open-Assistant/openassistant/datasets/gutenberg/)
|
||||
|
||||
## Source data
|
||||
|
||||
Please **READ** the site's TOS before running the crawler Notebook and follow
|
||||
these instructions:
|
||||
|
||||
- The website will IP ban crawlers for going through each book's metadata page
|
||||
separately. Instead use `catalog()` to access the list of available E-books.
|
||||
For more information, visit: https://www.gutenberg.org/ebooks/feeds.html
|
||||
- You can avoid running the crawler by mirroring the entire database of Project
|
||||
Gutenberg or use one of their FTPs instead, and then call the `parse()`
|
||||
function on each text
|
||||
- For more on robot access see:
|
||||
https://www.gutenberg.org/policy/robot_access.html
|
||||
|
||||
How does it work?
|
||||
|
||||
- The crawler downloads the raw HTML code for each E-book based on **Text#** id
|
||||
(if available)
|
||||
- The metadata and the body of text are not clearly separated so the parser will
|
||||
try to split them, then remove transcriber's notes and e-book related
|
||||
information from the body of text (text marked as copyrighted or malformed
|
||||
will be skipped)
|
||||
- If there is text both the metadata and the cleared body of text are saved, the
|
||||
latter is then added to a filtered parquet file (will contain only the catalog
|
||||
information and body of text for the books that were successfully retrieved)
|
||||
|
||||
Copyright notice:
|
||||
|
||||
- Some of the books are copyrighted! The crawler (parser) will ignore all books
|
||||
with an english copyright header by utilizing a regex expression, but make
|
||||
sure to check out the metadata for each book manually to ensure they are okay
|
||||
to use in your country! More information on copyright:
|
||||
https://www.gutenberg.org/help/copyright.html and
|
||||
https://www.gutenberg.org/policy/permission.html
|
||||
- Project Gutenberg has the following requests when using books without
|
||||
metadata: _Books obtianed from the Project Gutenberg site should have the
|
||||
following legal note next to them: "This eBook is for the use of anyone
|
||||
anywhere in the United States and most other parts of the world at no cost and
|
||||
with almost" no restrictions whatsoever. You may copy it, give it away or
|
||||
re-use it under the terms of the Project Gutenberg License included with this
|
||||
eBook or online at www.gutenberg.org. If you are not located in the United
|
||||
States, you will have to check the laws of the country where you are located
|
||||
before using this eBook."_
|
||||
@@ -0,0 +1,16 @@
|
||||
from datasets import concatenate_datasets, load_dataset
|
||||
|
||||
|
||||
def load(languages: list = ["en", "de", "fr", "es", "it", "pt", "nl", "hu"]):
|
||||
ds = None
|
||||
for lang in languages:
|
||||
if ds is None:
|
||||
ds = load_dataset(f"sedthh/gutenberg_{lang}")
|
||||
else:
|
||||
ds = concatenate_datasets([ds, f"sedthh/gutenberg_{lang}"])
|
||||
return ds
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
ds = load()
|
||||
print(ds["train"])
|
||||
@@ -0,0 +1,644 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Project Gutenber Crawler\n",
|
||||
"\n",
|
||||
"Make sure you read the site's TOS and the notebook's README.md on how to use the crawler."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"[](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/notebooks/gutenberg/project_gutenberg_crawler.ipynb)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# uncomment and run below lines to set up if running in colab\n",
|
||||
"# !git clone https://github.com/LAION-AI/Open-Assistant.git\n",
|
||||
"# %cd Open-Assistant/notebooks/gutenberg\n",
|
||||
"# !pip install -r requirements.txt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# global settings\n",
|
||||
"\n",
|
||||
"LANG = (\n",
|
||||
" \"en\" # crawl english language books, NOTE: there are a few houndred books with multiple languages such as 'en; es'\n",
|
||||
")\n",
|
||||
"FOLDER = \"text\" # save metadata and body of text to this folder\n",
|
||||
"CHUNKS = 1 # optionally divide the dataset into this many compressed parquet files if you have less memory\n",
|
||||
"STATUS = \"crawled.csv\" # save the list of downloaded files and their status into this csv"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# import required packages\n",
|
||||
"import os\n",
|
||||
"import io\n",
|
||||
"import re\n",
|
||||
"import requests\n",
|
||||
"import time\n",
|
||||
"import warnings\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" from BeautifulSoup import BeautifulSoup\n",
|
||||
"except ImportError:\n",
|
||||
" from bs4 import BeautifulSoup\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"from typing import Tuple, Optional, Any"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Code for crawler"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class GutenbergCrawler:\n",
|
||||
"\n",
|
||||
" HEADER = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (compatible; GutenbergCrawler/0.1)\",\n",
|
||||
" }\n",
|
||||
" TIMER = 600 # wait ms between calls\n",
|
||||
" MIRRORS = [\n",
|
||||
" \"http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/\",\n",
|
||||
" \"https://www.gutenberg.org/dirs/\",\n",
|
||||
" \"http://mirrors.xmission.com/gutenberg/\",\n",
|
||||
" ] # see https://www.gutenberg.org/MIRRORS.ALL for available mirrors\n",
|
||||
"\n",
|
||||
" def __init__(self, folder: Optional[str] = None) -> None:\n",
|
||||
" self.folder = folder\n",
|
||||
" if self.folder is not None:\n",
|
||||
" os.makedirs(self.folder, exist_ok=True)\n",
|
||||
" self.calls = 0\n",
|
||||
" self.last_call = 0\n",
|
||||
"\n",
|
||||
" def _get(self, url: str) -> str:\n",
|
||||
" self.calls += 1\n",
|
||||
" diff = max(0.0, self.TIMER - (time.time() - self.last_call))\n",
|
||||
" if diff:\n",
|
||||
" time.sleep(diff / 1000.0)\n",
|
||||
" data = requests.get(url, headers=self.HEADER)\n",
|
||||
" self.last_call = time.time()\n",
|
||||
" if data.status_code == 404:\n",
|
||||
" return None\n",
|
||||
" try:\n",
|
||||
" return data.content.decode(\"utf-8\")\n",
|
||||
" except UnicodeDecodeError:\n",
|
||||
" try:\n",
|
||||
" return data.content.decode(\"ISO-8859-1\") # latin-1\n",
|
||||
" except UnicodeDecodeError:\n",
|
||||
" return data.content.decode(\"utf-8\", \"backslashreplace\")\n",
|
||||
"\n",
|
||||
" def catalog(self) -> pd.DataFrame:\n",
|
||||
" try:\n",
|
||||
" csv = pd.read_csv(\"https://www.gutenberg.org/cache/epub/feeds/pg_catalog.csv.gz\", sep=\",\")\n",
|
||||
" except Exception:\n",
|
||||
" raw = self._get(\"https://www.gutenberg.org/cache/epub/feeds/pg_catalog.csv\")\n",
|
||||
" if raw is None:\n",
|
||||
" raise ValueError(\"Catalog CSV file does not exist!\")\n",
|
||||
" csv = pd.read_csv(io.StringIO(raw), sep=\",\")\n",
|
||||
" return csv.loc[csv[\"Type\"] == \"Text\"].reset_index(drop=True)\n",
|
||||
"\n",
|
||||
" def search(self, url: str) -> dict:\n",
|
||||
" \"\"\"Use catalog() instead! Returns dict with book_id: 'book title' pairs for gutenberg.org pages\"\"\"\n",
|
||||
" assert \"/www.gutenberg.org\" in url, \"The URL must be a page at https://www.gutenberg.org/\"\n",
|
||||
" html = self._get(url)\n",
|
||||
" if html is None:\n",
|
||||
" return {}\n",
|
||||
" dom = BeautifulSoup(html, \"html.parser\")\n",
|
||||
" results = {}\n",
|
||||
" for a in dom.find_all(\"a\"):\n",
|
||||
" for elem in re.findall(r\"<a href=\\\"/ebooks/(\\d+)\\\">(.+?)</a>\", str(a)):\n",
|
||||
" ebook, title = elem\n",
|
||||
" results[int(ebook)] = title.replace(\"\\r<br/>\", \"\\r\\n\")\n",
|
||||
" return results\n",
|
||||
"\n",
|
||||
" def download(self, book: int) -> Optional[str]:\n",
|
||||
" book = int(book)\n",
|
||||
" assert book > 0\n",
|
||||
" mirror = np.random.choice(self.MIRRORS)\n",
|
||||
" if book < 10:\n",
|
||||
" page = f\"0/{book}/\"\n",
|
||||
" else:\n",
|
||||
" page = \"/\".join([char for char in str(book)[:-1]]) + f\"/{book}/\"\n",
|
||||
" url = f\"{mirror}{page}{book}-h/{book}-h.htm\"\n",
|
||||
" return self._get(url)\n",
|
||||
"\n",
|
||||
" def parse(self, book: int, html: str) -> Tuple[Optional[str], Optional[str]]:\n",
|
||||
" book = int(book)\n",
|
||||
" assert book > 0\n",
|
||||
" if html is None:\n",
|
||||
" return None, None\n",
|
||||
" dom = BeautifulSoup(html, \"html.parser\")\n",
|
||||
" if dom is None or dom.title is None or dom.title.string is None or \"404\" in dom.title.string:\n",
|
||||
" return None, None\n",
|
||||
"\n",
|
||||
" meta = \"\"\n",
|
||||
" for pre in dom.select(\"title, pre\"):\n",
|
||||
" meta += str(pre.get_text()).strip()\n",
|
||||
" # remove metadata from dom afterwards\n",
|
||||
" pre.extract()\n",
|
||||
" if re.findall(r\"(?i)\\*{2,}[^\\n]+?(?:please.+?copyright|copyrighted.+?project)[^\\n]+?\\*{2,}\\r?\\n\", meta):\n",
|
||||
" warnings.warn(f\"Book {book} is copyrighted.\")\n",
|
||||
" return None, None\n",
|
||||
" for img in dom.select(\"img\"):\n",
|
||||
" # add image alt attributes as text\n",
|
||||
" try:\n",
|
||||
" img.insert(0, img[\"alt\"])\n",
|
||||
" except KeyError:\n",
|
||||
" pass\n",
|
||||
" text = str(dom.get_text()).strip()\n",
|
||||
" if re.findall(r\"(?i)\\*{2,}[^\\n]+?(?:please.+?copyright|copyrighted.+?project)[^\\n]+?\\*{2,}\\r?\\n\", text):\n",
|
||||
" warnings.warn(f\"Book {book} is copyrighted.\")\n",
|
||||
" return None, None\n",
|
||||
"\n",
|
||||
" s = re.split(r\"(?i)\\*{2,}[^\\n]+?project gutenberg[^\\n]+?\\*{2,}\\s*[\\r\\n]+\", text) # 49843\n",
|
||||
" if len(s) > 1:\n",
|
||||
" if len(s) > 3:\n",
|
||||
" warnings.warn(f\"Book {book} is malformed.\")\n",
|
||||
" return None, None\n",
|
||||
" meta += s[0]\n",
|
||||
" return meta, s[1]\n",
|
||||
" return meta, text\n",
|
||||
"\n",
|
||||
" @staticmethod\n",
|
||||
" def pretty(text: Optional[str]) -> str:\n",
|
||||
" if not text:\n",
|
||||
" return \"\"\n",
|
||||
" # attempt to remove transcriber's notes\n",
|
||||
" text = re.sub(r\"(?i)(?:\\[|\\b)transcriber[\\'’]?s? notes?\\s*(?:[^\\xa0\\n].*?\\]?(?:\\r?\\n){1,2})+\", \"\", text)\n",
|
||||
" # attempt to remove e-text info\n",
|
||||
" text = re.sub(\n",
|
||||
" r\"(?i)e-text prepared(?:[^\\xa0]\\(?.+\\)?\\r?\\n{1,3})+(?:\\xa0*\\s*note\\:\\s*(?:.+\\s*\\r{0,2}\\n{1,2}){1,5}\\xa0\\s+)?\",\n",
|
||||
" \"\",\n",
|
||||
" text,\n",
|
||||
" )\n",
|
||||
" # standardize line endings\n",
|
||||
" text = \"\\r\\n\".join(text.splitlines())\n",
|
||||
" text = re.sub(r\"(\\r\\n){3,}\", \"\\r\\n\\r\\n\\r\\n\", text).strip()\n",
|
||||
" return text\n",
|
||||
"\n",
|
||||
" def _write(self, file: str, content: str) -> None:\n",
|
||||
" path = os.path.join(self.folder, file) if self.folder is not None else file\n",
|
||||
" with open(path, \"w+\", encoding=\"utf-8\") as f:\n",
|
||||
" f.write(content)\n",
|
||||
"\n",
|
||||
" def save(self, book: int) -> bool:\n",
|
||||
" html = self.download(book)\n",
|
||||
" meta, text = self.parse(book, html)\n",
|
||||
" if meta:\n",
|
||||
" self._write(f\"{book}_meta.txt\", meta)\n",
|
||||
" if text:\n",
|
||||
" self._write(f\"{book}_text.txt\", text)\n",
|
||||
" return bool(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Start crawling"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"gc = GutenbergCrawler(FOLDER) # use text/ folder to save files"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Text#</th>\n",
|
||||
" <th>Type</th>\n",
|
||||
" <th>Issued</th>\n",
|
||||
" <th>Title</th>\n",
|
||||
" <th>Language</th>\n",
|
||||
" <th>Authors</th>\n",
|
||||
" <th>Subjects</th>\n",
|
||||
" <th>LoCC</th>\n",
|
||||
" <th>Bookshelves</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>9936</th>\n",
|
||||
" <td>10631</td>\n",
|
||||
" <td>Text</td>\n",
|
||||
" <td>2004-01-01</td>\n",
|
||||
" <td>Halleck's New English Literature</td>\n",
|
||||
" <td>en</td>\n",
|
||||
" <td>Halleck, Reuben Post, 1859-1936</td>\n",
|
||||
" <td>English literature -- History and criticism</td>\n",
|
||||
" <td>PR</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>20456</th>\n",
|
||||
" <td>21443</td>\n",
|
||||
" <td>Text</td>\n",
|
||||
" <td>2007-05-15</td>\n",
|
||||
" <td>Vesty of the Basins</td>\n",
|
||||
" <td>en</td>\n",
|
||||
" <td>Greene, Sarah Pratt McLean, 1856-1935</td>\n",
|
||||
" <td>Maine -- Fiction</td>\n",
|
||||
" <td>PS</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>35395</th>\n",
|
||||
" <td>36710</td>\n",
|
||||
" <td>Text</td>\n",
|
||||
" <td>2011-07-12</td>\n",
|
||||
" <td>The Black Opal</td>\n",
|
||||
" <td>en</td>\n",
|
||||
" <td>Prichard, Katharine Susannah, 1884-1969</td>\n",
|
||||
" <td>Opal mines and mining -- Australia -- Fiction</td>\n",
|
||||
" <td>PR</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>27832</th>\n",
|
||||
" <td>29139</td>\n",
|
||||
" <td>Text</td>\n",
|
||||
" <td>2009-06-17</td>\n",
|
||||
" <td>No Pets Allowed</td>\n",
|
||||
" <td>en</td>\n",
|
||||
" <td>Cummings, Monette, 1914-1999</td>\n",
|
||||
" <td>Science fiction; Short stories</td>\n",
|
||||
" <td>PS</td>\n",
|
||||
" <td>Science Fiction</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>67010</th>\n",
|
||||
" <td>68338</td>\n",
|
||||
" <td>Text</td>\n",
|
||||
" <td>2022-06-17</td>\n",
|
||||
" <td>Nick Carter Stories No. 160, October 2, 1915: ...</td>\n",
|
||||
" <td>en</td>\n",
|
||||
" <td>Carter, Nicholas (House name); Lebhar, Bertram...</td>\n",
|
||||
" <td>Popular literature -- Periodicals; Detective a...</td>\n",
|
||||
" <td>PS</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Text# Type Issued \\\n",
|
||||
"9936 10631 Text 2004-01-01 \n",
|
||||
"20456 21443 Text 2007-05-15 \n",
|
||||
"35395 36710 Text 2011-07-12 \n",
|
||||
"27832 29139 Text 2009-06-17 \n",
|
||||
"67010 68338 Text 2022-06-17 \n",
|
||||
"\n",
|
||||
" Title Language \\\n",
|
||||
"9936 Halleck's New English Literature en \n",
|
||||
"20456 Vesty of the Basins en \n",
|
||||
"35395 The Black Opal en \n",
|
||||
"27832 No Pets Allowed en \n",
|
||||
"67010 Nick Carter Stories No. 160, October 2, 1915: ... en \n",
|
||||
"\n",
|
||||
" Authors \\\n",
|
||||
"9936 Halleck, Reuben Post, 1859-1936 \n",
|
||||
"20456 Greene, Sarah Pratt McLean, 1856-1935 \n",
|
||||
"35395 Prichard, Katharine Susannah, 1884-1969 \n",
|
||||
"27832 Cummings, Monette, 1914-1999 \n",
|
||||
"67010 Carter, Nicholas (House name); Lebhar, Bertram... \n",
|
||||
"\n",
|
||||
" Subjects LoCC Bookshelves \n",
|
||||
"9936 English literature -- History and criticism PR NaN \n",
|
||||
"20456 Maine -- Fiction PS NaN \n",
|
||||
"35395 Opal mines and mining -- Australia -- Fiction PR NaN \n",
|
||||
"27832 Science fiction; Short stories PS Science Fiction \n",
|
||||
"67010 Popular literature -- Periodicals; Detective a... PS NaN "
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# get the catalog of ebooks (only text types will be returned)\n",
|
||||
"df = gc.catalog()\n",
|
||||
"df = df.loc[df[\"Language\"] == LANG]\n",
|
||||
"assert len(df), \"No matching items in catalog!\"\n",
|
||||
"df = df.sample(frac=1) # random shuffle\n",
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0 out of 55128 items done.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"if os.path.exists(STATUS):\n",
|
||||
" crawled = pd.read_csv(STATUS)\n",
|
||||
"else:\n",
|
||||
" crawled = pd.DataFrame({\"book\": [], \"success\": []})\n",
|
||||
"print(f\"{len(crawled)} out of {len(df)} items done.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"#39750 To Your Dog and to My Dog (en) ✔️ - 1.626s\n",
|
||||
"#22585 —And Devious the Line of Duty (en) ✔️ - 1.634s\n",
|
||||
"#40234 The Early Introduction of Bogus Freemasonry in the United States of America and Texas Among Colored Masons (en) ✔️ - 1.634s\n",
|
||||
"#64470 Biblical Revision, its duties and conditions\n",
|
||||
"A sermon preached in St. Paul's Cathedral at the special evening service, on Sunday, March 13, 1870 (en) ✔️ - 1.402s\n",
|
||||
"#66291 The Plagiarist From Rigel IV (en) ✔️ - 1.479s\n",
|
||||
"#46746 The Chronicles of Newgate, vol. 2/2 (en) ✔️ - 1.933s\n",
|
||||
"#39744 Arne; Early Tales and Sketches\n",
|
||||
"Patriots Edition (en) ✔️ - 1.917s\n",
|
||||
"#30681 Nanny Merry\n",
|
||||
"or, What Made the Difference? (en) ✔️ - 1.184s\n",
|
||||
"#68752 Lives and exploits of the most noted highwaymen, robbers and murderers of all nations\n",
|
||||
"Drawn from the most authentic sources and brought down to the present time (en) ✔️ - 1.300s\n",
|
||||
"#21364 The Rajah of Dah (en) ✔️ - 2.184s\n",
|
||||
"#9942 Lectures on Ten British Mathematicians of the Nineteenth Century (en) ❌ - 1.151s\n",
|
||||
"#69725 Elementary woodworking (en) ✔️ - 1.421s\n",
|
||||
"#51011 The Ghost Camp; or, the Avengers (en) ✔️ - 2.279s\n",
|
||||
"#15175 A Century of Wrong (en) ✔️ - 1.587s\n",
|
||||
"#11058 Jack Archer: A Tale of the Crimea (en) ✔️ - 1.530s\n",
|
||||
"#25017 A Son of the Immortals (en) ✔️ - 2.391s\n",
|
||||
"#15476 The Mahabharata of Krishna-Dwaipayana Vyasa, Volume 3\n",
|
||||
"Books 8, 9, 10, 11 and 12 (en) ❌ - 1.151s\n",
|
||||
"#39870 Bert Wilson, Marathon Winner (en) ✔️ - 3.770s\n",
|
||||
"#8066 The Bible, King James version, Book 66: Revelation (en) ❌ - 1.200s\n",
|
||||
"#28034 Scenes in the Hawaiian Islands and California (en) ✔️ - 1.033s\n",
|
||||
"#61513 The Phantom Death, etc. (en) ✔️ - 1.884s\n",
|
||||
"#6180 A Romany of the Snows, vol. 1\n",
|
||||
"Being a Continuation of the Personal Histories of \"Pierre and His People\" and the Last Existing Records of Pretty Pierre (en) ❌ - 0.718s\n",
|
||||
"#47252 Mother Earth's Children: The Frolics of the Fruits and Vegetables (en) ✔️ - 1.301s\n",
|
||||
"#48566 Barracks, Bivouacs and Battles (en) ✔️ - 3.749s\n",
|
||||
"#33425 The Crimson Sweater (en) ✔️ - 1.958s\n",
|
||||
"▶▶▶ 25 done (21 successful) out of 55128 ◀◀◀\n",
|
||||
"Done.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# NOTE: this will take really long depending on the number of ebooks selected\n",
|
||||
"for index, row in df.iterrows():\n",
|
||||
" book = row[\"Text#\"]\n",
|
||||
" if book not in crawled[\"book\"].values:\n",
|
||||
" t = time.time()\n",
|
||||
" print(f\"#{book} {row['Title']} ({row['Language']})\", end=\" \")\n",
|
||||
" if gc.save(book):\n",
|
||||
" print(\"✔️\", end=\" \")\n",
|
||||
" crawled = crawled.append({\"book\": book, \"success\": True}, ignore_index=True)\n",
|
||||
" else:\n",
|
||||
" print(\"❌\", end=\" \")\n",
|
||||
" crawled = crawled.append({\"book\": book, \"success\": False}, ignore_index=True)\n",
|
||||
" print(f\"- {(time.time() - t):.3f}s\")\n",
|
||||
" crawled.to_csv(STATUS, index=False)\n",
|
||||
" if len(crawled) % 25 == 0:\n",
|
||||
" print(f\"▶▶▶ {len(crawled)} done ({int(crawled['success'].sum()) } successful) out of {len(df)} ◀◀◀\")\n",
|
||||
"\n",
|
||||
"print(\"Done.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Add the crawled text files into parquet datasets"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"21 out of 55128 (0.04%) available.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"crawled = pd.read_csv(STATUS)\n",
|
||||
"crawled = crawled.loc[crawled[\"success\"] == True]\n",
|
||||
"crawled.rename(columns={\"book\": \"Text#\"}, inplace=True)\n",
|
||||
"\n",
|
||||
"gc = GutenbergCrawler(FOLDER)\n",
|
||||
"df = gc.catalog()\n",
|
||||
"df = df.loc[df[\"Language\"] == LANG]\n",
|
||||
"\n",
|
||||
"print(f\"{len(crawled)} out of {len(df)} ({len(crawled) / len(df) * 100.:.2f}%) available.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"21"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df.drop_duplicates(subset=[\"Text#\"], inplace=True)\n",
|
||||
"df = pd.merge(df, crawled, on=[\"Text#\"], how=\"inner\")\n",
|
||||
"assert not len(df.loc[df[\"success\"] == False])\n",
|
||||
"del crawled\n",
|
||||
"df.drop(columns=[\"Type\", \"Language\", \"success\"], inplace=True)\n",
|
||||
"df.sort_values(by=\"Text#\", ascending=True, inplace=True)\n",
|
||||
"len(df) # number of items after merging with metadata"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 19.45it/s]\n",
|
||||
"100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 24.30it/s]\n",
|
||||
"100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 33.12it/s]\n",
|
||||
"100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 15.04it/s]\n",
|
||||
"100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 27.61it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Done.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def read(file: str) -> Optional[str]:\n",
|
||||
" result = None\n",
|
||||
" if os.path.exists(file):\n",
|
||||
" with open(file, \"r\", encoding=\"utf-8\") as f:\n",
|
||||
" result = f.read()\n",
|
||||
" return result\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def strip(value: Any) -> str:\n",
|
||||
" return str(value).strip() if value and pd.notna(value) else \"\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"for chunk in range(CHUNKS):\n",
|
||||
" n = len(df) // CHUNKS\n",
|
||||
" start, end = chunk * n, (chunk + 1) * n if chunk < CHUNKS - 1 else len(df)\n",
|
||||
"\n",
|
||||
" updated = {col: [] for col in list(df.columns) + [\"Body\"]}\n",
|
||||
" books = df[\"Text#\"].values[start:end]\n",
|
||||
" for book in tqdm(books):\n",
|
||||
" text = read(os.path.join(FOLDER, f\"{book}_text.txt\"))\n",
|
||||
" text = gc.pretty(text)\n",
|
||||
" if not text:\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" df_row = df.loc[df[\"Text#\"] == book]\n",
|
||||
" updated[\"Text#\"].append(book)\n",
|
||||
" updated[\"Issued\"].append(pd.to_datetime(df_row[\"Issued\"].values[0], format=\"%Y-%m-%d\", errors=\"coerce\"))\n",
|
||||
" updated[\"Title\"].append(strip(df_row[\"Title\"].values[0]))\n",
|
||||
" updated[\"Authors\"].append(strip(df_row[\"Authors\"].values[0]))\n",
|
||||
" updated[\"Subjects\"].append(strip(df_row[\"Subjects\"].values[0]))\n",
|
||||
" updated[\"LoCC\"].append(strip(df_row[\"LoCC\"].values[0]))\n",
|
||||
" updated[\"Bookshelves\"].append(strip(df_row[\"Bookshelves\"].values[0]))\n",
|
||||
" updated[\"Body\"].append(text)\n",
|
||||
"\n",
|
||||
" updated = pd.DataFrame(updated)\n",
|
||||
" if CHUNKS == 1:\n",
|
||||
" updated.to_parquet(f\"gutenberg_{LANG}_all.pq\", index=False, engine=\"pyarrow\", compression=\"gzip\")\n",
|
||||
" else:\n",
|
||||
" updated.to_parquet(\n",
|
||||
" f\"gutenberg_{LANG}_{chunk + 1}_of_{CHUNKS}.pq\", index=False, engine=\"pyarrow\", compression=\"gzip\"\n",
|
||||
" )\n",
|
||||
" del updated\n",
|
||||
"\n",
|
||||
"print(\"Done.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
beautifulsoup4
|
||||
numpy
|
||||
pandas
|
||||
pyarrow
|
||||
requests
|
||||
Reference in New Issue
Block a user