Files
openai-transformer-lm-guten…/download_gutenberg_erotica.ipynb
T

781 lines
44 KiB
Plaintext
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-04T02:29:54.022099Z",
"start_time": "2018-11-04T02:29:54.017857Z"
}
},
"source": [
"# Fetch Project Gutenberg books from a category"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 1, get book ids\n",
"\n",
"- go to http://m.gutenberg.org/ebooks/search.mobile/?query=Erotic+%21+bsxErotic&sort_order=downloads\n",
"\n",
"- scroll to the bottom and click \"show more\" a few times\n",
"- paste the JavaScript below into the browser's JS console and run it\n",
"- the ids should now be on your clipboard; paste them into the \"ids\" list below\n",
"\n",
"\n",
"```js\n",
"// to get all book ids shown on page, paste this javascript into js console in browser when on the page above\n",
"a_elems = document.getElementsByClassName(\"table link\")\n",
"hrefs = Array.from(a_elems)\n",
" .map(e=>e.href) // get link\n",
" .filter(e=>e) // remove empty links\n",
"ids = hrefs.map(e=>/(\\d+)\\.mobile/.exec(e)) // regular expression match\n",
" .filter(e=>e) // remove ones not found\n",
" .map(e=>e[1]) // get just id\n",
"copy(ids) // copy to clipboard\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-04T08:54:51.977129Z",
"start_time": "2018-11-04T08:54:51.641106Z"
}
},
"outputs": [],
"source": [
"import requests\n",
"import os\n",
"import re\n",
"import bs4\n",
"import time\n",
"import json\n",
"from tqdm import tqdm_notebook as tqdm\n",
"\n",
"# folder that receives one .json file per downloaded text\n",
"dest_dir = \"data/corpus/erotic_gutenberg\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-04T08:54:52.021478Z",
"start_time": "2018-11-04T08:54:51.980984Z"
}
},
"outputs": [],
"source": [
"# Project Gutenberg book ids (kept as strings) collected in Step 1\n",
"ids = [\n",
"    \"30254\", \"30360\", \"28520\", \"25305\", \"14005\", \"28522\", \"31284\", \"28521\",\n",
"    \"29827\", \"52059\", \"14323\", \"13610\", \"57284\", \"13972\", \"52205\", \"54672\",\n",
"    \"13614\", \"28718\", \"44877\", \"26804\", \"45150\", \"37491\", \"43438\", \"48943\",\n",
"    \"53807\", \"26456\", \"26808\", \"13971\", \"42406\", \"43823\", \"39220\", \"56779\",\n",
"    \"26809\", \"18610\", \"44181\", \"42212\", \"26806\", \"42586\", \"47892\", \"43822\",\n",
"    \"49855\", \"26562\", \"26739\", \"26807\", \"20568\", \"40877\", \"54419\", \"53944\",\n",
"    \"40557\", \"29049\", \"25543\", \"40902\", \"41301\", \"56491\", \"28789\", \"40496\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-04T08:54:52.038785Z",
"start_time": "2018-11-04T08:54:52.024258Z"
}
},
"outputs": [],
"source": [
"# from https://github.com/motoom/gutenberg-ebook-scraping/blob/master/gutenberg.py\n",
"\n",
"# Repetitive stuff I don't want to read a 1000 times on my eBook reader.\n",
"remove = [\"Produced by\",\"End of the Project Gutenberg\",\"End of Project Gutenberg\"]\n",
"\n",
"def beautify(text):\n",
"    '''Reformat a raw Project Gutenberg etext and strip its boilerplate.\n",
"\n",
"    Title, Author and Language are read from their header lines; lines that\n",
"    follow the title up to the next blank line are joined onto it as a\n",
"    subtitle. Text between the START and END markers is re-flowed so that\n",
"    every blank-line separated paragraph becomes a single line. Paragraphs\n",
"    starting with one of the terms in `remove` are kept out of the content.\n",
"\n",
"    Args:\n",
"        text: the complete etext as one string.\n",
"\n",
"    Returns:\n",
"        dict with the keys content (paragraphs separated by blank lines),\n",
"        title, author, language and extra (list of header lines, marker\n",
"        lines and filtered paragraphs that were kept out of content).\n",
"\n",
"    A missing title, START marker or END marker is only reported via print.\n",
"    '''\n",
"    lines = [line.strip() for line in text.split('\\n')]\n",
"    collect = False\n",
"    lookforsubtitle = False\n",
"    in_start_marker = False  # True while a START marker spans several lines\n",
"    outlines = []\n",
"    startseen = endseen = False\n",
"    title = \"\"\n",
"    author = \"\"\n",
"    language = \"\"\n",
"    extra = []\n",
"    paragraph = \"\"\n",
"\n",
"    def flush(finished):\n",
"        '''Store a finished paragraph, diverting boilerplate to extra.'''\n",
"        finished = finished.strip()\n",
"        if not finished:\n",
"            return\n",
"        if any(finished.startswith(term) for term in remove):\n",
"            # keep the removed text itself, not the blank separator line\n",
"            extra.append(finished)\n",
"            return\n",
"        outlines.append(finished)\n",
"        outlines.append(\"\")\n",
"\n",
"    for line in lines:\n",
"        is_author = line.startswith(\"Author: \")\n",
"        is_language = line.startswith(\"Language: \")\n",
"        if is_author:\n",
"            author = line[8:]\n",
"        if is_language:\n",
"            language = line[10:]\n",
"        if line.startswith(\"Title: \"):\n",
"            title = line[7:]\n",
"            lookforsubtitle = True\n",
"            continue\n",
"        if lookforsubtitle:\n",
"            # a blank line or another header field ends the (sub)title\n",
"            if not line or is_author or is_language:\n",
"                lookforsubtitle = False\n",
"            else:\n",
"                title += \", \" + line.strip(\".\")\n",
"        if in_start_marker:\n",
"            # continuation of a START marker: header fluff, not book text\n",
"            extra.append(line)\n",
"            if not line or \"***\" in line:\n",
"                in_start_marker = False\n",
"            continue\n",
"        has_start = (\"*** START\" in line) or (\"***START\" in line)\n",
"        if has_start or line.startswith(\"*END THE SMALL PRINT!\"):\n",
"            collect = startseen = True\n",
"            paragraph = \"\"\n",
"            extra.append(line)\n",
"            # the closing *** is missing when the marker wraps to the next line\n",
"            in_start_marker = has_start and line.count(\"***\") < 2\n",
"            continue\n",
"        if (\"*** END\" in line) or (\"***END\" in line):\n",
"            endseen = True\n",
"            extra.append(line)\n",
"            break\n",
"        if not collect:\n",
"            extra.append(line)\n",
"            continue\n",
"        if not line:\n",
"            flush(paragraph)\n",
"            paragraph = \"\"\n",
"        else:\n",
"            paragraph += \" \" + line\n",
"\n",
"    # a last paragraph without a trailing blank line would otherwise be lost\n",
"    flush(paragraph)\n",
"\n",
"    # Report on anomalous situations, but don't make it a showstopper.\n",
"    if not title:\n",
"        print (\" Problem: No title found\\n\")\n",
"    if not startseen:\n",
"        print (\" Problem: No '*** START' seen\\n\")\n",
"    if not endseen:\n",
"        print (\" Problem: No '*** END' seen\\n\")\n",
"\n",
"    return dict(\n",
"        content='\\n'.join(outlines),\n",
"        title=title,\n",
"        author=author,\n",
"        language=language,\n",
"        extra=extra\n",
"    )\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-04T08:55:45.417570Z",
"start_time": "2018-11-04T08:54:52.040977Z"
}
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2e1c657a58ac42f9bb99cc8221138bee",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=56), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" Problem: No title found\n",
"\n",
" Problem: No '*** START' seen\n",
"\n",
" Problem: No '*** END' seen\n",
"\n",
" Problem: No title found\n",
"\n",
" Problem: No '*** START' seen\n",
"\n",
" Problem: No '*** END' seen\n",
"\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-4-88f9382543da>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# first download index\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mindex_url\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"http://www.gutenberg.org/files/{bid:}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbid\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex_url\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraise_for_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0msoup\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbs4\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mBeautifulSoup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontent\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"html5lib\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/requests/api.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(url, params, **kwargs)\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'allow_redirects'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 72\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'get'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 73\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/requests/api.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 58\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 506\u001b[0m }\n\u001b[1;32m 507\u001b[0m \u001b[0msend_kwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 508\u001b[0;31m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 509\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 510\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 616\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 617\u001b[0m \u001b[0;31m# Send the request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 618\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 619\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 620\u001b[0m \u001b[0;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 438\u001b[0m \u001b[0mdecode_content\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_retries\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 440\u001b[0;31m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 441\u001b[0m )\n\u001b[1;32m 442\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m 599\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 600\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 601\u001b[0;31m chunked=chunked)\n\u001b[0m\u001b[1;32m 602\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 603\u001b[0m \u001b[0;31m# If we're going to release the connection in ``finally:``, then\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 355\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest_chunked\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mhttplib_request_kw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 356\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 357\u001b[0;31m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mhttplib_request_kw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 358\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 359\u001b[0m \u001b[0;31m# Reset the timeout for the recv() on the socket\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/lib/python3.5/http/client.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, body, headers)\u001b[0m\n\u001b[1;32m 1105\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1106\u001b[0m \u001b[0;34m\"\"\"Send a complete request to the server.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1107\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1108\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1109\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_set_content_length\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/lib/python3.5/http/client.py\u001b[0m in \u001b[0;36m_send_request\u001b[0;34m(self, method, url, body, headers)\u001b[0m\n\u001b[1;32m 1150\u001b[0m \u001b[0;31m# default charset of iso-8859-1.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1151\u001b[0m \u001b[0mbody\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_encode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'body'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1152\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mendheaders\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1153\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1154\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mgetresponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/lib/python3.5/http/client.py\u001b[0m in \u001b[0;36mendheaders\u001b[0;34m(self, message_body)\u001b[0m\n\u001b[1;32m 1101\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1102\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mCannotSendHeader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1103\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_output\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage_body\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1104\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1105\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/lib/python3.5/http/client.py\u001b[0m in \u001b[0;36m_send_output\u001b[0;34m(self, message_body)\u001b[0m\n\u001b[1;32m 932\u001b[0m \u001b[0;32mdel\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_buffer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 933\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 934\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 935\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmessage_body\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 936\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage_body\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/lib/python3.5/http/client.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 875\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msock\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 876\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauto_open\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 877\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 878\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 879\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mNotConnected\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/urllib3/connection.py\u001b[0m in \u001b[0;36mconnect\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 165\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 166\u001b[0;31m \u001b[0mconn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_new_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 167\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_prepare_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/urllib3/connection.py\u001b[0m in \u001b[0;36m_new_conn\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 139\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 140\u001b[0m conn = connection.create_connection(\n\u001b[0;32m--> 141\u001b[0;31m (self.host, self.port), self.timeout, **extra_kw)\n\u001b[0m\u001b[1;32m 142\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 143\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mSocketTimeout\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/urllib3/util/connection.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address, socket_options)\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msource_address\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource_address\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 73\u001b[0;31m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msa\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 74\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"# make sure the target folder exists before any file is written into it\n",
"os.makedirs(dest_dir, exist_ok=True)\n",
"\n",
"for bid in tqdm(ids):\n",
"\n",
"    # first download index\n",
"    index_url = \"http://www.gutenberg.org/files/{bid:}\".format(bid=bid)\n",
"    # a timeout keeps one dead connection from hanging the whole run\n",
"    r = requests.get(index_url, timeout=30)\n",
"    r.raise_for_status()\n",
"    soup = bs4.BeautifulSoup(r.content, \"html5lib\")\n",
"    # href=True skips anchors without an href (they would raise KeyError)\n",
"    hrefs = [e.attrs['href'] for e in soup.findAll('a', href=True)]\n",
"    links = [h for h in hrefs if h.endswith('.txt')]\n",
"\n",
"    # download text\n",
"    for link in links:\n",
"        txt_url = index_url + '/' + link\n",
"        outfile = os.path.join(dest_dir, link.replace('.txt', '.json'))\n",
"        # files that already exist are not fetched again\n",
"        if not os.path.isfile(outfile):\n",
"            r = requests.get(txt_url, timeout=30)\n",
"            r.raise_for_status()\n",
"            info = beautify(r.text)\n",
"            # keep only English texts that actually yielded some content\n",
"            if (info['language'] == 'English') and info['content'].strip():\n",
"                with open(outfile, 'w') as f:\n",
"                    json.dump(info, f)\n",
"\n",
"            time.sleep(0.5) # avoid ddos/ban"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-04T02:37:32.490814Z",
"start_time": "2018-11-04T02:37:32.485221Z"
}
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 2, turn into csv, like rocstories"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-04T08:55:48.451860Z",
"start_time": "2018-11-04T08:55:46.818680Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>author</th>\n",
" <th>content</th>\n",
" <th>extra</th>\n",
" <th>language</th>\n",
" <th>title</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Albert Mordell</td>\n",
" <td>THE EROTIC MOTIVE IN LITERATURE\\n\\nTHE EROTIC ...</td>\n",
" <td>[Project Gutenberg's The Erotic Motive in Lit...</td>\n",
" <td>English</td>\n",
" <td>The Erotic Motive in Literature</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Anonymous</td>\n",
" <td>[Transcriber's note: Anonymous, _Laura Middlet...</td>\n",
" <td>[Project Gutenberg's Laura Middleton; Her Brot...</td>\n",
" <td>English</td>\n",
" <td>Laura Middleton; Her Brother and her Lover</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>J. K. Huysmans</td>\n",
" <td>LÀ-BAS\\n\\n(DOWN THERE)\\n\\nby J.K. HUYSMANS\\n\\n...</td>\n",
" <td>[The Project Gutenberg EBook of Là-bas, by J. ...</td>\n",
" <td>English</td>\n",
" <td>Là-bas</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>John Cleland</td>\n",
" <td>MEMOIRS OF FANNY HILL\\n\\nBy John Cleland\\n\\n_A...</td>\n",
" <td>[The Project Gutenberg EBook of Memoirs Of Fan...</td>\n",
" <td>English</td>\n",
" <td>Memoirs Of Fanny Hill, A New and Genuine Editi...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Havelock Ellis</td>\n",
" <td>VOLUME 1 (OF 6)***\\n\\nE-text prepared by Julie...</td>\n",
" <td>[The Project Gutenberg eBook, Studies in the P...</td>\n",
" <td>English</td>\n",
" <td>Studies in the Psychology of Sex, Volume 1 (of 6)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Anonymous</td>\n",
" <td>[Transcriber's note: Anonymous, _Forbidden fru...</td>\n",
" <td>[The Project Gutenberg EBook of Forbidden Frui...</td>\n",
" <td>English</td>\n",
" <td>Forbidden Fruit, Luscious and exciting story a...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Kate Percival</td>\n",
" <td>The Life and Amours\\n\\nOF THE\\n\\nBeautiful, Ga...</td>\n",
" <td>[The Project Gutenberg EBook of The Life and A...</td>\n",
" <td>English</td>\n",
" <td>The Life and Amours of the Beautiful, Gay and ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Various</td>\n",
" <td>[Transcriber's Note: The following was proofre...</td>\n",
" <td>[The Project Gutenberg EBook of The Fifteen Co...</td>\n",
" <td>English</td>\n",
" <td>The Fifteen Comforts of Matrimony: Responses F...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Anonymous</td>\n",
" <td>[Transcriber's note: Anonymous, _The power of ...</td>\n",
" <td>[The Project Gutenberg EBook of The Power of M...</td>\n",
" <td>English</td>\n",
" <td>The Power of Mesmerism, A Highly Erotic Narrat...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Denis Diderot</td>\n",
" <td>_Les Bijoux Indiscrets._\\n\\nOR,\\n\\nThe Indiscr...</td>\n",
" <td>[The Project Gutenberg EBook of Les Bijoux Ind...</td>\n",
" <td>English</td>\n",
" <td>Les Bijoux Indiscrets, or, The Indiscreet Toys</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Anonymous</td>\n",
" <td>THE LADIES DELIGHT.\\n\\nCONTAINING,\\n\\nI. An Ad...</td>\n",
" <td>[The Project Gutenberg EBook of The Ladies Del...</td>\n",
" <td>English</td>\n",
" <td>The Ladies Delight</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Havelock Ellis</td>\n",
" <td>VOLUME 5 (OF 6)***\\n\\nE-text prepared by Julie...</td>\n",
" <td>[The Project Gutenberg eBook, Studies in the P...</td>\n",
" <td>English</td>\n",
" <td>Studies in the Psychology of Sex, Volume 5 (of 6)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>J. K. Huysmans</td>\n",
" <td>LA-BAS\\n\\n(DOWN THERE)\\n\\nby J.K. HUYSMANS\\n\\n...</td>\n",
" <td>[The Project Gutenberg EBook of La-bas, by J. ...</td>\n",
" <td>English</td>\n",
" <td>La-bas</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Havelock Ellis</td>\n",
" <td>VOLUME 5 (OF 6)***\\n\\nE-text prepared by Julie...</td>\n",
" <td>[The Project Gutenberg eBook, Studies in the P...</td>\n",
" <td>English</td>\n",
" <td>Studies in the Psychology of Sex, Volume 5 (of 6)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Friedrich Karl Forberg</td>\n",
" <td>MANUAL\\n\\nOF\\n\\nClassical Erotology\\n\\n(De fig...</td>\n",
" <td>[The Project Gutenberg EBook of Manual of Cla...</td>\n",
" <td>English</td>\n",
" <td>Manual of Classical Erotology (De figuris Vene...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Anonymous</td>\n",
" <td>[Transcriber's note: Anonymous, _The power of ...</td>\n",
" <td>[The Project Gutenberg EBook of The Power of M...</td>\n",
" <td>English</td>\n",
" <td>The Power of Mesmerism, A Highly Erotic Narrat...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Georg Brandes</td>\n",
" <td>MAIN CURRENTS IN NINETEEN CENTURY LITERATURE\\n...</td>\n",
" <td>[The Project Gutenberg EBook of Main Currents...</td>\n",
" <td>English</td>\n",
" <td>Main Currents in Nineteenth Century Literature...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Anonymous</td>\n",
" <td>The Romance of Lust\\n\\n(1873)\\n\\nA classic Vic...</td>\n",
" <td>[, The Project Gutenberg EBook of The Romance ...</td>\n",
" <td>English</td>\n",
" <td>The Romance of Lust A classic Victorian erotic...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>L. Brovan</td>\n",
" <td>Two Hundred and fifty Copies of this Work have...</td>\n",
" <td>[The Project Gutenberg EBook of Anthologica R...</td>\n",
" <td>English</td>\n",
" <td>Anthologica Rarissima: The Way of a Virgin</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>Various</td>\n",
" <td>[Transcriber's Note: The following was proofre...</td>\n",
" <td>[The Project Gutenberg EBook of The Fifteen Co...</td>\n",
" <td>English</td>\n",
" <td>The Fifteen Comforts of Matrimony: Responses f...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>Kate Percival</td>\n",
" <td>The Life and Amours\\n\\nOF THE\\n\\nBeautiful, Ga...</td>\n",
" <td>[The Project Gutenberg EBook of The Life and A...</td>\n",
" <td>English</td>\n",
" <td>The Life and Amours of the Beautiful, Gay and ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>Havelock Ellis</td>\n",
" <td>VOLUME 1 (OF 6)***\\n\\nE-text prepared by Julie...</td>\n",
" <td>[The Project Gutenberg eBook, Studies in the P...</td>\n",
" <td>English</td>\n",
" <td>Studies in the Psychology of Sex, Volume 1 (of 6)</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" author content \\\n",
"0 Albert Mordell THE EROTIC MOTIVE IN LITERATURE\\n\\nTHE EROTIC ... \n",
"1 Anonymous [Transcriber's note: Anonymous, _Laura Middlet... \n",
"2 J. K. Huysmans LÀ-BAS\\n\\n(DOWN THERE)\\n\\nby J.K. HUYSMANS\\n\\n... \n",
"3 John Cleland MEMOIRS OF FANNY HILL\\n\\nBy John Cleland\\n\\n_A... \n",
"4 Havelock Ellis VOLUME 1 (OF 6)***\\n\\nE-text prepared by Julie... \n",
"5 Anonymous [Transcriber's note: Anonymous, _Forbidden fru... \n",
"6 Kate Percival The Life and Amours\\n\\nOF THE\\n\\nBeautiful, Ga... \n",
"7 Various [Transcriber's Note: The following was proofre... \n",
"8 Anonymous [Transcriber's note: Anonymous, _The power of ... \n",
"9 Denis Diderot _Les Bijoux Indiscrets._\\n\\nOR,\\n\\nThe Indiscr... \n",
"10 Anonymous THE LADIES DELIGHT.\\n\\nCONTAINING,\\n\\nI. An Ad... \n",
"11 Havelock Ellis VOLUME 5 (OF 6)***\\n\\nE-text prepared by Julie... \n",
"12 J. K. Huysmans LA-BAS\\n\\n(DOWN THERE)\\n\\nby J.K. HUYSMANS\\n\\n... \n",
"13 Havelock Ellis VOLUME 5 (OF 6)***\\n\\nE-text prepared by Julie... \n",
"14 Friedrich Karl Forberg MANUAL\\n\\nOF\\n\\nClassical Erotology\\n\\n(De fig... \n",
"15 Anonymous [Transcriber's note: Anonymous, _The power of ... \n",
"16 Georg Brandes MAIN CURRENTS IN NINETEEN CENTURY LITERATURE\\n... \n",
"17 Anonymous The Romance of Lust\\n\\n(1873)\\n\\nA classic Vic... \n",
"18 L. Brovan Two Hundred and fifty Copies of this Work have... \n",
"19 Various [Transcriber's Note: The following was proofre... \n",
"20 Kate Percival The Life and Amours\\n\\nOF THE\\n\\nBeautiful, Ga... \n",
"21 Havelock Ellis VOLUME 1 (OF 6)***\\n\\nE-text prepared by Julie... \n",
"\n",
" extra language \\\n",
"0 [Project Gutenberg's The Erotic Motive in Lit... English \n",
"1 [Project Gutenberg's Laura Middleton; Her Brot... English \n",
"2 [The Project Gutenberg EBook of Là-bas, by J. ... English \n",
"3 [The Project Gutenberg EBook of Memoirs Of Fan... English \n",
"4 [The Project Gutenberg eBook, Studies in the P... English \n",
"5 [The Project Gutenberg EBook of Forbidden Frui... English \n",
"6 [The Project Gutenberg EBook of The Life and A... English \n",
"7 [The Project Gutenberg EBook of The Fifteen Co... English \n",
"8 [The Project Gutenberg EBook of The Power of M... English \n",
"9 [The Project Gutenberg EBook of Les Bijoux Ind... English \n",
"10 [The Project Gutenberg EBook of The Ladies Del... English \n",
"11 [The Project Gutenberg eBook, Studies in the P... English \n",
"12 [The Project Gutenberg EBook of La-bas, by J. ... English \n",
"13 [The Project Gutenberg eBook, Studies in the P... English \n",
"14 [The Project Gutenberg EBook of Manual of Cla... English \n",
"15 [The Project Gutenberg EBook of The Power of M... English \n",
"16 [The Project Gutenberg EBook of Main Currents... English \n",
"17 [, The Project Gutenberg EBook of The Romance ... English \n",
"18 [The Project Gutenberg EBook of Anthologica R... English \n",
"19 [The Project Gutenberg EBook of The Fifteen Co... English \n",
"20 [The Project Gutenberg EBook of The Life and A... English \n",
"21 [The Project Gutenberg eBook, Studies in the P... English \n",
"\n",
" title \n",
"0 The Erotic Motive in Literature \n",
"1 Laura Middleton; Her Brother and her Lover \n",
"2 Là-bas \n",
"3 Memoirs Of Fanny Hill, A New and Genuine Editi... \n",
"4 Studies in the Psychology of Sex, Volume 1 (of 6) \n",
"5 Forbidden Fruit, Luscious and exciting story a... \n",
"6 The Life and Amours of the Beautiful, Gay and ... \n",
"7 The Fifteen Comforts of Matrimony: Responses F... \n",
"8 The Power of Mesmerism, A Highly Erotic Narrat... \n",
"9 Les Bijoux Indiscrets, or, The Indiscreet Toys \n",
"10 The Ladies Delight \n",
"11 Studies in the Psychology of Sex, Volume 5 (of 6) \n",
"12 La-bas \n",
"13 Studies in the Psychology of Sex, Volume 5 (of 6) \n",
"14 Manual of Classical Erotology (De figuris Vene... \n",
"15 The Power of Mesmerism, A Highly Erotic Narrat... \n",
"16 Main Currents in Nineteenth Century Literature... \n",
"17 The Romance of Lust A classic Victorian erotic... \n",
"18 Anthologica Rarissima: The Way of a Virgin \n",
"19 The Fifteen Comforts of Matrimony: Responses f... \n",
"20 The Life and Amours of the Beautiful, Gay and ... \n",
"21 Studies in the Psychology of Sex, Volume 1 (of 6) "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import uuid\n",
"import pandas as pd\n",
"import nltk\n",
"# nltk.download('punkt')\n",
"\n",
"dest_dir = 'data/corpus/erotic_gutenberg'\n",
"max_len = 400\n",
"num_sent = 6\n",
"data=[]\n",
"for infile in os.listdir(dest_dir):\n",
" path = os.path.join(dest_dir, infile)\n",
" info = json.load(open(path))\n",
" data.append(info)\n",
"df = pd.DataFrame(data)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-04T08:55:59.624853Z",
"start_time": "2018-11-04T08:55:59.621367Z"
}
},
"outputs": [],
"source": [
"# # Test: concat small sentances\n",
"# # And split large ones\n",
"# sent = []\n",
"# for s in sentances:\n",
"# if len(s)>10:\n",
"# sent.append(s)\n",
"# else:\n",
"# sent[-1]+=' '+s\n",
"# sent"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-04T08:56:56.969876Z",
"start_time": "2018-11-04T08:56:56.965223Z"
}
},
"outputs": [],
"source": [
"df = df.rename(columns=dict(content='TEXT'))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-04T08:56:58.359405Z",
"start_time": "2018-11-04T08:56:58.068470Z"
}
},
"outputs": [],
"source": [
"df.to_csv('data/erotic_gutenberg_dataset.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "jupyter3",
"language": "python",
"name": "jupyter3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
},
"toc": {
"colors": {
"hover_highlight": "#DAA520",
"navigate_num": "#000000",
"navigate_text": "#333333",
"running_highlight": "#FF0000",
"selected_highlight": "#FFD700",
"sidebar_border": "#EEEEEE",
"wrapper_background": "#FFFFFF"
},
"moveMenuLeft": true,
"nav_menu": {
"height": "91px",
"width": "251px"
},
"navigate_menu": true,
"number_sections": true,
"sideBar": true,
"threshold": 4,
"toc_cell": false,
"toc_position": {
"height": "553px",
"left": "0px",
"right": "1064px",
"top": "149px",
"width": "312px"
},
"toc_section_display": "block",
"toc_window_display": true,
"widenNotebook": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}