mirror of
https://github.com/wassname/openai-transformer-lm-gutenberg-erotic.git
synced 2026-06-26 16:00:39 +08:00
781 lines
44 KiB
Plaintext
781 lines
44 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2018-11-04T02:29:54.022099Z",
|
||
"start_time": "2018-11-04T02:29:54.017857Z"
|
||
}
|
||
},
|
||
"source": [
|
||
"# Fetch Gutenberg books from a category"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Step 1, get book ids\n",
|
||
"\n",
|
||
"- go to http://m.gutenberg.org/ebooks/search.mobile/?query=Erotic+%21+bsxErotic&sort_order=downloads\n",
|
||
"\n",
|
||
"- scroll to the bottom and click \"show more\" a few times\n",
|
||
"- enter the JavaScript below in the browser's JS console\n",
|
||
"- it should have copied the ids to your clipboard; you can paste them into \"ids\" below\n",
|
||
"\n",
|
||
"\n",
|
||
"```js\n",
|
||
"// to get all book ids shown on the page, paste this JavaScript into the browser's JS console while on the page above\n",
|
||
"a_elems = document.getElementsByClassName(\"table link\")\n",
|
||
"hrefs = Array.from(a_elems)\n",
|
||
" .map(e=>e.href) // get link\n",
|
||
" .filter(e=>e) // remove empty links\n",
|
||
"ids = hrefs.map(e=>/(\\d+)\\.mobile/.exec(e)) // regular expression match\n",
|
||
" .filter(e=>e) // remove ones not found\n",
|
||
" .map(e=>e[1]) // get just id\n",
|
||
"copy(ids) // copy to clipboard\n",
|
||
"```"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2018-11-04T08:54:51.977129Z",
|
||
"start_time": "2018-11-04T08:54:51.641106Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import requests\n",
|
||
"import os\n",
|
||
"import re\n",
|
||
"import bs4\n",
|
||
"import time\n",
|
||
"import json\n",
|
||
"from tqdm import tqdm_notebook as tqdm\n",
|
||
"\n",
|
||
"dest_dir = 'data/corpus/erotic_gutenberg'"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2018-11-04T08:54:52.021478Z",
|
||
"start_time": "2018-11-04T08:54:51.980984Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Project Gutenberg book ids of the texts to download\n",
|
||
"ids = [\n",
|
||
" \"30254\",\n",
|
||
" \"30360\",\n",
|
||
" \"28520\",\n",
|
||
" \"25305\",\n",
|
||
" \"14005\",\n",
|
||
" \"28522\",\n",
|
||
" \"31284\",\n",
|
||
" \"28521\",\n",
|
||
" \"29827\",\n",
|
||
" \"52059\",\n",
|
||
" \"14323\",\n",
|
||
" \"13610\",\n",
|
||
" \"57284\",\n",
|
||
" \"13972\",\n",
|
||
" \"52205\",\n",
|
||
" \"54672\",\n",
|
||
" \"13614\",\n",
|
||
" \"28718\",\n",
|
||
" \"44877\",\n",
|
||
" \"26804\",\n",
|
||
" \"45150\",\n",
|
||
" \"37491\",\n",
|
||
" \"43438\",\n",
|
||
" \"48943\",\n",
|
||
" \"53807\",\n",
|
||
" \"26456\",\n",
|
||
" \"26808\",\n",
|
||
" \"13971\",\n",
|
||
" \"42406\",\n",
|
||
" \"43823\",\n",
|
||
" \"39220\",\n",
|
||
" \"56779\",\n",
|
||
" \"26809\",\n",
|
||
" \"18610\",\n",
|
||
" \"44181\",\n",
|
||
" \"42212\",\n",
|
||
" \"26806\",\n",
|
||
" \"42586\",\n",
|
||
" \"47892\",\n",
|
||
" \"43822\",\n",
|
||
" \"49855\",\n",
|
||
" \"26562\",\n",
|
||
" \"26739\",\n",
|
||
" \"26807\",\n",
|
||
" \"20568\",\n",
|
||
" \"40877\",\n",
|
||
" \"54419\",\n",
|
||
" \"53944\",\n",
|
||
" \"40557\",\n",
|
||
" \"29049\",\n",
|
||
" \"25543\",\n",
|
||
" \"40902\",\n",
|
||
" \"41301\",\n",
|
||
" \"56491\",\n",
|
||
" \"28789\",\n",
|
||
" \"40496\"\n",
|
||
"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2018-11-04T08:54:52.038785Z",
|
||
"start_time": "2018-11-04T08:54:52.024258Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# from https://github.com/motoom/gutenberg-ebook-scraping/blob/master/gutenberg.py\n",
|
||
"\n",
|
||
"# Repetitive stuff I don't want to read a 1000 times on my eBook reader.\n",
|
||
"remove = [\"Produced by\",\"End of the Project Gutenberg\",\"End of Project Gutenberg\"]\n",
|
||
"\n",
|
||
"def beautify(text):\n",
|
||
" ''' Reads a raw Project Gutenberg etext, reformat paragraphs,\n",
|
||
" and removes fluff. Determines the title of the book'''\n",
|
||
" lines = [line.strip() for line in text.split('\\n')]\n",
|
||
" collect = False\n",
|
||
" lookforsubtitle = False\n",
|
||
" outlines = []\n",
|
||
" startseen = endseen = False\n",
|
||
" title=\"\"\n",
|
||
" author=\"\"\n",
|
||
" language=\"\"\n",
|
||
" extra=[]\n",
|
||
" for line in lines:\n",
|
||
" if line.startswith(\"Author: \"):\n",
|
||
" author = line[8:]\n",
|
||
" if line.startswith(\"Language: \"):\n",
|
||
" language = line[10:]\n",
|
||
" if line.startswith(\"Title: \"):\n",
|
||
" title = line[7:]\n",
|
||
" lookforsubtitle = True\n",
|
||
" continue\n",
|
||
" if lookforsubtitle:\n",
|
||
" if not line.strip():\n",
|
||
" lookforsubtitle = False\n",
|
||
" else:\n",
|
||
" subtitle = line.strip()\n",
|
||
" subtitle = subtitle.strip(\".\")\n",
|
||
" title += \", \" + subtitle\n",
|
||
" if (\"*** START\" in line) or (\"***START\" in line) or (line.startswith(\"*END THE SMALL PRINT!\")):\n",
|
||
" collect = startseen = True\n",
|
||
" paragraph = \"\"\n",
|
||
" extra.append(line)\n",
|
||
" continue\n",
|
||
" if (\"*** END\" in line) or (\"***END\" in line):\n",
|
||
" endseen = True\n",
|
||
" extra.append(line)\n",
|
||
" break\n",
|
||
" if not collect:\n",
|
||
" extra.append(line)\n",
|
||
" continue\n",
|
||
" if not line:\n",
|
||
" paragraph = paragraph.strip()\n",
|
||
" for term in remove:\n",
|
||
" if paragraph.startswith(term):\n",
|
||
" extra.append(line)\n",
|
||
" paragraph = \"\"\n",
|
||
" break\n",
|
||
" if paragraph:\n",
|
||
" outlines.append(paragraph)\n",
|
||
" outlines.append(\"\")\n",
|
||
" paragraph = \"\"\n",
|
||
" else:\n",
|
||
" paragraph += \" \" + line\n",
|
||
"\n",
|
||
" # Report on anomalous situations, but don't make it a showstopper.\n",
|
||
" if not title:\n",
|
||
"# print (ofn)\n",
|
||
" print (\" Problem: No title found\\n\")\n",
|
||
" if not startseen:\n",
|
||
"# print (ofn)\n",
|
||
" print (\" Problem: No '*** START' seen\\n\")\n",
|
||
" if not endseen:\n",
|
||
"# print (ofn)\n",
|
||
" print (\" Problem: No '*** END' seen\\n\")\n",
|
||
" \n",
|
||
" return dict(\n",
|
||
" content='\\n'.join(outlines),\n",
|
||
" title=title,\n",
|
||
" author=author,\n",
|
||
" language=language,\n",
|
||
" extra=extra\n",
|
||
" )\n",
|
||
" \n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2018-11-04T08:55:45.417570Z",
|
||
"start_time": "2018-11-04T08:54:52.040977Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "2e1c657a58ac42f9bb99cc8221138bee",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
"HBox(children=(IntProgress(value=0, max=56), HTML(value='')))"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" Problem: No title found\n",
|
||
"\n",
|
||
" Problem: No '*** START' seen\n",
|
||
"\n",
|
||
" Problem: No '*** END' seen\n",
|
||
"\n",
|
||
" Problem: No title found\n",
|
||
"\n",
|
||
" Problem: No '*** START' seen\n",
|
||
"\n",
|
||
" Problem: No '*** END' seen\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"ename": "KeyboardInterrupt",
|
||
"evalue": "",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
||
"\u001b[0;32m<ipython-input-4-88f9382543da>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# first download index\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mindex_url\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"http://www.gutenberg.org/files/{bid:}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbid\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex_url\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraise_for_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0msoup\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbs4\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mBeautifulSoup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontent\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"html5lib\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/requests/api.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(url, params, **kwargs)\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'allow_redirects'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 72\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'get'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 73\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/requests/api.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 58\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 506\u001b[0m }\n\u001b[1;32m 507\u001b[0m \u001b[0msend_kwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 508\u001b[0;31m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 509\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 510\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 616\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 617\u001b[0m \u001b[0;31m# Send the request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 618\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 619\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 620\u001b[0m \u001b[0;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 438\u001b[0m \u001b[0mdecode_content\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_retries\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 440\u001b[0;31m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 441\u001b[0m )\n\u001b[1;32m 442\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m 599\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 600\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 601\u001b[0;31m chunked=chunked)\n\u001b[0m\u001b[1;32m 602\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 603\u001b[0m \u001b[0;31m# If we're going to release the connection in ``finally:``, then\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 355\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest_chunked\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mhttplib_request_kw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 356\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 357\u001b[0;31m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mhttplib_request_kw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 358\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 359\u001b[0m \u001b[0;31m# Reset the timeout for the recv() on the socket\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;32m~/.pyenv/versions/3.5.3/lib/python3.5/http/client.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, body, headers)\u001b[0m\n\u001b[1;32m 1105\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1106\u001b[0m \u001b[0;34m\"\"\"Send a complete request to the server.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1107\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1108\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1109\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_set_content_length\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;32m~/.pyenv/versions/3.5.3/lib/python3.5/http/client.py\u001b[0m in \u001b[0;36m_send_request\u001b[0;34m(self, method, url, body, headers)\u001b[0m\n\u001b[1;32m 1150\u001b[0m \u001b[0;31m# default charset of iso-8859-1.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1151\u001b[0m \u001b[0mbody\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_encode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'body'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1152\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mendheaders\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1153\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1154\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mgetresponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;32m~/.pyenv/versions/3.5.3/lib/python3.5/http/client.py\u001b[0m in \u001b[0;36mendheaders\u001b[0;34m(self, message_body)\u001b[0m\n\u001b[1;32m 1101\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1102\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mCannotSendHeader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1103\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_output\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage_body\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1104\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1105\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;32m~/.pyenv/versions/3.5.3/lib/python3.5/http/client.py\u001b[0m in \u001b[0;36m_send_output\u001b[0;34m(self, message_body)\u001b[0m\n\u001b[1;32m 932\u001b[0m \u001b[0;32mdel\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_buffer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 933\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 934\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 935\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmessage_body\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 936\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage_body\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;32m~/.pyenv/versions/3.5.3/lib/python3.5/http/client.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 875\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msock\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 876\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauto_open\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 877\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 878\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 879\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mNotConnected\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/urllib3/connection.py\u001b[0m in \u001b[0;36mconnect\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 165\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 166\u001b[0;31m \u001b[0mconn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_new_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 167\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_prepare_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/urllib3/connection.py\u001b[0m in \u001b[0;36m_new_conn\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 139\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 140\u001b[0m conn = connection.create_connection(\n\u001b[0;32m--> 141\u001b[0;31m (self.host, self.port), self.timeout, **extra_kw)\n\u001b[0m\u001b[1;32m 142\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 143\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mSocketTimeout\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/urllib3/util/connection.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address, socket_options)\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msource_address\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource_address\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 73\u001b[0;31m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msa\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 74\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"for bid in tqdm(ids):\n",
|
||
" \n",
|
||
" # first download index\n",
|
||
" index_url = \"http://www.gutenberg.org/files/{bid:}\".format(bid=bid)\n",
|
||
" r = requests.get(index_url)\n",
|
||
" r.raise_for_status()\n",
|
||
" soup = bs4.BeautifulSoup(r.content, \"html5lib\")\n",
|
||
" hrefs = [e.attrs['href'] for e in soup.findAll('a')]\n",
|
||
" links = [h for h in hrefs if h.endswith('.txt')]\n",
|
||
" \n",
|
||
" # download text\n",
|
||
" for link in links:\n",
|
||
" txt_url = index_url + '/' + link\n",
|
||
" outfile = os.path.join(dest_dir, link.replace('.txt', '.json'))\n",
|
||
" if not os.path.isfile(outfile):\n",
|
||
" r = requests.get(txt_url)\n",
|
||
" r.raise_for_status()\n",
|
||
" info = beautify(r.text)\n",
|
||
" if (info['language'] == 'English') and len(info['language']):\n",
|
||
" # TODO some are empty, check before saving\n",
|
||
" json.dump(info, open(outfile, 'w'))\n",
|
||
" \n",
|
||
" time.sleep(0.5) # avoid ddos/ban"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2018-11-04T02:37:32.490814Z",
|
||
"start_time": "2018-11-04T02:37:32.485221Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# 2. Turn into CSV, like ROCStories"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2018-11-04T08:55:48.451860Z",
|
||
"start_time": "2018-11-04T08:55:46.818680Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>author</th>\n",
|
||
" <th>content</th>\n",
|
||
" <th>extra</th>\n",
|
||
" <th>language</th>\n",
|
||
" <th>title</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>Albert Mordell</td>\n",
|
||
" <td>THE EROTIC MOTIVE IN LITERATURE\\n\\nTHE EROTIC ...</td>\n",
|
||
" <td>[Project Gutenberg's The Erotic Motive in Lit...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>The Erotic Motive in Literature</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>Anonymous</td>\n",
|
||
" <td>[Transcriber's note: Anonymous, _Laura Middlet...</td>\n",
|
||
" <td>[Project Gutenberg's Laura Middleton; Her Brot...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>Laura Middleton; Her Brother and her Lover</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>J. K. Huysmans</td>\n",
|
||
" <td>LÀ-BAS\\n\\n(DOWN THERE)\\n\\nby J.K. HUYSMANS\\n\\n...</td>\n",
|
||
" <td>[The Project Gutenberg EBook of Là-bas, by J. ...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>Là-bas</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>John Cleland</td>\n",
|
||
" <td>MEMOIRS OF FANNY HILL\\n\\nBy John Cleland\\n\\n_A...</td>\n",
|
||
" <td>[The Project Gutenberg EBook of Memoirs Of Fan...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>Memoirs Of Fanny Hill, A New and Genuine Editi...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Havelock Ellis</td>\n",
|
||
" <td>VOLUME 1 (OF 6)***\\n\\nE-text prepared by Julie...</td>\n",
|
||
" <td>[The Project Gutenberg eBook, Studies in the P...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>Studies in the Psychology of Sex, Volume 1 (of 6)</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Anonymous</td>\n",
|
||
" <td>[Transcriber's note: Anonymous, _Forbidden fru...</td>\n",
|
||
" <td>[The Project Gutenberg EBook of Forbidden Frui...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>Forbidden Fruit, Luscious and exciting story a...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>Kate Percival</td>\n",
|
||
" <td>The Life and Amours\\n\\nOF THE\\n\\nBeautiful, Ga...</td>\n",
|
||
" <td>[The Project Gutenberg EBook of The Life and A...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>The Life and Amours of the Beautiful, Gay and ...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>Various</td>\n",
|
||
" <td>[Transcriber's Note: The following was proofre...</td>\n",
|
||
" <td>[The Project Gutenberg EBook of The Fifteen Co...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>The Fifteen Comforts of Matrimony: Responses F...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>Anonymous</td>\n",
|
||
" <td>[Transcriber's note: Anonymous, _The power of ...</td>\n",
|
||
" <td>[The Project Gutenberg EBook of The Power of M...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>The Power of Mesmerism, A Highly Erotic Narrat...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>Denis Diderot</td>\n",
|
||
" <td>_Les Bijoux Indiscrets._\\n\\nOR,\\n\\nThe Indiscr...</td>\n",
|
||
" <td>[The Project Gutenberg EBook of Les Bijoux Ind...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>Les Bijoux Indiscrets, or, The Indiscreet Toys</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>Anonymous</td>\n",
|
||
" <td>THE LADIES DELIGHT.\\n\\nCONTAINING,\\n\\nI. An Ad...</td>\n",
|
||
" <td>[The Project Gutenberg EBook of The Ladies Del...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>The Ladies Delight</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>Havelock Ellis</td>\n",
|
||
" <td>VOLUME 5 (OF 6)***\\n\\nE-text prepared by Julie...</td>\n",
|
||
" <td>[The Project Gutenberg eBook, Studies in the P...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>Studies in the Psychology of Sex, Volume 5 (of 6)</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>J. K. Huysmans</td>\n",
|
||
" <td>LA-BAS\\n\\n(DOWN THERE)\\n\\nby J.K. HUYSMANS\\n\\n...</td>\n",
|
||
" <td>[The Project Gutenberg EBook of La-bas, by J. ...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>La-bas</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>Havelock Ellis</td>\n",
|
||
" <td>VOLUME 5 (OF 6)***\\n\\nE-text prepared by Julie...</td>\n",
|
||
" <td>[The Project Gutenberg eBook, Studies in the P...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>Studies in the Psychology of Sex, Volume 5 (of 6)</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>Friedrich Karl Forberg</td>\n",
|
||
" <td>MANUAL\\n\\nOF\\n\\nClassical Erotology\\n\\n(De fig...</td>\n",
|
||
" <td>[The Project Gutenberg EBook of Manual of Cla...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>Manual of Classical Erotology (De figuris Vene...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>Anonymous</td>\n",
|
||
" <td>[Transcriber's note: Anonymous, _The power of ...</td>\n",
|
||
" <td>[The Project Gutenberg EBook of The Power of M...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>The Power of Mesmerism, A Highly Erotic Narrat...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>Georg Brandes</td>\n",
|
||
" <td>MAIN CURRENTS IN NINETEEN CENTURY LITERATURE\\n...</td>\n",
|
||
" <td>[The Project Gutenberg EBook of Main Currents...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>Main Currents in Nineteenth Century Literature...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>Anonymous</td>\n",
|
||
" <td>The Romance of Lust\\n\\n(1873)\\n\\nA classic Vic...</td>\n",
|
||
" <td>[, The Project Gutenberg EBook of The Romance ...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>The Romance of Lust A classic Victorian erotic...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>L. Brovan</td>\n",
|
||
" <td>Two Hundred and fifty Copies of this Work have...</td>\n",
|
||
" <td>[The Project Gutenberg EBook of Anthologica R...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>Anthologica Rarissima: The Way of a Virgin</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>Various</td>\n",
|
||
" <td>[Transcriber's Note: The following was proofre...</td>\n",
|
||
" <td>[The Project Gutenberg EBook of The Fifteen Co...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>The Fifteen Comforts of Matrimony: Responses f...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>Kate Percival</td>\n",
|
||
" <td>The Life and Amours\\n\\nOF THE\\n\\nBeautiful, Ga...</td>\n",
|
||
" <td>[The Project Gutenberg EBook of The Life and A...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>The Life and Amours of the Beautiful, Gay and ...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>21</th>\n",
|
||
" <td>Havelock Ellis</td>\n",
|
||
" <td>VOLUME 1 (OF 6)***\\n\\nE-text prepared by Julie...</td>\n",
|
||
" <td>[The Project Gutenberg eBook, Studies in the P...</td>\n",
|
||
" <td>English</td>\n",
|
||
" <td>Studies in the Psychology of Sex, Volume 1 (of 6)</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" author content \\\n",
|
||
"0 Albert Mordell THE EROTIC MOTIVE IN LITERATURE\\n\\nTHE EROTIC ... \n",
|
||
"1 Anonymous [Transcriber's note: Anonymous, _Laura Middlet... \n",
|
||
"2 J. K. Huysmans LÀ-BAS\\n\\n(DOWN THERE)\\n\\nby J.K. HUYSMANS\\n\\n... \n",
|
||
"3 John Cleland MEMOIRS OF FANNY HILL\\n\\nBy John Cleland\\n\\n_A... \n",
|
||
"4 Havelock Ellis VOLUME 1 (OF 6)***\\n\\nE-text prepared by Julie... \n",
|
||
"5 Anonymous [Transcriber's note: Anonymous, _Forbidden fru... \n",
|
||
"6 Kate Percival The Life and Amours\\n\\nOF THE\\n\\nBeautiful, Ga... \n",
|
||
"7 Various [Transcriber's Note: The following was proofre... \n",
|
||
"8 Anonymous [Transcriber's note: Anonymous, _The power of ... \n",
|
||
"9 Denis Diderot _Les Bijoux Indiscrets._\\n\\nOR,\\n\\nThe Indiscr... \n",
|
||
"10 Anonymous THE LADIES DELIGHT.\\n\\nCONTAINING,\\n\\nI. An Ad... \n",
|
||
"11 Havelock Ellis VOLUME 5 (OF 6)***\\n\\nE-text prepared by Julie... \n",
|
||
"12 J. K. Huysmans LA-BAS\\n\\n(DOWN THERE)\\n\\nby J.K. HUYSMANS\\n\\n... \n",
|
||
"13 Havelock Ellis VOLUME 5 (OF 6)***\\n\\nE-text prepared by Julie... \n",
|
||
"14 Friedrich Karl Forberg MANUAL\\n\\nOF\\n\\nClassical Erotology\\n\\n(De fig... \n",
|
||
"15 Anonymous [Transcriber's note: Anonymous, _The power of ... \n",
|
||
"16 Georg Brandes MAIN CURRENTS IN NINETEEN CENTURY LITERATURE\\n... \n",
|
||
"17 Anonymous The Romance of Lust\\n\\n(1873)\\n\\nA classic Vic... \n",
|
||
"18 L. Brovan Two Hundred and fifty Copies of this Work have... \n",
|
||
"19 Various [Transcriber's Note: The following was proofre... \n",
|
||
"20 Kate Percival The Life and Amours\\n\\nOF THE\\n\\nBeautiful, Ga... \n",
|
||
"21 Havelock Ellis VOLUME 1 (OF 6)***\\n\\nE-text prepared by Julie... \n",
|
||
"\n",
|
||
" extra language \\\n",
|
||
"0 [Project Gutenberg's The Erotic Motive in Lit... English \n",
|
||
"1 [Project Gutenberg's Laura Middleton; Her Brot... English \n",
|
||
"2 [The Project Gutenberg EBook of Là-bas, by J. ... English \n",
|
||
"3 [The Project Gutenberg EBook of Memoirs Of Fan... English \n",
|
||
"4 [The Project Gutenberg eBook, Studies in the P... English \n",
|
||
"5 [The Project Gutenberg EBook of Forbidden Frui... English \n",
|
||
"6 [The Project Gutenberg EBook of The Life and A... English \n",
|
||
"7 [The Project Gutenberg EBook of The Fifteen Co... English \n",
|
||
"8 [The Project Gutenberg EBook of The Power of M... English \n",
|
||
"9 [The Project Gutenberg EBook of Les Bijoux Ind... English \n",
|
||
"10 [The Project Gutenberg EBook of The Ladies Del... English \n",
|
||
"11 [The Project Gutenberg eBook, Studies in the P... English \n",
|
||
"12 [The Project Gutenberg EBook of La-bas, by J. ... English \n",
|
||
"13 [The Project Gutenberg eBook, Studies in the P... English \n",
|
||
"14 [The Project Gutenberg EBook of Manual of Cla... English \n",
|
||
"15 [The Project Gutenberg EBook of The Power of M... English \n",
|
||
"16 [The Project Gutenberg EBook of Main Currents... English \n",
|
||
"17 [, The Project Gutenberg EBook of The Romance ... English \n",
|
||
"18 [The Project Gutenberg EBook of Anthologica R... English \n",
|
||
"19 [The Project Gutenberg EBook of The Fifteen Co... English \n",
|
||
"20 [The Project Gutenberg EBook of The Life and A... English \n",
|
||
"21 [The Project Gutenberg eBook, Studies in the P... English \n",
|
||
"\n",
|
||
" title \n",
|
||
"0 The Erotic Motive in Literature \n",
|
||
"1 Laura Middleton; Her Brother and her Lover \n",
|
||
"2 Là-bas \n",
|
||
"3 Memoirs Of Fanny Hill, A New and Genuine Editi... \n",
|
||
"4 Studies in the Psychology of Sex, Volume 1 (of 6) \n",
|
||
"5 Forbidden Fruit, Luscious and exciting story a... \n",
|
||
"6 The Life and Amours of the Beautiful, Gay and ... \n",
|
||
"7 The Fifteen Comforts of Matrimony: Responses F... \n",
|
||
"8 The Power of Mesmerism, A Highly Erotic Narrat... \n",
|
||
"9 Les Bijoux Indiscrets, or, The Indiscreet Toys \n",
|
||
"10 The Ladies Delight \n",
|
||
"11 Studies in the Psychology of Sex, Volume 5 (of 6) \n",
|
||
"12 La-bas \n",
|
||
"13 Studies in the Psychology of Sex, Volume 5 (of 6) \n",
|
||
"14 Manual of Classical Erotology (De figuris Vene... \n",
|
||
"15 The Power of Mesmerism, A Highly Erotic Narrat... \n",
|
||
"16 Main Currents in Nineteenth Century Literature... \n",
|
||
"17 The Romance of Lust A classic Victorian erotic... \n",
|
||
"18 Anthologica Rarissima: The Way of a Virgin \n",
|
||
"19 The Fifteen Comforts of Matrimony: Responses f... \n",
|
||
"20 The Life and Amours of the Beautiful, Gay and ... \n",
|
||
"21 Studies in the Psychology of Sex, Volume 1 (of 6) "
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import uuid\n",
|
||
"import pandas as pd\n",
|
||
"import nltk\n",
|
||
"# nltk.download('punkt')\n",
|
||
"\n",
|
||
"dest_dir = 'data/corpus/erotic_gutenberg'\n",
|
||
"max_len = 400\n",
|
||
"num_sent = 6\n",
|
||
"data=[]\n",
|
||
"for infile in os.listdir(dest_dir):\n",
|
||
" path = os.path.join(dest_dir, infile)\n",
|
||
" info = json.load(open(path))\n",
|
||
" data.append(info)\n",
|
||
"df = pd.DataFrame(data)\n",
|
||
"df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2018-11-04T08:55:59.624853Z",
|
||
"start_time": "2018-11-04T08:55:59.621367Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# # Test: concat small sentences\n",
|
||
"# # And split large ones\n",
|
||
"# sent = []\n",
|
||
"# for s in sentances:\n",
|
||
"# if len(s)>10:\n",
|
||
"# sent.append(s)\n",
|
||
"# else:\n",
|
||
"# sent[-1]+=' '+s\n",
|
||
"# sent"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2018-11-04T08:56:56.969876Z",
|
||
"start_time": "2018-11-04T08:56:56.965223Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"df = df.rename(columns=dict(content='TEXT'))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2018-11-04T08:56:58.359405Z",
|
||
"start_time": "2018-11-04T08:56:58.068470Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"df.to_csv('data/erotic_gutenberg_dataset.csv', index=False)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "jupyter3",
|
||
"language": "python",
|
||
"name": "jupyter3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.5.3"
|
||
},
|
||
"toc": {
|
||
"colors": {
|
||
"hover_highlight": "#DAA520",
|
||
"navigate_num": "#000000",
|
||
"navigate_text": "#333333",
|
||
"running_highlight": "#FF0000",
|
||
"selected_highlight": "#FFD700",
|
||
"sidebar_border": "#EEEEEE",
|
||
"wrapper_background": "#FFFFFF"
|
||
},
|
||
"moveMenuLeft": true,
|
||
"nav_menu": {
|
||
"height": "91px",
|
||
"width": "251px"
|
||
},
|
||
"navigate_menu": true,
|
||
"number_sections": true,
|
||
"sideBar": true,
|
||
"threshold": 4,
|
||
"toc_cell": false,
|
||
"toc_position": {
|
||
"height": "553px",
|
||
"left": "0px",
|
||
"right": "1064px",
|
||
"top": "149px",
|
||
"width": "312px"
|
||
},
|
||
"toc_section_display": "block",
|
||
"toc_window_display": true,
|
||
"widenNotebook": false
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|