mirror of
https://github.com/wassname/openai-transformer-lm-gutenberg-erotic.git
synced 2026-06-27 16:10:19 +08:00
using language model from horoscope_language_model
https://github.com/rodgzilla/pytorch-openai-transformer-lm/blob/horoscope_language_model
This commit is contained in:
+2
-2
@@ -2,7 +2,7 @@ import os
|
||||
import csv
|
||||
import numpy as np
|
||||
|
||||
from tqdm import tqdm
|
||||
from tqdm import tqdm_notebook as tqdm
|
||||
|
||||
from sklearn.utils import shuffle
|
||||
from sklearn.model_selection import train_test_split
|
||||
@@ -16,7 +16,7 @@ def _rocstories(path):
|
||||
ct1 = []
|
||||
ct2 = []
|
||||
y = []
|
||||
for i, line in enumerate(tqdm(list(f), ncols=80, leave=False)):
|
||||
for i, line in enumerate(tqdm(list(f), ncols=80, mininterval=10, leave=False)):
|
||||
if i > 0:
|
||||
s = ' '.join(line[1:5]) # 4 sentances
|
||||
st.append(s)
|
||||
|
||||
+376
-103
@@ -40,11 +40,11 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-04T05:17:20.517091Z",
|
||||
"start_time": "2018-11-04T05:17:20.141438Z"
|
||||
"end_time": "2018-11-04T08:54:51.977129Z",
|
||||
"start_time": "2018-11-04T08:54:51.641106Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -62,11 +62,11 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-04T05:17:20.567180Z",
|
||||
"start_time": "2018-11-04T05:17:20.519956Z"
|
||||
"end_time": "2018-11-04T08:54:52.021478Z",
|
||||
"start_time": "2018-11-04T08:54:51.980984Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -134,11 +134,11 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-04T05:17:09.263186Z",
|
||||
"start_time": "2018-11-04T05:17:09.246892Z"
|
||||
"end_time": "2018-11-04T08:54:52.038785Z",
|
||||
"start_time": "2018-11-04T08:54:52.024258Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -225,14 +225,73 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-04T02:41:00.594210Z",
|
||||
"start_time": "2018-11-04T02:37:49.143793Z"
|
||||
"end_time": "2018-11-04T08:55:45.417570Z",
|
||||
"start_time": "2018-11-04T08:54:52.040977Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "2e1c657a58ac42f9bb99cc8221138bee",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"HBox(children=(IntProgress(value=0, max=56), HTML(value='')))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Problem: No title found\n",
|
||||
"\n",
|
||||
" Problem: No '*** START' seen\n",
|
||||
"\n",
|
||||
" Problem: No '*** END' seen\n",
|
||||
"\n",
|
||||
" Problem: No title found\n",
|
||||
"\n",
|
||||
" Problem: No '*** START' seen\n",
|
||||
"\n",
|
||||
" Problem: No '*** END' seen\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "KeyboardInterrupt",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m<ipython-input-4-88f9382543da>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# first download index\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mindex_url\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"http://www.gutenberg.org/files/{bid:}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbid\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex_url\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraise_for_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0msoup\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbs4\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mBeautifulSoup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontent\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"html5lib\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/requests/api.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(url, params, **kwargs)\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'allow_redirects'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 72\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'get'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 73\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/requests/api.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 58\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 506\u001b[0m }\n\u001b[1;32m 507\u001b[0m \u001b[0msend_kwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 508\u001b[0;31m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 509\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 510\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 616\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 617\u001b[0m \u001b[0;31m# Send the request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 618\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 619\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 620\u001b[0m \u001b[0;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 438\u001b[0m \u001b[0mdecode_content\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_retries\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 440\u001b[0;31m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 441\u001b[0m )\n\u001b[1;32m 442\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m 599\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 600\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 601\u001b[0;31m chunked=chunked)\n\u001b[0m\u001b[1;32m 602\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 603\u001b[0m \u001b[0;31m# If we're going to release the connection in ``finally:``, then\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 355\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest_chunked\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mhttplib_request_kw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 356\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 357\u001b[0;31m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mhttplib_request_kw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 358\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 359\u001b[0m \u001b[0;31m# Reset the timeout for the recv() on the socket\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.pyenv/versions/3.5.3/lib/python3.5/http/client.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, body, headers)\u001b[0m\n\u001b[1;32m 1105\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1106\u001b[0m \u001b[0;34m\"\"\"Send a complete request to the server.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1107\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1108\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1109\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_set_content_length\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.pyenv/versions/3.5.3/lib/python3.5/http/client.py\u001b[0m in \u001b[0;36m_send_request\u001b[0;34m(self, method, url, body, headers)\u001b[0m\n\u001b[1;32m 1150\u001b[0m \u001b[0;31m# default charset of iso-8859-1.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1151\u001b[0m \u001b[0mbody\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_encode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'body'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1152\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mendheaders\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1153\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1154\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mgetresponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.pyenv/versions/3.5.3/lib/python3.5/http/client.py\u001b[0m in \u001b[0;36mendheaders\u001b[0;34m(self, message_body)\u001b[0m\n\u001b[1;32m 1101\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1102\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mCannotSendHeader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1103\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_output\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage_body\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1104\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1105\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.pyenv/versions/3.5.3/lib/python3.5/http/client.py\u001b[0m in \u001b[0;36m_send_output\u001b[0;34m(self, message_body)\u001b[0m\n\u001b[1;32m 932\u001b[0m \u001b[0;32mdel\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_buffer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 933\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 934\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 935\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmessage_body\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 936\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage_body\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.pyenv/versions/3.5.3/lib/python3.5/http/client.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 875\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msock\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 876\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauto_open\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 877\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 878\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 879\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mNotConnected\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/urllib3/connection.py\u001b[0m in \u001b[0;36mconnect\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 165\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 166\u001b[0;31m \u001b[0mconn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_new_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 167\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_prepare_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/urllib3/connection.py\u001b[0m in \u001b[0;36m_new_conn\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 139\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 140\u001b[0m conn = connection.create_connection(\n\u001b[0;32m--> 141\u001b[0;31m (self.host, self.port), self.timeout, **extra_kw)\n\u001b[0m\u001b[1;32m 142\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 143\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mSocketTimeout\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/urllib3/util/connection.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address, socket_options)\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msource_address\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource_address\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 73\u001b[0;31m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msa\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 74\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for bid in tqdm(ids):\n",
|
||||
" \n",
|
||||
@@ -294,14 +353,302 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-04T05:19:03.832336Z",
|
||||
"start_time": "2018-11-04T05:18:59.180170Z"
|
||||
"end_time": "2018-11-04T08:55:48.451860Z",
|
||||
"start_time": "2018-11-04T08:55:46.818680Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>author</th>\n",
|
||||
" <th>content</th>\n",
|
||||
" <th>extra</th>\n",
|
||||
" <th>language</th>\n",
|
||||
" <th>title</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>Albert Mordell</td>\n",
|
||||
" <td>THE EROTIC MOTIVE IN LITERATURE\\n\\nTHE EROTIC ...</td>\n",
|
||||
" <td>[Project Gutenberg's The Erotic Motive in Lit...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>The Erotic Motive in Literature</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>Anonymous</td>\n",
|
||||
" <td>[Transcriber's note: Anonymous, _Laura Middlet...</td>\n",
|
||||
" <td>[Project Gutenberg's Laura Middleton; Her Brot...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>Laura Middleton; Her Brother and her Lover</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>J. K. Huysmans</td>\n",
|
||||
" <td>LÀ-BAS\\n\\n(DOWN THERE)\\n\\nby J.K. HUYSMANS\\n\\n...</td>\n",
|
||||
" <td>[The Project Gutenberg EBook of Là-bas, by J. ...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>Là-bas</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>John Cleland</td>\n",
|
||||
" <td>MEMOIRS OF FANNY HILL\\n\\nBy John Cleland\\n\\n_A...</td>\n",
|
||||
" <td>[The Project Gutenberg EBook of Memoirs Of Fan...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>Memoirs Of Fanny Hill, A New and Genuine Editi...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>Havelock Ellis</td>\n",
|
||||
" <td>VOLUME 1 (OF 6)***\\n\\nE-text prepared by Julie...</td>\n",
|
||||
" <td>[The Project Gutenberg eBook, Studies in the P...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>Studies in the Psychology of Sex, Volume 1 (of 6)</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>5</th>\n",
|
||||
" <td>Anonymous</td>\n",
|
||||
" <td>[Transcriber's note: Anonymous, _Forbidden fru...</td>\n",
|
||||
" <td>[The Project Gutenberg EBook of Forbidden Frui...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>Forbidden Fruit, Luscious and exciting story a...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6</th>\n",
|
||||
" <td>Kate Percival</td>\n",
|
||||
" <td>The Life and Amours\\n\\nOF THE\\n\\nBeautiful, Ga...</td>\n",
|
||||
" <td>[The Project Gutenberg EBook of The Life and A...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>The Life and Amours of the Beautiful, Gay and ...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>7</th>\n",
|
||||
" <td>Various</td>\n",
|
||||
" <td>[Transcriber's Note: The following was proofre...</td>\n",
|
||||
" <td>[The Project Gutenberg EBook of The Fifteen Co...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>The Fifteen Comforts of Matrimony: Responses F...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8</th>\n",
|
||||
" <td>Anonymous</td>\n",
|
||||
" <td>[Transcriber's note: Anonymous, _The power of ...</td>\n",
|
||||
" <td>[The Project Gutenberg EBook of The Power of M...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>The Power of Mesmerism, A Highly Erotic Narrat...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9</th>\n",
|
||||
" <td>Denis Diderot</td>\n",
|
||||
" <td>_Les Bijoux Indiscrets._\\n\\nOR,\\n\\nThe Indiscr...</td>\n",
|
||||
" <td>[The Project Gutenberg EBook of Les Bijoux Ind...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>Les Bijoux Indiscrets, or, The Indiscreet Toys</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>10</th>\n",
|
||||
" <td>Anonymous</td>\n",
|
||||
" <td>THE LADIES DELIGHT.\\n\\nCONTAINING,\\n\\nI. An Ad...</td>\n",
|
||||
" <td>[The Project Gutenberg EBook of The Ladies Del...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>The Ladies Delight</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>11</th>\n",
|
||||
" <td>Havelock Ellis</td>\n",
|
||||
" <td>VOLUME 5 (OF 6)***\\n\\nE-text prepared by Julie...</td>\n",
|
||||
" <td>[The Project Gutenberg eBook, Studies in the P...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>Studies in the Psychology of Sex, Volume 5 (of 6)</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>12</th>\n",
|
||||
" <td>J. K. Huysmans</td>\n",
|
||||
" <td>LA-BAS\\n\\n(DOWN THERE)\\n\\nby J.K. HUYSMANS\\n\\n...</td>\n",
|
||||
" <td>[The Project Gutenberg EBook of La-bas, by J. ...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>La-bas</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>13</th>\n",
|
||||
" <td>Havelock Ellis</td>\n",
|
||||
" <td>VOLUME 5 (OF 6)***\\n\\nE-text prepared by Julie...</td>\n",
|
||||
" <td>[The Project Gutenberg eBook, Studies in the P...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>Studies in the Psychology of Sex, Volume 5 (of 6)</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>14</th>\n",
|
||||
" <td>Friedrich Karl Forberg</td>\n",
|
||||
" <td>MANUAL\\n\\nOF\\n\\nClassical Erotology\\n\\n(De fig...</td>\n",
|
||||
" <td>[The Project Gutenberg EBook of Manual of Cla...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>Manual of Classical Erotology (De figuris Vene...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>15</th>\n",
|
||||
" <td>Anonymous</td>\n",
|
||||
" <td>[Transcriber's note: Anonymous, _The power of ...</td>\n",
|
||||
" <td>[The Project Gutenberg EBook of The Power of M...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>The Power of Mesmerism, A Highly Erotic Narrat...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>16</th>\n",
|
||||
" <td>Georg Brandes</td>\n",
|
||||
" <td>MAIN CURRENTS IN NINETEEN CENTURY LITERATURE\\n...</td>\n",
|
||||
" <td>[The Project Gutenberg EBook of Main Currents...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>Main Currents in Nineteenth Century Literature...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>17</th>\n",
|
||||
" <td>Anonymous</td>\n",
|
||||
" <td>The Romance of Lust\\n\\n(1873)\\n\\nA classic Vic...</td>\n",
|
||||
" <td>[, The Project Gutenberg EBook of The Romance ...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>The Romance of Lust A classic Victorian erotic...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>18</th>\n",
|
||||
" <td>L. Brovan</td>\n",
|
||||
" <td>Two Hundred and fifty Copies of this Work have...</td>\n",
|
||||
" <td>[The Project Gutenberg EBook of Anthologica R...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>Anthologica Rarissima: The Way of a Virgin</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>19</th>\n",
|
||||
" <td>Various</td>\n",
|
||||
" <td>[Transcriber's Note: The following was proofre...</td>\n",
|
||||
" <td>[The Project Gutenberg EBook of The Fifteen Co...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>The Fifteen Comforts of Matrimony: Responses f...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>20</th>\n",
|
||||
" <td>Kate Percival</td>\n",
|
||||
" <td>The Life and Amours\\n\\nOF THE\\n\\nBeautiful, Ga...</td>\n",
|
||||
" <td>[The Project Gutenberg EBook of The Life and A...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>The Life and Amours of the Beautiful, Gay and ...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>21</th>\n",
|
||||
" <td>Havelock Ellis</td>\n",
|
||||
" <td>VOLUME 1 (OF 6)***\\n\\nE-text prepared by Julie...</td>\n",
|
||||
" <td>[The Project Gutenberg eBook, Studies in the P...</td>\n",
|
||||
" <td>English</td>\n",
|
||||
" <td>Studies in the Psychology of Sex, Volume 1 (of 6)</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" author content \\\n",
|
||||
"0 Albert Mordell THE EROTIC MOTIVE IN LITERATURE\\n\\nTHE EROTIC ... \n",
|
||||
"1 Anonymous [Transcriber's note: Anonymous, _Laura Middlet... \n",
|
||||
"2 J. K. Huysmans LÀ-BAS\\n\\n(DOWN THERE)\\n\\nby J.K. HUYSMANS\\n\\n... \n",
|
||||
"3 John Cleland MEMOIRS OF FANNY HILL\\n\\nBy John Cleland\\n\\n_A... \n",
|
||||
"4 Havelock Ellis VOLUME 1 (OF 6)***\\n\\nE-text prepared by Julie... \n",
|
||||
"5 Anonymous [Transcriber's note: Anonymous, _Forbidden fru... \n",
|
||||
"6 Kate Percival The Life and Amours\\n\\nOF THE\\n\\nBeautiful, Ga... \n",
|
||||
"7 Various [Transcriber's Note: The following was proofre... \n",
|
||||
"8 Anonymous [Transcriber's note: Anonymous, _The power of ... \n",
|
||||
"9 Denis Diderot _Les Bijoux Indiscrets._\\n\\nOR,\\n\\nThe Indiscr... \n",
|
||||
"10 Anonymous THE LADIES DELIGHT.\\n\\nCONTAINING,\\n\\nI. An Ad... \n",
|
||||
"11 Havelock Ellis VOLUME 5 (OF 6)***\\n\\nE-text prepared by Julie... \n",
|
||||
"12 J. K. Huysmans LA-BAS\\n\\n(DOWN THERE)\\n\\nby J.K. HUYSMANS\\n\\n... \n",
|
||||
"13 Havelock Ellis VOLUME 5 (OF 6)***\\n\\nE-text prepared by Julie... \n",
|
||||
"14 Friedrich Karl Forberg MANUAL\\n\\nOF\\n\\nClassical Erotology\\n\\n(De fig... \n",
|
||||
"15 Anonymous [Transcriber's note: Anonymous, _The power of ... \n",
|
||||
"16 Georg Brandes MAIN CURRENTS IN NINETEEN CENTURY LITERATURE\\n... \n",
|
||||
"17 Anonymous The Romance of Lust\\n\\n(1873)\\n\\nA classic Vic... \n",
|
||||
"18 L. Brovan Two Hundred and fifty Copies of this Work have... \n",
|
||||
"19 Various [Transcriber's Note: The following was proofre... \n",
|
||||
"20 Kate Percival The Life and Amours\\n\\nOF THE\\n\\nBeautiful, Ga... \n",
|
||||
"21 Havelock Ellis VOLUME 1 (OF 6)***\\n\\nE-text prepared by Julie... \n",
|
||||
"\n",
|
||||
" extra language \\\n",
|
||||
"0 [Project Gutenberg's The Erotic Motive in Lit... English \n",
|
||||
"1 [Project Gutenberg's Laura Middleton; Her Brot... English \n",
|
||||
"2 [The Project Gutenberg EBook of Là-bas, by J. ... English \n",
|
||||
"3 [The Project Gutenberg EBook of Memoirs Of Fan... English \n",
|
||||
"4 [The Project Gutenberg eBook, Studies in the P... English \n",
|
||||
"5 [The Project Gutenberg EBook of Forbidden Frui... English \n",
|
||||
"6 [The Project Gutenberg EBook of The Life and A... English \n",
|
||||
"7 [The Project Gutenberg EBook of The Fifteen Co... English \n",
|
||||
"8 [The Project Gutenberg EBook of The Power of M... English \n",
|
||||
"9 [The Project Gutenberg EBook of Les Bijoux Ind... English \n",
|
||||
"10 [The Project Gutenberg EBook of The Ladies Del... English \n",
|
||||
"11 [The Project Gutenberg eBook, Studies in the P... English \n",
|
||||
"12 [The Project Gutenberg EBook of La-bas, by J. ... English \n",
|
||||
"13 [The Project Gutenberg eBook, Studies in the P... English \n",
|
||||
"14 [The Project Gutenberg EBook of Manual of Cla... English \n",
|
||||
"15 [The Project Gutenberg EBook of The Power of M... English \n",
|
||||
"16 [The Project Gutenberg EBook of Main Currents... English \n",
|
||||
"17 [, The Project Gutenberg EBook of The Romance ... English \n",
|
||||
"18 [The Project Gutenberg EBook of Anthologica R... English \n",
|
||||
"19 [The Project Gutenberg EBook of The Fifteen Co... English \n",
|
||||
"20 [The Project Gutenberg EBook of The Life and A... English \n",
|
||||
"21 [The Project Gutenberg eBook, Studies in the P... English \n",
|
||||
"\n",
|
||||
" title \n",
|
||||
"0 The Erotic Motive in Literature \n",
|
||||
"1 Laura Middleton; Her Brother and her Lover \n",
|
||||
"2 Là-bas \n",
|
||||
"3 Memoirs Of Fanny Hill, A New and Genuine Editi... \n",
|
||||
"4 Studies in the Psychology of Sex, Volume 1 (of 6) \n",
|
||||
"5 Forbidden Fruit, Luscious and exciting story a... \n",
|
||||
"6 The Life and Amours of the Beautiful, Gay and ... \n",
|
||||
"7 The Fifteen Comforts of Matrimony: Responses F... \n",
|
||||
"8 The Power of Mesmerism, A Highly Erotic Narrat... \n",
|
||||
"9 Les Bijoux Indiscrets, or, The Indiscreet Toys \n",
|
||||
"10 The Ladies Delight \n",
|
||||
"11 Studies in the Psychology of Sex, Volume 5 (of 6) \n",
|
||||
"12 La-bas \n",
|
||||
"13 Studies in the Psychology of Sex, Volume 5 (of 6) \n",
|
||||
"14 Manual of Classical Erotology (De figuris Vene... \n",
|
||||
"15 The Power of Mesmerism, A Highly Erotic Narrat... \n",
|
||||
"16 Main Currents in Nineteenth Century Literature... \n",
|
||||
"17 The Romance of Lust A classic Victorian erotic... \n",
|
||||
"18 Anthologica Rarissima: The Way of a Virgin \n",
|
||||
"19 The Fifteen Comforts of Matrimony: Responses f... \n",
|
||||
"20 The Life and Amours of the Beautiful, Gay and ... \n",
|
||||
"21 Studies in the Psychology of Sex, Volume 1 (of 6) "
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import uuid\n",
|
||||
"import pandas as pd\n",
|
||||
@@ -315,33 +662,18 @@
|
||||
"for infile in os.listdir(dest_dir):\n",
|
||||
" path = os.path.join(dest_dir, infile)\n",
|
||||
" info = json.load(open(path))\n",
|
||||
" paragraphs = info['content'].split('\\n\\n')\n",
|
||||
" for paragraph in paragraphs:\n",
|
||||
"# sentances = [p for p in paragraph.strip().split('. ')]\n",
|
||||
" sentances = nltk.sent_tokenize(paragraph)\n",
|
||||
" if len(sentances)>num_sent:\n",
|
||||
" for i in range(len(sentances)//num_sent):\n",
|
||||
" data.append(dict(\n",
|
||||
" storyid=uuid.uuid4().hex,\n",
|
||||
" sentence1=sentances[i*5+0][:max_len],\n",
|
||||
" sentence2=sentances[i*5+1][:max_len],\n",
|
||||
" sentence3=sentances[i*5+2][:max_len],\n",
|
||||
" sentence4=sentances[i*5+3][:max_len],\n",
|
||||
" sentence5=sentances[i*5+4][:max_len],\n",
|
||||
" AnswerRightEnding=1\n",
|
||||
" ))\n",
|
||||
" data.append(info)\n",
|
||||
"df = pd.DataFrame(data)\n",
|
||||
"df = df[['storyid', 'sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5', 'AnswerRightEnding']]\n",
|
||||
"df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-04T05:19:03.839834Z",
|
||||
"start_time": "2018-11-04T05:19:03.835656Z"
|
||||
"end_time": "2018-11-04T08:55:59.624853Z",
|
||||
"start_time": "2018-11-04T08:55:59.621367Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -359,91 +691,32 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 11,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-04T05:19:09.318413Z",
|
||||
"start_time": "2018-11-04T05:19:08.980242Z"
|
||||
"end_time": "2018-11-04T08:56:56.969876Z",
|
||||
"start_time": "2018-11-04T08:56:56.965223Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"df['sentence1'].str.len().plot.hist(bins=55)\n",
|
||||
"df['sentence1'].str.len().max()"
|
||||
"df = df.rename(columns=dict(content='TEXT'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 12,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-04T05:19:14.924306Z",
|
||||
"start_time": "2018-11-04T05:19:14.917985Z"
|
||||
"end_time": "2018-11-04T08:56:58.359405Z",
|
||||
"start_time": "2018-11-04T08:56:58.068470Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"val_idx = int(len(df)*0.7)\n",
|
||||
"df_train = df[:val_idx]\n",
|
||||
"df_val = df[val_idx:]"
|
||||
"df.to_csv('data/erotic_gutenberg_dataset.csv', index=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-04T05:19:15.503886Z",
|
||||
"start_time": "2018-11-04T05:19:15.384466Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_train.to_csv('data/erotic_gutenberg_TRAIN.csv', index=False)\n",
|
||||
"df_val.to_csv('data/erotic_gutenberg_VAL.csv', index=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-04T05:19:16.452211Z",
|
||||
"start_time": "2018-11-04T05:19:16.281536Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# import csv\n",
|
||||
"# def _rocstories(path):\n",
|
||||
"# with open(path, encoding='utf_8') as f:\n",
|
||||
"# f = csv.reader(f)\n",
|
||||
"# st = []\n",
|
||||
"# ct1 = []\n",
|
||||
"# ct2 = []\n",
|
||||
"# y = []\n",
|
||||
"# for i, line in enumerate(tqdm(list(f), ncols=80, leave=False)):\n",
|
||||
"# if i > 0:\n",
|
||||
"# s = ' '.join(line[1:5])\n",
|
||||
"# c1 = line[5]\n",
|
||||
"# c2 = line[6]\n",
|
||||
"# st.append(s)\n",
|
||||
"# ct1.append(c1)\n",
|
||||
"# ct2.append(c2)\n",
|
||||
"# y.append(int(line[-1])-1)\n",
|
||||
"# return st, ct1, ct2, y\n",
|
||||
" \n",
|
||||
"# _rocstories('data/erotic_gutenberg_TRAIN.csv')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
@@ -1,32 +1,5 @@
|
||||
import torch
|
||||
|
||||
class LMLossCompute:
    """Compute a masked language-modeling loss and optionally take a training step.

    The criterion must return one loss value per target token (i.e. be
    unreduced), because the result is reshaped to one row per sequence and
    averaged over the unmasked positions of that row.
    """

    def __init__(self, lm_criterion, opt=None):
        """Store the per-token criterion and an optional optimizer.

        Args:
            lm_criterion: callable ``(lm_logits, targets) -> per-token losses``.
            opt: optional optimizer; when given, ``step()`` and ``zero_grad()``
                are called after the backward pass.
        """
        self.lm_criterion = lm_criterion
        self.opt = opt

    def __call__(self, X, Y, M, lm_logits, only_return_losses=False):
        """Return the language-modeling loss for a batch.

        Args:
            X: tensor indexed as ``X[:, :, 1:, 0]``, i.e. 4-D. Presumably
                (batch, choices, n_ctx, 2) with token ids in channel 0 --
                confirm against the caller.
            Y: unused; kept so the call signature matches the other loss
                compute classes.
            M: mask whose last axis (axis 2) is the sequence axis; it is
                flattened to one row per sequence.
            lm_logits: logits accepted by ``lm_criterion`` together with the
                flattened shifted targets.
            only_return_losses: when true, return the per-sequence losses
                without running backward or the optimizer.

        Returns:
            The per-sequence loss tensor when ``only_return_losses`` is true,
            otherwise the summed loss as a Python float.

        Raises:
            ValueError: if ``lm_logits`` is None.
        """
        if lm_logits is None:
            # Previously this fell through to an unbound ``lm_losses``.
            raise ValueError("lm_logits is required to compute the language modeling loss.")

        # Language modeling loss: targets are the inputs shifted by one position.
        x_shifted = X[:, :, 1:, 0].contiguous().view(-1)
        M = M.view(-1, M.size(2))
        lm_losses = self.lm_criterion(lm_logits, x_shifted)
        lm_losses = lm_losses.view(X.size(0) * X.size(1), X.size(2) - 1)
        lm_losses = lm_losses * M[:, 1:]

        # A fully masked row would give 0 / 0 = NaN and poison the summed
        # loss; its numerator is already 0, so dividing by 1 yields 0.
        token_counts = torch.sum(M[:, 1:], 1)
        token_counts = token_counts.masked_fill(token_counts == 0, 1)
        lm_losses = lm_losses.sum(1) / token_counts

        if only_return_losses:
            return lm_losses

        train_loss = lm_losses.sum()
        train_loss.backward()
        if self.opt is not None:
            self.opt.step()
            self.opt.zero_grad()
        return train_loss.item()
|
||||
|
||||
|
||||
class MultipleChoiceLossCompute:
|
||||
"A Loss compute and train function for multiple choice tasks."
|
||||
|
||||
@@ -93,4 +66,29 @@ class ClassificationLossCompute:
|
||||
self.opt.zero_grad()
|
||||
return train_loss.item()
|
||||
|
||||
class LanguageModelingLossCompute:
    """A loss compute and train function for language modeling tasks.

    The criterion must return one loss value per target token (i.e. be
    unreduced), because the result is reshaped to one row per sequence and
    averaged over the unmasked positions of that row.
    """

    def __init__(self, lm_criterion, opt=None):
        """Store the per-token criterion and an optional optimizer.

        Args:
            lm_criterion: callable ``(lm_logits, targets) -> per-token losses``.
            opt: optional optimizer; when given, ``step()`` and ``zero_grad()``
                are called after the backward pass.
        """
        self.lm_criterion = lm_criterion
        self.opt = opt

    def __call__(self, X, Y, M, lm_logits, only_return_losses=False):
        """Return the language-modeling loss for a batch.

        Args:
            X: tensor indexed as ``X[:, 1:, 0]``, i.e. 3-D. Presumably
                (batch, n_ctx, 2) with token ids in channel 0 -- confirm
                against the caller.
            Y: unused; kept so the call signature matches the other loss
                compute classes.
            M: mask whose last axis is the sequence axis; it is flattened to
                one row per sequence.
            lm_logits: logits accepted by ``lm_criterion`` together with the
                flattened shifted targets.
            only_return_losses: when true, return the per-sequence losses
                without running backward or the optimizer.

        Returns:
            The per-sequence loss tensor when ``only_return_losses`` is true,
            otherwise the summed loss as a Python float.
        """
        # Language modeling loss: targets are the inputs shifted by one position.
        x_shifted = X[:, 1:, 0].contiguous().view(-1)
        M = M.view(-1, M.size(-1))
        lm_losses = self.lm_criterion(lm_logits, x_shifted)
        lm_losses = lm_losses.view(X.size(0), X.size(-2) - 1)
        lm_losses = lm_losses * M[:, 1:]

        # A fully masked row would give 0 / 0 = NaN and poison the summed
        # loss; its numerator is already 0, so dividing by 1 yields 0.
        token_counts = torch.sum(M[:, 1:], 1)
        token_counts = token_counts.masked_fill(token_counts == 0, 1)
        lm_losses = lm_losses.sum(1) / token_counts

        if only_return_losses:
            return lm_losses

        train_loss = lm_losses.sum()
        train_loss.backward()
        if self.opt is not None:
            self.opt.step()
            self.opt.zero_grad()
        return train_loss.item()
|
||||
|
||||
# TODO Implement a LossCompute class for similarity tasks.
|
||||
|
||||
+14
-2
@@ -279,8 +279,8 @@ class DoubleHeadModel(nn.Module):
|
||||
# the three classes correspond to entailment, contradiction and neutral.
|
||||
self.task_head = ClfHead(clf_token, cfg, 3)
|
||||
else:
|
||||
raise ValueError("task_head_type is expected to be 'multiple_choice' "+
|
||||
"'similarity', 'inference' or ('classification', n_class) "+
|
||||
raise ValueError("task_head_type is expected to be 'multiple_choice' "
|
||||
"'similarity', 'inference' or ('classification', n_class) "
|
||||
"got {task_head_type}.".format(task_head_type=task_head_type))
|
||||
elif isinstance(task_head_type, collections.abc.Sequence) and len(task_head_type) == 2 and \
|
||||
task_head_type[0] == 'classification':
|
||||
@@ -298,6 +298,18 @@ class DoubleHeadModel(nn.Module):
|
||||
|
||||
return lm_logits, task_logits
|
||||
|
||||
class LanguageModel(nn.Module):
    """Transformer body followed by a language-modeling head."""

    def __init__(self, cfg, vocab=40990, n_ctx=512):
        """Build the transformer and an LM head constructed from it.

        Args:
            cfg: configuration object forwarded to both sub-modules.
            vocab: vocabulary size forwarded to the transformer.
            n_ctx: context length forwarded to the transformer.
        """
        super().__init__()
        self.transformer = TransformerModel(cfg, vocab=vocab, n_ctx=n_ctx)
        # The head receives the transformer itself at construction time.
        self.lm_head = LMHead(self.transformer, cfg)

    def forward(self, x):
        """Return the LM head's logits for the transformer's output on ``x``."""
        hidden = self.transformer(x)
        return self.lm_head(hidden)
|
||||
|
||||
def load_openai_pretrained_model(model, n_ctx=-1, n_special=-1, n_transfer=12, n_embd=768, path='./model/',
|
||||
path_names='./'):
|
||||
|
||||
+2
-2
@@ -3,7 +3,7 @@ import ftfy
|
||||
import json
|
||||
import spacy
|
||||
|
||||
from tqdm import tqdm
|
||||
from tqdm import tqdm_notebook as tqdm
|
||||
|
||||
def get_pairs(word):
|
||||
"""
|
||||
@@ -92,7 +92,7 @@ class TextEncoder(object):
|
||||
def encode(self, texts, verbose=True):
|
||||
texts_tokens = []
|
||||
if verbose:
|
||||
for text in tqdm(texts, ncols=80, leave=False):
|
||||
for text in tqdm(texts, ncols=80, mininterval=10, leave=False):
|
||||
text = self.nlp(text_standardize(ftfy.fix_text(text)))
|
||||
text_tokens = []
|
||||
for token in text:
|
||||
|
||||
+514
-1187
File diff suppressed because it is too large
Load Diff
@@ -4,9 +4,7 @@ import json
|
||||
import time
|
||||
from functools import partial
|
||||
import numpy as np
|
||||
# import tensorflow as tf
|
||||
# from tensorflow.python.framework import function
|
||||
from tqdm import tqdm
|
||||
from tqdm import tqdm_notebook as tqdm
|
||||
|
||||
def encode_dataset(*splits, encoder):
|
||||
encoded_splits = []
|
||||
@@ -92,7 +90,7 @@ def iter_data(*datas, n_batch=128, truncate=False, verbose=False, max_batches=fl
|
||||
f = sys.stderr
|
||||
else:
|
||||
f = open(os.devnull, 'w')
|
||||
for i in tqdm(range(0, n, n_batch), total=n//n_batch, file=f, ncols=80, leave=False):
|
||||
for i in tqdm(range(0, n, n_batch), total=n//n_batch, mininterval=10, file=f, ncols=80, leave=False):
|
||||
if n_batches >= max_batches: raise StopIteration
|
||||
if len(datas) == 1:
|
||||
yield datas[0][i:i+n_batch]
|
||||
|
||||
Reference in New Issue
Block a user