using language model from horoscope_language_model

https://github.com/rodgzilla/pytorch-openai-transformer-lm/blob/horoscope_language_model
This commit is contained in:
wassname
2018-11-04 17:38:07 +08:00
parent 2f9faab044
commit 1db17076fe
7 changed files with 960 additions and 1352 deletions
+2 -2
View File
@@ -2,7 +2,7 @@ import os
import csv import csv
import numpy as np import numpy as np
from tqdm import tqdm from tqdm import tqdm_notebook as tqdm
from sklearn.utils import shuffle from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
@@ -16,7 +16,7 @@ def _rocstories(path):
ct1 = [] ct1 = []
ct2 = [] ct2 = []
y = [] y = []
for i, line in enumerate(tqdm(list(f), ncols=80, leave=False)): for i, line in enumerate(tqdm(list(f), ncols=80, mininterval=10, leave=False)):
if i > 0: if i > 0:
s = ' '.join(line[1:5]) # 4 sentances s = ' '.join(line[1:5]) # 4 sentances
st.append(s) st.append(s)
+376 -103
View File
@@ -40,11 +40,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 1,
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
"end_time": "2018-11-04T05:17:20.517091Z", "end_time": "2018-11-04T08:54:51.977129Z",
"start_time": "2018-11-04T05:17:20.141438Z" "start_time": "2018-11-04T08:54:51.641106Z"
} }
}, },
"outputs": [], "outputs": [],
@@ -62,11 +62,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 2,
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
"end_time": "2018-11-04T05:17:20.567180Z", "end_time": "2018-11-04T08:54:52.021478Z",
"start_time": "2018-11-04T05:17:20.519956Z" "start_time": "2018-11-04T08:54:51.980984Z"
} }
}, },
"outputs": [], "outputs": [],
@@ -134,11 +134,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 3,
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
"end_time": "2018-11-04T05:17:09.263186Z", "end_time": "2018-11-04T08:54:52.038785Z",
"start_time": "2018-11-04T05:17:09.246892Z" "start_time": "2018-11-04T08:54:52.024258Z"
} }
}, },
"outputs": [], "outputs": [],
@@ -225,14 +225,73 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 4,
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
"end_time": "2018-11-04T02:41:00.594210Z", "end_time": "2018-11-04T08:55:45.417570Z",
"start_time": "2018-11-04T02:37:49.143793Z" "start_time": "2018-11-04T08:54:52.040977Z"
} }
}, },
"outputs": [], "outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2e1c657a58ac42f9bb99cc8221138bee",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=56), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" Problem: No title found\n",
"\n",
" Problem: No '*** START' seen\n",
"\n",
" Problem: No '*** END' seen\n",
"\n",
" Problem: No title found\n",
"\n",
" Problem: No '*** START' seen\n",
"\n",
" Problem: No '*** END' seen\n",
"\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-4-88f9382543da>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# first download index\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mindex_url\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"http://www.gutenberg.org/files/{bid:}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbid\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex_url\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraise_for_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0msoup\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbs4\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mBeautifulSoup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontent\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"html5lib\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/requests/api.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(url, params, **kwargs)\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'allow_redirects'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 72\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'get'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 73\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/requests/api.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 58\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 506\u001b[0m }\n\u001b[1;32m 507\u001b[0m \u001b[0msend_kwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 508\u001b[0;31m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 509\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 510\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 616\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 617\u001b[0m \u001b[0;31m# Send the request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 618\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 619\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 620\u001b[0m \u001b[0;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 438\u001b[0m \u001b[0mdecode_content\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_retries\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 440\u001b[0;31m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 441\u001b[0m )\n\u001b[1;32m 442\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m 599\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 600\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 601\u001b[0;31m chunked=chunked)\n\u001b[0m\u001b[1;32m 602\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 603\u001b[0m \u001b[0;31m# If we're going to release the connection in ``finally:``, then\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 355\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest_chunked\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mhttplib_request_kw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 356\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 357\u001b[0;31m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mhttplib_request_kw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 358\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 359\u001b[0m \u001b[0;31m# Reset the timeout for the recv() on the socket\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/lib/python3.5/http/client.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, body, headers)\u001b[0m\n\u001b[1;32m 1105\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1106\u001b[0m \u001b[0;34m\"\"\"Send a complete request to the server.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1107\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1108\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1109\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_set_content_length\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/lib/python3.5/http/client.py\u001b[0m in \u001b[0;36m_send_request\u001b[0;34m(self, method, url, body, headers)\u001b[0m\n\u001b[1;32m 1150\u001b[0m \u001b[0;31m# default charset of iso-8859-1.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1151\u001b[0m \u001b[0mbody\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_encode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'body'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1152\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mendheaders\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1153\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1154\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mgetresponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/lib/python3.5/http/client.py\u001b[0m in \u001b[0;36mendheaders\u001b[0;34m(self, message_body)\u001b[0m\n\u001b[1;32m 1101\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1102\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mCannotSendHeader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1103\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_output\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage_body\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1104\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1105\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/lib/python3.5/http/client.py\u001b[0m in \u001b[0;36m_send_output\u001b[0;34m(self, message_body)\u001b[0m\n\u001b[1;32m 932\u001b[0m \u001b[0;32mdel\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_buffer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 933\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 934\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 935\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmessage_body\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 936\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage_body\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/lib/python3.5/http/client.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 875\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msock\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 876\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauto_open\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 877\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 878\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 879\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mNotConnected\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/urllib3/connection.py\u001b[0m in \u001b[0;36mconnect\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 165\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 166\u001b[0;31m \u001b[0mconn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_new_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 167\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_prepare_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/urllib3/connection.py\u001b[0m in \u001b[0;36m_new_conn\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 139\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 140\u001b[0m conn = connection.create_connection(\n\u001b[0;32m--> 141\u001b[0;31m (self.host, self.port), self.timeout, **extra_kw)\n\u001b[0m\u001b[1;32m 142\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 143\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mSocketTimeout\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.5.3/envs/jupyter3/lib/python3.5/site-packages/urllib3/util/connection.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address, socket_options)\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msource_address\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource_address\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 73\u001b[0;31m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msa\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 74\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [ "source": [
"for bid in tqdm(ids):\n", "for bid in tqdm(ids):\n",
" \n", " \n",
@@ -294,14 +353,302 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 5,
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
"end_time": "2018-11-04T05:19:03.832336Z", "end_time": "2018-11-04T08:55:48.451860Z",
"start_time": "2018-11-04T05:18:59.180170Z" "start_time": "2018-11-04T08:55:46.818680Z"
} }
}, },
"outputs": [], "outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>author</th>\n",
" <th>content</th>\n",
" <th>extra</th>\n",
" <th>language</th>\n",
" <th>title</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Albert Mordell</td>\n",
" <td>THE EROTIC MOTIVE IN LITERATURE\\n\\nTHE EROTIC ...</td>\n",
" <td>[Project Gutenberg's The Erotic Motive in Lit...</td>\n",
" <td>English</td>\n",
" <td>The Erotic Motive in Literature</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Anonymous</td>\n",
" <td>[Transcriber's note: Anonymous, _Laura Middlet...</td>\n",
" <td>[Project Gutenberg's Laura Middleton; Her Brot...</td>\n",
" <td>English</td>\n",
" <td>Laura Middleton; Her Brother and her Lover</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>J. K. Huysmans</td>\n",
" <td>LÀ-BAS\\n\\n(DOWN THERE)\\n\\nby J.K. HUYSMANS\\n\\n...</td>\n",
" <td>[The Project Gutenberg EBook of Là-bas, by J. ...</td>\n",
" <td>English</td>\n",
" <td>Là-bas</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>John Cleland</td>\n",
" <td>MEMOIRS OF FANNY HILL\\n\\nBy John Cleland\\n\\n_A...</td>\n",
" <td>[The Project Gutenberg EBook of Memoirs Of Fan...</td>\n",
" <td>English</td>\n",
" <td>Memoirs Of Fanny Hill, A New and Genuine Editi...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Havelock Ellis</td>\n",
" <td>VOLUME 1 (OF 6)***\\n\\nE-text prepared by Julie...</td>\n",
" <td>[The Project Gutenberg eBook, Studies in the P...</td>\n",
" <td>English</td>\n",
" <td>Studies in the Psychology of Sex, Volume 1 (of 6)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Anonymous</td>\n",
" <td>[Transcriber's note: Anonymous, _Forbidden fru...</td>\n",
" <td>[The Project Gutenberg EBook of Forbidden Frui...</td>\n",
" <td>English</td>\n",
" <td>Forbidden Fruit, Luscious and exciting story a...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Kate Percival</td>\n",
" <td>The Life and Amours\\n\\nOF THE\\n\\nBeautiful, Ga...</td>\n",
" <td>[The Project Gutenberg EBook of The Life and A...</td>\n",
" <td>English</td>\n",
" <td>The Life and Amours of the Beautiful, Gay and ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Various</td>\n",
" <td>[Transcriber's Note: The following was proofre...</td>\n",
" <td>[The Project Gutenberg EBook of The Fifteen Co...</td>\n",
" <td>English</td>\n",
" <td>The Fifteen Comforts of Matrimony: Responses F...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Anonymous</td>\n",
" <td>[Transcriber's note: Anonymous, _The power of ...</td>\n",
" <td>[The Project Gutenberg EBook of The Power of M...</td>\n",
" <td>English</td>\n",
" <td>The Power of Mesmerism, A Highly Erotic Narrat...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Denis Diderot</td>\n",
" <td>_Les Bijoux Indiscrets._\\n\\nOR,\\n\\nThe Indiscr...</td>\n",
" <td>[The Project Gutenberg EBook of Les Bijoux Ind...</td>\n",
" <td>English</td>\n",
" <td>Les Bijoux Indiscrets, or, The Indiscreet Toys</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Anonymous</td>\n",
" <td>THE LADIES DELIGHT.\\n\\nCONTAINING,\\n\\nI. An Ad...</td>\n",
" <td>[The Project Gutenberg EBook of The Ladies Del...</td>\n",
" <td>English</td>\n",
" <td>The Ladies Delight</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Havelock Ellis</td>\n",
" <td>VOLUME 5 (OF 6)***\\n\\nE-text prepared by Julie...</td>\n",
" <td>[The Project Gutenberg eBook, Studies in the P...</td>\n",
" <td>English</td>\n",
" <td>Studies in the Psychology of Sex, Volume 5 (of 6)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>J. K. Huysmans</td>\n",
" <td>LA-BAS\\n\\n(DOWN THERE)\\n\\nby J.K. HUYSMANS\\n\\n...</td>\n",
" <td>[The Project Gutenberg EBook of La-bas, by J. ...</td>\n",
" <td>English</td>\n",
" <td>La-bas</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Havelock Ellis</td>\n",
" <td>VOLUME 5 (OF 6)***\\n\\nE-text prepared by Julie...</td>\n",
" <td>[The Project Gutenberg eBook, Studies in the P...</td>\n",
" <td>English</td>\n",
" <td>Studies in the Psychology of Sex, Volume 5 (of 6)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Friedrich Karl Forberg</td>\n",
" <td>MANUAL\\n\\nOF\\n\\nClassical Erotology\\n\\n(De fig...</td>\n",
" <td>[The Project Gutenberg EBook of Manual of Cla...</td>\n",
" <td>English</td>\n",
" <td>Manual of Classical Erotology (De figuris Vene...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Anonymous</td>\n",
" <td>[Transcriber's note: Anonymous, _The power of ...</td>\n",
" <td>[The Project Gutenberg EBook of The Power of M...</td>\n",
" <td>English</td>\n",
" <td>The Power of Mesmerism, A Highly Erotic Narrat...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Georg Brandes</td>\n",
" <td>MAIN CURRENTS IN NINETEEN CENTURY LITERATURE\\n...</td>\n",
" <td>[The Project Gutenberg EBook of Main Currents...</td>\n",
" <td>English</td>\n",
" <td>Main Currents in Nineteenth Century Literature...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Anonymous</td>\n",
" <td>The Romance of Lust\\n\\n(1873)\\n\\nA classic Vic...</td>\n",
" <td>[, The Project Gutenberg EBook of The Romance ...</td>\n",
" <td>English</td>\n",
" <td>The Romance of Lust A classic Victorian erotic...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>L. Brovan</td>\n",
" <td>Two Hundred and fifty Copies of this Work have...</td>\n",
" <td>[The Project Gutenberg EBook of Anthologica R...</td>\n",
" <td>English</td>\n",
" <td>Anthologica Rarissima: The Way of a Virgin</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>Various</td>\n",
" <td>[Transcriber's Note: The following was proofre...</td>\n",
" <td>[The Project Gutenberg EBook of The Fifteen Co...</td>\n",
" <td>English</td>\n",
" <td>The Fifteen Comforts of Matrimony: Responses f...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>Kate Percival</td>\n",
" <td>The Life and Amours\\n\\nOF THE\\n\\nBeautiful, Ga...</td>\n",
" <td>[The Project Gutenberg EBook of The Life and A...</td>\n",
" <td>English</td>\n",
" <td>The Life and Amours of the Beautiful, Gay and ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>Havelock Ellis</td>\n",
" <td>VOLUME 1 (OF 6)***\\n\\nE-text prepared by Julie...</td>\n",
" <td>[The Project Gutenberg eBook, Studies in the P...</td>\n",
" <td>English</td>\n",
" <td>Studies in the Psychology of Sex, Volume 1 (of 6)</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" author content \\\n",
"0 Albert Mordell THE EROTIC MOTIVE IN LITERATURE\\n\\nTHE EROTIC ... \n",
"1 Anonymous [Transcriber's note: Anonymous, _Laura Middlet... \n",
"2 J. K. Huysmans LÀ-BAS\\n\\n(DOWN THERE)\\n\\nby J.K. HUYSMANS\\n\\n... \n",
"3 John Cleland MEMOIRS OF FANNY HILL\\n\\nBy John Cleland\\n\\n_A... \n",
"4 Havelock Ellis VOLUME 1 (OF 6)***\\n\\nE-text prepared by Julie... \n",
"5 Anonymous [Transcriber's note: Anonymous, _Forbidden fru... \n",
"6 Kate Percival The Life and Amours\\n\\nOF THE\\n\\nBeautiful, Ga... \n",
"7 Various [Transcriber's Note: The following was proofre... \n",
"8 Anonymous [Transcriber's note: Anonymous, _The power of ... \n",
"9 Denis Diderot _Les Bijoux Indiscrets._\\n\\nOR,\\n\\nThe Indiscr... \n",
"10 Anonymous THE LADIES DELIGHT.\\n\\nCONTAINING,\\n\\nI. An Ad... \n",
"11 Havelock Ellis VOLUME 5 (OF 6)***\\n\\nE-text prepared by Julie... \n",
"12 J. K. Huysmans LA-BAS\\n\\n(DOWN THERE)\\n\\nby J.K. HUYSMANS\\n\\n... \n",
"13 Havelock Ellis VOLUME 5 (OF 6)***\\n\\nE-text prepared by Julie... \n",
"14 Friedrich Karl Forberg MANUAL\\n\\nOF\\n\\nClassical Erotology\\n\\n(De fig... \n",
"15 Anonymous [Transcriber's note: Anonymous, _The power of ... \n",
"16 Georg Brandes MAIN CURRENTS IN NINETEEN CENTURY LITERATURE\\n... \n",
"17 Anonymous The Romance of Lust\\n\\n(1873)\\n\\nA classic Vic... \n",
"18 L. Brovan Two Hundred and fifty Copies of this Work have... \n",
"19 Various [Transcriber's Note: The following was proofre... \n",
"20 Kate Percival The Life and Amours\\n\\nOF THE\\n\\nBeautiful, Ga... \n",
"21 Havelock Ellis VOLUME 1 (OF 6)***\\n\\nE-text prepared by Julie... \n",
"\n",
" extra language \\\n",
"0 [Project Gutenberg's The Erotic Motive in Lit... English \n",
"1 [Project Gutenberg's Laura Middleton; Her Brot... English \n",
"2 [The Project Gutenberg EBook of Là-bas, by J. ... English \n",
"3 [The Project Gutenberg EBook of Memoirs Of Fan... English \n",
"4 [The Project Gutenberg eBook, Studies in the P... English \n",
"5 [The Project Gutenberg EBook of Forbidden Frui... English \n",
"6 [The Project Gutenberg EBook of The Life and A... English \n",
"7 [The Project Gutenberg EBook of The Fifteen Co... English \n",
"8 [The Project Gutenberg EBook of The Power of M... English \n",
"9 [The Project Gutenberg EBook of Les Bijoux Ind... English \n",
"10 [The Project Gutenberg EBook of The Ladies Del... English \n",
"11 [The Project Gutenberg eBook, Studies in the P... English \n",
"12 [The Project Gutenberg EBook of La-bas, by J. ... English \n",
"13 [The Project Gutenberg eBook, Studies in the P... English \n",
"14 [The Project Gutenberg EBook of Manual of Cla... English \n",
"15 [The Project Gutenberg EBook of The Power of M... English \n",
"16 [The Project Gutenberg EBook of Main Currents... English \n",
"17 [, The Project Gutenberg EBook of The Romance ... English \n",
"18 [The Project Gutenberg EBook of Anthologica R... English \n",
"19 [The Project Gutenberg EBook of The Fifteen Co... English \n",
"20 [The Project Gutenberg EBook of The Life and A... English \n",
"21 [The Project Gutenberg eBook, Studies in the P... English \n",
"\n",
" title \n",
"0 The Erotic Motive in Literature \n",
"1 Laura Middleton; Her Brother and her Lover \n",
"2 Là-bas \n",
"3 Memoirs Of Fanny Hill, A New and Genuine Editi... \n",
"4 Studies in the Psychology of Sex, Volume 1 (of 6) \n",
"5 Forbidden Fruit, Luscious and exciting story a... \n",
"6 The Life and Amours of the Beautiful, Gay and ... \n",
"7 The Fifteen Comforts of Matrimony: Responses F... \n",
"8 The Power of Mesmerism, A Highly Erotic Narrat... \n",
"9 Les Bijoux Indiscrets, or, The Indiscreet Toys \n",
"10 The Ladies Delight \n",
"11 Studies in the Psychology of Sex, Volume 5 (of 6) \n",
"12 La-bas \n",
"13 Studies in the Psychology of Sex, Volume 5 (of 6) \n",
"14 Manual of Classical Erotology (De figuris Vene... \n",
"15 The Power of Mesmerism, A Highly Erotic Narrat... \n",
"16 Main Currents in Nineteenth Century Literature... \n",
"17 The Romance of Lust A classic Victorian erotic... \n",
"18 Anthologica Rarissima: The Way of a Virgin \n",
"19 The Fifteen Comforts of Matrimony: Responses f... \n",
"20 The Life and Amours of the Beautiful, Gay and ... \n",
"21 Studies in the Psychology of Sex, Volume 1 (of 6) "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"import uuid\n", "import uuid\n",
"import pandas as pd\n", "import pandas as pd\n",
@@ -315,33 +662,18 @@
"for infile in os.listdir(dest_dir):\n", "for infile in os.listdir(dest_dir):\n",
" path = os.path.join(dest_dir, infile)\n", " path = os.path.join(dest_dir, infile)\n",
" info = json.load(open(path))\n", " info = json.load(open(path))\n",
" paragraphs = info['content'].split('\\n\\n')\n", " data.append(info)\n",
" for paragraph in paragraphs:\n",
"# sentances = [p for p in paragraph.strip().split('. ')]\n",
" sentances = nltk.sent_tokenize(paragraph)\n",
" if len(sentances)>num_sent:\n",
" for i in range(len(sentances)//num_sent):\n",
" data.append(dict(\n",
" storyid=uuid.uuid4().hex,\n",
" sentence1=sentances[i*5+0][:max_len],\n",
" sentence2=sentances[i*5+1][:max_len],\n",
" sentence3=sentances[i*5+2][:max_len],\n",
" sentence4=sentances[i*5+3][:max_len],\n",
" sentence5=sentances[i*5+4][:max_len],\n",
" AnswerRightEnding=1\n",
" ))\n",
"df = pd.DataFrame(data)\n", "df = pd.DataFrame(data)\n",
"df = df[['storyid', 'sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5', 'AnswerRightEnding']]\n",
"df" "df"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 6,
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
"end_time": "2018-11-04T05:19:03.839834Z", "end_time": "2018-11-04T08:55:59.624853Z",
"start_time": "2018-11-04T05:19:03.835656Z" "start_time": "2018-11-04T08:55:59.621367Z"
} }
}, },
"outputs": [], "outputs": [],
@@ -359,91 +691,32 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 11,
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
"end_time": "2018-11-04T05:19:09.318413Z", "end_time": "2018-11-04T08:56:56.969876Z",
"start_time": "2018-11-04T05:19:08.980242Z" "start_time": "2018-11-04T08:56:56.965223Z"
} }
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"%matplotlib inline\n", "df = df.rename(columns=dict(content='TEXT'))"
"df['sentence1'].str.len().plot.hist(bins=55)\n",
"df['sentence1'].str.len().max()"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 12,
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
"end_time": "2018-11-04T05:19:14.924306Z", "end_time": "2018-11-04T08:56:58.359405Z",
"start_time": "2018-11-04T05:19:14.917985Z" "start_time": "2018-11-04T08:56:58.068470Z"
} }
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"val_idx = int(len(df)*0.7)\n", "df.to_csv('data/erotic_gutenberg_dataset.csv', index=False)"
"df_train = df[:val_idx]\n",
"df_val = df[val_idx:]"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-04T05:19:15.503886Z",
"start_time": "2018-11-04T05:19:15.384466Z"
}
},
"outputs": [],
"source": [
"df_train.to_csv('data/erotic_gutenberg_TRAIN.csv', index=False)\n",
"df_val.to_csv('data/erotic_gutenberg_VAL.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-04T05:19:16.452211Z",
"start_time": "2018-11-04T05:19:16.281536Z"
}
},
"outputs": [],
"source": [
"# import csv\n",
"# def _rocstories(path):\n",
"# with open(path, encoding='utf_8') as f:\n",
"# f = csv.reader(f)\n",
"# st = []\n",
"# ct1 = []\n",
"# ct2 = []\n",
"# y = []\n",
"# for i, line in enumerate(tqdm(list(f), ncols=80, leave=False)):\n",
"# if i > 0:\n",
"# s = ' '.join(line[1:5])\n",
"# c1 = line[5]\n",
"# c2 = line[6]\n",
"# st.append(s)\n",
"# ct1.append(c1)\n",
"# ct2.append(c2)\n",
"# y.append(int(line[-1])-1)\n",
"# return st, ct1, ct2, y\n",
" \n",
"# _rocstories('data/erotic_gutenberg_TRAIN.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
+25 -27
View File
@@ -1,32 +1,5 @@
import torch import torch
class LMLossCompute:
    """Loss compute and train function for language modeling tasks.

    Calling an instance computes, for every sequence, the masked mean of the
    per-token losses returned by ``lm_criterion``. Unless
    ``only_return_losses`` is set, it then back-propagates the sum of those
    means and, when an optimizer was supplied, performs one optimizer step.
    """

    def __init__(self, lm_criterion, opt=None):
        """Store the per-token criterion and the optional optimizer.

        Args:
            lm_criterion: Callable ``(lm_logits, targets)`` that must return
                one loss value per target, because the result is reshaped to
                one row per sequence below.
            opt: Optional object exposing ``step()`` and ``zero_grad()``.
        """
        self.lm_criterion = lm_criterion
        self.opt = opt

    def __call__(self, X, Y, M, lm_logits, only_return_losses=False):
        """Compute the language modeling loss and optionally train on it.

        Args:
            X: 4-D tensor indexed as ``X[:, :, 1:, 0]``; presumably
                (batch, choices, positions, channels) with token ids in
                channel 0 -- TODO confirm against the caller.
            Y: Unused; kept so the signature matches the other loss classes.
            M: Mask whose last dimension lines up with the positions of ``X``.
                Assumed to hold 0/1 values -- TODO confirm.
            lm_logits: Logits passed straight to ``lm_criterion``.
            only_return_losses: When true, return the per-sequence losses and
                skip the backward pass and the optimizer step.

        Returns:
            The per-sequence loss tensor when ``only_return_losses`` is true,
            otherwise the summed loss as a Python float.

        Raises:
            ValueError: If ``lm_logits`` is None.
        """
        if lm_logits is None:
            # Without logits there is nothing to score; fail with a clear
            # message instead of an unbound-variable error further down.
            raise ValueError("lm_logits is required to compute the language modeling loss.")

        # Targets are the inputs with the first position dropped.
        x_shifted = X[:, :, 1:, 0].contiguous().view(-1)
        M = M.view(-1, M.size(2))
        lm_losses = self.lm_criterion(lm_logits, x_shifted)
        lm_losses = lm_losses.view(X.size(0) * X.size(1), X.size(2) - 1)
        target_mask = M[:, 1:]
        lm_losses = lm_losses * target_mask
        # A sequence with no unmasked target would otherwise give 0 / 0 = NaN
        # and poison the summed loss and every gradient. For a 0/1 mask the
        # clamp leaves all other sequences untouched.
        n_targets = torch.sum(target_mask, 1).clamp(min=1)
        lm_losses = lm_losses.sum(1) / n_targets

        if only_return_losses:
            return lm_losses

        train_loss = lm_losses.sum()
        train_loss.backward()
        if self.opt is not None:
            self.opt.step()
            self.opt.zero_grad()
        return train_loss.item()
class MultipleChoiceLossCompute: class MultipleChoiceLossCompute:
"A Loss compute and train function for multiple choice tasks." "A Loss compute and train function for multiple choice tasks."
@@ -93,4 +66,29 @@ class ClassificationLossCompute:
self.opt.zero_grad() self.opt.zero_grad()
return train_loss.item() return train_loss.item()
class LanguageModelingLossCompute:
    """Loss compute and train function for language modeling tasks.

    Calling an instance computes, for every sequence, the masked mean of the
    per-token losses returned by ``lm_criterion``. Unless
    ``only_return_losses`` is set, it then back-propagates the sum of those
    means and, when an optimizer was supplied, performs one optimizer step.
    """

    def __init__(self, lm_criterion, opt=None):
        """Store the per-token criterion and the optional optimizer.

        Args:
            lm_criterion: Callable ``(lm_logits, targets)`` that must return
                one loss value per target, because the result is reshaped to
                one row per sequence below.
            opt: Optional object exposing ``step()`` and ``zero_grad()``.
        """
        self.lm_criterion = lm_criterion
        self.opt = opt

    def __call__(self, X, Y, M, lm_logits, only_return_losses=False):
        """Compute the language modeling loss and optionally train on it.

        Args:
            X: 3-D tensor indexed as ``X[:, 1:, 0]``; presumably
                (batch, positions, channels) with token ids in channel 0 --
                TODO confirm against the caller.
            Y: Unused; kept so the signature matches the other loss classes.
            M: Mask whose last dimension lines up with the positions of ``X``.
                Assumed to hold 0/1 values -- TODO confirm.
            lm_logits: Logits passed straight to ``lm_criterion``.
            only_return_losses: When true, return the per-sequence losses and
                skip the backward pass and the optimizer step.

        Returns:
            The per-sequence loss tensor when ``only_return_losses`` is true,
            otherwise the summed loss as a Python float.
        """
        # Targets are the inputs with the first position dropped.
        x_shifted = X[:, 1:, 0].contiguous().view(-1)
        M = M.view(-1, M.size(-1))
        lm_losses = self.lm_criterion(lm_logits, x_shifted)
        lm_losses = lm_losses.view(X.size(0), X.size(-2) - 1)
        target_mask = M[:, 1:]
        lm_losses = lm_losses * target_mask
        # A sequence with no unmasked target would otherwise give 0 / 0 = NaN
        # and poison the summed loss and every gradient. For a 0/1 mask the
        # clamp leaves all other sequences untouched.
        n_targets = torch.sum(target_mask, 1).clamp(min=1)
        lm_losses = lm_losses.sum(1) / n_targets

        if only_return_losses:
            return lm_losses

        train_loss = lm_losses.sum()
        train_loss.backward()
        if self.opt is not None:
            self.opt.step()
            self.opt.zero_grad()
        return train_loss.item()
# TODO Implement a LossCompute class for similarity tasks. # TODO Implement a LossCompute class for similarity tasks.
+14 -2
View File
@@ -279,8 +279,8 @@ class DoubleHeadModel(nn.Module):
# the three classes correspond to entailment, contradiction and neutral. # the three classes correspond to entailment, contradiction and neutral.
self.task_head = ClfHead(clf_token, cfg, 3) self.task_head = ClfHead(clf_token, cfg, 3)
else: else:
raise ValueError("task_head_type is expected to be 'multiple_choice' "+ raise ValueError("task_head_type is expected to be 'multiple_choice' "
"'similarity', 'inference' or ('classification', n_class) "+ "'similarity', 'inference' or ('classification', n_class) "
"got {task_head_type}.".format(task_head_type=task_head_type)) "got {task_head_type}.".format(task_head_type=task_head_type))
elif isinstance(task_head_type, collections.abc.Sequence) and len(task_head_type) == 2 and \ elif isinstance(task_head_type, collections.abc.Sequence) and len(task_head_type) == 2 and \
task_head_type[0] == 'classification': task_head_type[0] == 'classification':
@@ -298,6 +298,18 @@ class DoubleHeadModel(nn.Module):
return lm_logits, task_logits return lm_logits, task_logits
class LanguageModel(nn.Module):
    """Transformer body topped with a language modeling head."""

    def __init__(self, cfg, vocab=40990, n_ctx=512):
        super().__init__()
        body = TransformerModel(cfg, vocab=vocab, n_ctx=n_ctx)
        self.transformer = body
        # The head is constructed from the transformer instance itself.
        self.lm_head = LMHead(body, cfg)

    def forward(self, x):
        """Run ``x`` through the transformer, then the LM head, and return its output."""
        return self.lm_head(self.transformer(x))
def load_openai_pretrained_model(model, n_ctx=-1, n_special=-1, n_transfer=12, n_embd=768, path='./model/', def load_openai_pretrained_model(model, n_ctx=-1, n_special=-1, n_transfer=12, n_embd=768, path='./model/',
path_names='./'): path_names='./'):
+2 -2
View File
@@ -3,7 +3,7 @@ import ftfy
import json import json
import spacy import spacy
from tqdm import tqdm from tqdm import tqdm_notebook as tqdm
def get_pairs(word): def get_pairs(word):
""" """
@@ -92,7 +92,7 @@ class TextEncoder(object):
def encode(self, texts, verbose=True): def encode(self, texts, verbose=True):
texts_tokens = [] texts_tokens = []
if verbose: if verbose:
for text in tqdm(texts, ncols=80, leave=False): for text in tqdm(texts, ncols=80, mininterval=10, leave=False):
text = self.nlp(text_standardize(ftfy.fix_text(text))) text = self.nlp(text_standardize(ftfy.fix_text(text)))
text_tokens = [] text_tokens = []
for token in text: for token in text:
+539 -1212
View File
File diff suppressed because it is too large Load Diff
+2 -4
View File
@@ -4,9 +4,7 @@ import json
import time import time
from functools import partial from functools import partial
import numpy as np import numpy as np
# import tensorflow as tf from tqdm import tqdm_notebook as tqdm
# from tensorflow.python.framework import function
from tqdm import tqdm
def encode_dataset(*splits, encoder): def encode_dataset(*splits, encoder):
encoded_splits = [] encoded_splits = []
@@ -92,7 +90,7 @@ def iter_data(*datas, n_batch=128, truncate=False, verbose=False, max_batches=fl
f = sys.stderr f = sys.stderr
else: else:
f = open(os.devnull, 'w') f = open(os.devnull, 'w')
for i in tqdm(range(0, n, n_batch), total=n//n_batch, file=f, ncols=80, leave=False): for i in tqdm(range(0, n, n_batch), total=n//n_batch, mininterval=10, file=f, ncols=80, leave=False):
if n_batches >= max_batches: raise StopIteration if n_batches >= max_batches: raise StopIteration
if len(datas) == 1: if len(datas) == 1:
yield datas[0][i:i+n_batch] yield datas[0][i:i+n_batch]