mirror of
https://github.com/wassname/mammoviews.git
synced 2026-06-26 16:00:43 +08:00
764 lines
22 KiB
Plaintext
764 lines
22 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/Applications/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
|
|
" return f(*args, **kwds)\n",
|
|
"/Applications/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
|
|
" return f(*args, **kwds)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"import os"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Read labels and the `digital` flag"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"tabledir = \"../tables/\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(772423, 1)"
|
|
]
|
|
},
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"fn = f\"{tabledir}/2017-06-mammo_tables/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl.csv.gz\"\n",
|
|
"df_bt = pd.read_csv(fn, usecols=[\"id\", \"BT_case\"])\n",
|
|
"df_bt.set_index(\"id\", inplace=True)\n",
|
|
"df_bt = ~df_bt.isnull()\n",
|
|
"df_bt.columns = [\"digital\"]\n",
|
|
"df_bt.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style>\n",
|
|
" .dataframe thead tr:only-child th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: left;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>set</th>\n",
|
|
" <th>label</th>\n",
|
|
" <th>view</th>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>id</th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>1013372709_1.2.840.113654.2.70.1.175625299786291545159233542096043464711_3_1</th>\n",
|
|
" <td>test</td>\n",
|
|
" <td>normal</td>\n",
|
|
" <td>N</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1028995243_1.2.840.113654.2.70.1.56947963181878834591544466761404805157_45576_2</th>\n",
|
|
" <td>test</td>\n",
|
|
" <td>normal</td>\n",
|
|
" <td>N</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1105112884_1.2.840.113654.2.70.1.178729598744204462442695104630823323474_8905_2</th>\n",
|
|
" <td>test</td>\n",
|
|
" <td>normal</td>\n",
|
|
" <td>N</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1185125156_1.2.840.113654.2.70.1.45840593750642722243371816041014016032_2_4</th>\n",
|
|
" <td>test</td>\n",
|
|
" <td>normal</td>\n",
|
|
" <td>N</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1496452586_1.2.840.113654.2.70.1.5582568668770891599992528318631583880_1351_4</th>\n",
|
|
" <td>test</td>\n",
|
|
" <td>normal</td>\n",
|
|
" <td>N</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" set label view\n",
|
|
"id \n",
|
|
"1013372709_1.2.840.113654.2.70.1.17562529978629... test normal N\n",
|
|
"1028995243_1.2.840.113654.2.70.1.56947963181878... test normal N\n",
|
|
"1105112884_1.2.840.113654.2.70.1.17872959874420... test normal N\n",
|
|
"1185125156_1.2.840.113654.2.70.1.45840593750642... test normal N\n",
|
|
"1496452586_1.2.840.113654.2.70.1.55825686687708... test normal N"
|
|
]
|
|
},
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"infile = f\"{tabledir}/spotmag_predictions/train_test_split-2018-02-16-within7e5-label.csv\"\n",
|
|
"dflab = pd.read_csv(infile, index_col='id')\n",
|
|
"dflab[:5]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Read header-based predictions"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(772367, 1)"
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"infile = f\"{tabledir}/spotmag_predictions/all_predictions_glmnet.tab\"\n",
|
|
"dfpred_glmnet = pd.read_table(infile, index_col=0)\n",
|
|
"dfpred_glmnet.columns = [cc.replace(\"predictions\", \"score\") for cc in dfpred_glmnet.columns]\n",
|
|
"dfpred_glmnet.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"(772367, 5)\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style>\n",
|
|
" .dataframe thead tr:only-child th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: left;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>score_gbm</th>\n",
|
|
" <th>score_xgb</th>\n",
|
|
" <th>score_rpart</th>\n",
|
|
" <th>score_xgbt</th>\n",
|
|
" <th>ViewModifierCodeMeaning</th>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>id</th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149405_2104556</th>\n",
|
|
" <td>0.009005</td>\n",
|
|
" <td>0.020207</td>\n",
|
|
" <td>0.006882</td>\n",
|
|
" <td>0.059474</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149405_2104557</th>\n",
|
|
" <td>0.013337</td>\n",
|
|
" <td>0.016762</td>\n",
|
|
" <td>0.006882</td>\n",
|
|
" <td>0.059660</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149484_2141538</th>\n",
|
|
" <td>0.013337</td>\n",
|
|
" <td>0.016762</td>\n",
|
|
" <td>0.006882</td>\n",
|
|
" <td>0.061051</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149484_2141537</th>\n",
|
|
" <td>0.013337</td>\n",
|
|
" <td>0.016762</td>\n",
|
|
" <td>0.006882</td>\n",
|
|
" <td>0.061051</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3337971863_1.2.840.113654.2.70.1.337982194343327746313656933304494759333_1_1</th>\n",
|
|
" <td>0.031560</td>\n",
|
|
" <td>0.059142</td>\n",
|
|
" <td>0.006882</td>\n",
|
|
" <td>0.157488</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" score_gbm score_xgb \\\n",
|
|
"id \n",
|
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.009005 0.020207 \n",
|
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.013337 0.016762 \n",
|
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.013337 0.016762 \n",
|
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.013337 0.016762 \n",
|
|
"3337971863_1.2.840.113654.2.70.1.33798219434332... 0.031560 0.059142 \n",
|
|
"\n",
|
|
" score_rpart score_xgbt \\\n",
|
|
"id \n",
|
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.006882 0.059474 \n",
|
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.006882 0.059660 \n",
|
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.006882 0.061051 \n",
|
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.006882 0.061051 \n",
|
|
"3337971863_1.2.840.113654.2.70.1.33798219434332... 0.006882 0.157488 \n",
|
|
"\n",
|
|
" ViewModifierCodeMeaning \n",
|
|
"id \n",
|
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... NaN \n",
|
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... NaN \n",
|
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... NaN \n",
|
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... NaN \n",
|
|
"3337971863_1.2.840.113654.2.70.1.33798219434332... NaN "
|
|
]
|
|
},
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"infile = f\"{tabledir}/spotmag_predictions/all_predictions_allmodels_trained_on_train.tab\"\n",
|
|
"dfpred = pd.read_table(infile, index_col=0)\n",
|
|
"dfpred.columns = [cc.replace(\"predictions\", \"score\") for cc in dfpred.columns]\n",
|
|
"dfpred.index.name = 'id'\n",
|
|
"print(dfpred.shape)\n",
|
|
"dfpred[:5]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"(772367, 8)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"if 'set' not in dfpred.columns:\n",
|
|
" dfpred = dfpred.merge(dflab, left_index=True, right_index=True, how='left')\n",
|
|
" print(dfpred.shape)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"normal 3526\n",
|
|
"magn/spot 572\n",
|
|
"wire loc 57\n",
|
|
"stereotactic 25\n",
|
|
"other 9\n",
|
|
"Name: view, dtype: int64"
|
|
]
|
|
},
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"colmap = {\"N\":\"normal\", \"M\": \"magn/spot\",\n",
|
|
" \"T\":\"stereotactic\", \"W\":\"wire loc\", \"X\":\"other\"}\n",
|
|
"view_counts = dfpred[~dfpred.view.isnull()].view.map(lambda x: colmap[x]).value_counts()\n",
|
|
"view_counts"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style>\n",
|
|
" .dataframe thead tr:only-child th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: left;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th>set</th>\n",
|
|
" <th>train</th>\n",
|
|
" <th>test</th>\n",
|
|
" <th>val</th>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>view</th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>magn/spot</th>\n",
|
|
" <td>380</td>\n",
|
|
" <td>96</td>\n",
|
|
" <td>96</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>normal</th>\n",
|
|
" <td>2310</td>\n",
|
|
" <td>612</td>\n",
|
|
" <td>604</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>other</th>\n",
|
|
" <td>4</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>2</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>stereotactic</th>\n",
|
|
" <td>17</td>\n",
|
|
" <td>4</td>\n",
|
|
" <td>4</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>wire loc</th>\n",
|
|
" <td>37</td>\n",
|
|
" <td>11</td>\n",
|
|
" <td>9</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
"set train test val\n",
|
|
"view \n",
|
|
"magn/spot 380 96 96\n",
|
|
"normal 2310 612 604\n",
|
|
"other 4 3 2\n",
|
|
"stereotactic 17 4 4\n",
|
|
"wire loc 37 11 9"
|
|
]
|
|
},
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"pd.crosstab(dfpred[~dfpred.view.isnull()].view.map(lambda x: colmap[x]), dfpred.set)[[\"train\", \"test\", \"val\"]]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Read image-based predictions (general)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"../tables//spotmag_predictions/predictions_images_4189-epoch55-e5ce2d69b035975cb5336cec0da9a32a.csv\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"Index(['score_image', 'score_image_max'], dtype='object')"
|
|
]
|
|
},
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"tag = \"e5ce2d69b035975cb5336cec0da9a32a\"\n",
|
|
"epoch = 55\n",
|
|
"infile = f\"{tabledir}/spotmag_predictions/predictions_images_4189-epoch{epoch}-{tag}.csv\"\n",
|
|
"# infile = f\"{tabledir}/spotmag_predictions/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl-spotmag_img_prediction-{tag}.csv\"\n",
|
|
"print(infile)\n",
|
|
"dfpred_img = pd.read_csv(infile, index_col=0)\n",
|
|
"dfpred_img = dfpred_img[['score_image', 'score_image_max']]\n",
|
|
"dfpred_img = dfpred_img.groupby(level=0).mean()\n",
|
|
"dfpred_img.columns"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Read image-based predictions (wire localization)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"infile = f\"{tabledir}/spotmag_predictions/predictions_wire_combined_e8e71fc090141d7c6fb334359152d295.csv\"\n",
|
|
"\n",
|
|
"dfpred_imgwire = pd.read_csv(infile, index_col=0)\n",
|
|
"dfpred_imgwire[\"score_wire_max\"] = 1-dfpred_imgwire[[\"scores_0_or\",\"scores_0_fl\"]].min(1)\n",
|
|
"dfpred_imgwire = dfpred_imgwire.drop([\"scores_0_or\",\"scores_0_fl\", \"label\"], axis=1)\n",
|
|
"dfpred_imgwire.columns = [cc.replace(\"scores\", \"score_wire\") for cc in dfpred_imgwire.columns]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(772367, 13)"
|
|
]
|
|
},
|
|
"execution_count": 16,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"if 'score_image' not in dfpred.columns:\n",
|
|
" dfpred = pd.concat([dfpred, dfpred_img], axis=1)\n",
|
|
" dfpred.index.name = 'id'\n",
|
|
" del dfpred_img\n",
|
|
" \n",
|
|
"if 'score_glmnet' not in dfpred.columns:\n",
|
|
" dfpred = pd.concat([dfpred, dfpred_glmnet], axis=1)\n",
|
|
" dfpred.index.name = 'id'\n",
|
|
" del dfpred_glmnet\n",
|
|
" \n",
|
|
"if 'score_wire' not in dfpred.columns:\n",
|
|
" dfpred = pd.concat([dfpred, dfpred_imgwire], axis=1)\n",
|
|
" dfpred.index.name = 'id'\n",
|
|
" del dfpred_imgwire\n",
|
|
"\n",
|
|
"dfpred.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"if 'label' not in dfpred.columns:\n",
|
|
" dfpred = pd.concat([dfpred, dflab], axis=1)\n",
|
|
"if 'digital' not in dfpred.columns:\n",
|
|
" dfpred = pd.concat([dfpred, df_bt], axis=1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style>\n",
|
|
" .dataframe thead tr:only-child th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: left;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th>score_image</th>\n",
|
|
" <th>False</th>\n",
|
|
" <th>True</th>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>score_wire</th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>False</th>\n",
|
|
" <td>3584</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>True</th>\n",
|
|
" <td>605</td>\n",
|
|
" <td>768234</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
"score_image False True \n",
|
|
"score_wire \n",
|
|
"False 3584 0\n",
|
|
"True 605 768234"
|
|
]
|
|
},
|
|
"execution_count": 18,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"pd.crosstab(dfpred[\"score_wire\"].isnull(), dfpred[\"score_image\"].isnull())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"dfpred.rename(columns={\"score_xgbt\":\"score_gbmt\"}, inplace=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Add ensembled (max, arithmetic-mean, geometric-mean) scores"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 24,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"dfpred['score_wire'] = dfpred['score_wire'].fillna(0)\n",
|
|
"dfpred['score_wire_max'] = dfpred['score_wire_max'].fillna(0)\n",
|
|
"dfpred['score_image+glmnet'] = (dfpred['score_image'] + dfpred['score_glmnet'])/2\n",
|
|
"dfpred['score_image+gbmt'] = (dfpred['score_image'] + dfpred['score_gbmt'])/2\n",
|
|
"\n",
|
|
"dfpred['score_max(image;gbmt)'] = dfpred[['score_image','score_gbmt']].max(1)\n",
|
|
"\n",
|
|
"dfpred['score_image*glmnet'] = np.sqrt(dfpred['score_image'] * dfpred['score_glmnet'])\n",
|
|
"dfpred['score_image*gbmt'] = np.sqrt(dfpred['score_image'] * dfpred['score_gbmt'])\n",
|
|
"dfpred['score_max_image_wire'] = np.nanmax(dfpred[['score_image','score_wire']].values, axis=1)\n",
|
|
"dfpred['score_max_image_wire_max'] = np.nanmax(dfpred[['score_image','score_wire_max']].values, axis=1)\n",
|
|
"# dfpred['score_wire'].isnull()\n",
|
|
"dfpred['score_max_image_wire+gbmt'] =(dfpred['score_max_image_wire'] + dfpred['score_gbmt'])/2\n",
|
|
"\n",
|
|
"dfpred['score_max_image_wire_max+gbmt'] =(dfpred['score_max_image_wire_max'] + dfpred['score_gbmt'])/2\n",
|
|
"\n",
|
|
"dfpred['score_max(image;wire_max;gbmt)'] = dfpred[['score_wire_max','score_gbmt', 'score_image']].max(1)\n",
|
|
"\n",
|
|
"dfpred['score_max_wire_image+gbmt'] = np.nanmax(dfpred[['score_image+gbmt','score_wire']].values, axis=1)\n",
|
|
"\n",
|
|
"dfpred['score_max_wire_max_image+gbmt'] = np.nanmax(dfpred[['score_image+gbmt','score_wire_max']].values, axis=1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 25,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"dfpred.rename(columns={\"ViewModifierCodeMeaning\":\"ViewModifier\"}, inplace=True)\n",
|
|
"dfpred.index.name = 'id'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Save the combined table"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 23,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"772423"
|
|
]
|
|
},
|
|
"execution_count": 23,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"len(dfpred)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 26,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"dfpred.to_csv(f'{tabledir}/all_predictions_with_images-{tag}.tab', sep='\\t')"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python [default]",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.6.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|