{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Applications/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
" return f(*args, **kwds)\n",
"/Applications/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
" return f(*args, **kwds)\n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read labels"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Root directory of the project tables. No trailing slash: every path below is built as\n",
"# f\"{tabledir}/...\", so a trailing slash here would produce a doubled separator.\n",
"tabledir = \"../tables\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(772423, 1)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Only the id and BT_case columns of the source table are needed here.\n",
"fn = f\"{tabledir}/2017-06-mammo_tables/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl.csv.gz\"\n",
"df_bt = (\n",
"    pd.read_csv(fn, usecols=[\"id\", \"BT_case\"])\n",
"    .set_index(\"id\")\n",
"    .notnull()  # True where BT_case has a value\n",
"    .rename(columns={\"BT_case\": \"digital\"})\n",
")\n",
"df_bt.shape"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" set | \n",
" label | \n",
" view | \n",
"
\n",
" \n",
" | id | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 1013372709_1.2.840.113654.2.70.1.175625299786291545159233542096043464711_3_1 | \n",
" test | \n",
" normal | \n",
" N | \n",
"
\n",
" \n",
" | 1028995243_1.2.840.113654.2.70.1.56947963181878834591544466761404805157_45576_2 | \n",
" test | \n",
" normal | \n",
" N | \n",
"
\n",
" \n",
" | 1105112884_1.2.840.113654.2.70.1.178729598744204462442695104630823323474_8905_2 | \n",
" test | \n",
" normal | \n",
" N | \n",
"
\n",
" \n",
" | 1185125156_1.2.840.113654.2.70.1.45840593750642722243371816041014016032_2_4 | \n",
" test | \n",
" normal | \n",
" N | \n",
"
\n",
" \n",
" | 1496452586_1.2.840.113654.2.70.1.5582568668770891599992528318631583880_1351_4 | \n",
" test | \n",
" normal | \n",
" N | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" set label view\n",
"id \n",
"1013372709_1.2.840.113654.2.70.1.17562529978629... test normal N\n",
"1028995243_1.2.840.113654.2.70.1.56947963181878... test normal N\n",
"1105112884_1.2.840.113654.2.70.1.17872959874420... test normal N\n",
"1185125156_1.2.840.113654.2.70.1.45840593750642... test normal N\n",
"1496452586_1.2.840.113654.2.70.1.55825686687708... test normal N"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Table with the set / label / view columns, indexed by id.\n",
"infile = f\"{tabledir}/spotmag_predictions/train_test_split-2018-02-16-within7e5-label.csv\"\n",
"dflab = pd.read_csv(infile).set_index('id')\n",
"dflab.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read header-based predictions"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(772367, 1)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"infile = f\"{tabledir}/spotmag_predictions/all_predictions_glmnet.tab\"\n",
"# Column names use the 'score' prefix instead of 'predictions' from here on.\n",
"dfpred_glmnet = pd.read_table(infile, index_col=0).rename(\n",
"    columns=lambda name: name.replace(\"predictions\", \"score\")\n",
")\n",
"dfpred_glmnet.shape"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(772367, 5)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" score_gbm | \n",
" score_xgb | \n",
" score_rpart | \n",
" score_xgbt | \n",
" ViewModifierCodeMeaning | \n",
"
\n",
" \n",
" | id | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149405_2104556 | \n",
" 0.009005 | \n",
" 0.020207 | \n",
" 0.006882 | \n",
" 0.059474 | \n",
" NaN | \n",
"
\n",
" \n",
" | 2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149405_2104557 | \n",
" 0.013337 | \n",
" 0.016762 | \n",
" 0.006882 | \n",
" 0.059660 | \n",
" NaN | \n",
"
\n",
" \n",
" | 2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149484_2141538 | \n",
" 0.013337 | \n",
" 0.016762 | \n",
" 0.006882 | \n",
" 0.061051 | \n",
" NaN | \n",
"
\n",
" \n",
" | 2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149484_2141537 | \n",
" 0.013337 | \n",
" 0.016762 | \n",
" 0.006882 | \n",
" 0.061051 | \n",
" NaN | \n",
"
\n",
" \n",
" | 3337971863_1.2.840.113654.2.70.1.337982194343327746313656933304494759333_1_1 | \n",
" 0.031560 | \n",
" 0.059142 | \n",
" 0.006882 | \n",
" 0.157488 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" score_gbm score_xgb \\\n",
"id \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.009005 0.020207 \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.013337 0.016762 \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.013337 0.016762 \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.013337 0.016762 \n",
"3337971863_1.2.840.113654.2.70.1.33798219434332... 0.031560 0.059142 \n",
"\n",
" score_rpart score_xgbt \\\n",
"id \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.006882 0.059474 \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.006882 0.059660 \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.006882 0.061051 \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.006882 0.061051 \n",
"3337971863_1.2.840.113654.2.70.1.33798219434332... 0.006882 0.157488 \n",
"\n",
" ViewModifierCodeMeaning \n",
"id \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... NaN \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... NaN \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... NaN \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... NaN \n",
"3337971863_1.2.840.113654.2.70.1.33798219434332... NaN "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"infile = f\"{tabledir}/spotmag_predictions/all_predictions_allmodels_trained_on_train.tab\"\n",
"dfpred = pd.read_table(infile, index_col=0)\n",
"# Column names use the 'score' prefix instead of 'predictions' from here on.\n",
"dfpred = dfpred.rename(columns=lambda name: name.replace(\"predictions\", \"score\"))\n",
"dfpred.index.name = 'id'\n",
"print(dfpred.shape)\n",
"dfpred.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(772367, 8)\n"
]
}
],
"source": [
"# Attach the set / label / view columns; the guard keeps a re-run from merging them twice.\n",
"if 'set' not in dfpred.columns:\n",
"    dfpred = pd.merge(dfpred, dflab, how='left', left_index=True, right_index=True)\n",
"    print(dfpred.shape)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"normal 3526\n",
"magn/spot 572\n",
"wire loc 57\n",
"stereotactic 25\n",
"other 9\n",
"Name: view, dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Single-letter view codes -> readable names.\n",
"colmap = {\n",
"    \"N\": \"normal\",\n",
"    \"M\": \"magn/spot\",\n",
"    \"T\": \"stereotactic\",\n",
"    \"W\": \"wire loc\",\n",
"    \"X\": \"other\",\n",
"}\n",
"# Count only the rows that carry a view code.\n",
"view_counts = dfpred[\"view\"].dropna().map(lambda code: colmap[code]).value_counts()\n",
"view_counts"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | set | \n",
" train | \n",
" test | \n",
" val | \n",
"
\n",
" \n",
" | view | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | magn/spot | \n",
" 380 | \n",
" 96 | \n",
" 96 | \n",
"
\n",
" \n",
" | normal | \n",
" 2310 | \n",
" 612 | \n",
" 604 | \n",
"
\n",
" \n",
" | other | \n",
" 4 | \n",
" 3 | \n",
" 2 | \n",
"
\n",
" \n",
" | stereotactic | \n",
" 17 | \n",
" 4 | \n",
" 4 | \n",
"
\n",
" \n",
" | wire loc | \n",
" 37 | \n",
" 11 | \n",
" 9 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"set train test val\n",
"view \n",
"magn/spot 380 96 96\n",
"normal 2310 612 604\n",
"other 4 3 2\n",
"stereotactic 17 4 4\n",
"wire loc 37 11 9"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows with a view code, cross-tabulated against the set they belong to.\n",
"view_names = dfpred[\"view\"].dropna().map(lambda code: colmap[code])\n",
"split_order = [\"train\", \"test\", \"val\"]\n",
"pd.crosstab(view_names, dfpred[\"set\"])[split_order]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read image-based predictions (general)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"../tables//spotmag_predictions/predictions_images_4189-epoch55-e5ce2d69b035975cb5336cec0da9a32a.csv\n"
]
},
{
"data": {
"text/plain": [
"Index(['score_image', 'score_image_max'], dtype='object')"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tag = \"e5ce2d69b035975cb5336cec0da9a32a\"\n",
"epoch = 55\n",
"infile = f\"{tabledir}/spotmag_predictions/predictions_images_4189-epoch{epoch}-{tag}.csv\"\n",
"# Alternative input file (currently unused):\n",
"# infile = f\"{tabledir}/spotmag_predictions/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl-spotmag_img_prediction-{tag}.csv\"\n",
"print(infile)\n",
"score_cols = ['score_image', 'score_image_max']\n",
"# Rows sharing an index value are collapsed into their mean score.\n",
"dfpred_img = pd.read_csv(infile, index_col=0)[score_cols].groupby(level=0).mean()\n",
"dfpred_img.columns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read image-based predictions (wire localization)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"infile = f\"{tabledir}/spotmag_predictions/predictions_wire_combined_e8e71fc090141d7c6fb334359152d295.csv\"\n",
"dfpred_imgwire = pd.read_csv(infile, index_col=0)\n",
"\n",
"# score_wire_max = 1 - (row-wise minimum of the two scores_0_* columns).\n",
"score0_cols = [\"scores_0_or\", \"scores_0_fl\"]\n",
"dfpred_imgwire[\"score_wire_max\"] = 1 - dfpred_imgwire[score0_cols].min(axis=1)\n",
"\n",
"# Drop the inputs and the label column, then rename what is left from 'scores*' to 'score_wire*'.\n",
"dfpred_imgwire = dfpred_imgwire.drop(score0_cols + [\"label\"], axis=1)\n",
"dfpred_imgwire = dfpred_imgwire.rename(columns=lambda name: name.replace(\"scores\", \"score_wire\"))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(772367, 13)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Each guard keeps the cell safe to re-run: once a table has been attached (and its source\n",
"# frame deleted) the marker column is present, so the branch is skipped.\n",
"if 'score_image' not in dfpred.columns:\n",
"    dfpred = pd.concat([dfpred, dfpred_img], axis=1).rename_axis('id')\n",
"    del dfpred_img\n",
"\n",
"if 'score_glmnet' not in dfpred.columns:\n",
"    dfpred = pd.concat([dfpred, dfpred_glmnet], axis=1).rename_axis('id')\n",
"    del dfpred_glmnet\n",
"\n",
"if 'score_wire' not in dfpred.columns:\n",
"    dfpred = pd.concat([dfpred, dfpred_imgwire], axis=1).rename_axis('id')\n",
"    del dfpred_imgwire\n",
"\n",
"dfpred.shape"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# pd.concat(axis=1) aligns on the index with an outer join, so ids that occur only in the\n",
"# added table become new rows (NaN in the columns they lack).\n",
"for marker_col, extra in ((\"label\", dflab), (\"digital\", df_bt)):\n",
"    if marker_col not in dfpred.columns:\n",
"        dfpred = pd.concat([dfpred, extra], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | score_image | \n",
" False | \n",
" True | \n",
"
\n",
" \n",
" | score_wire | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | False | \n",
" 3584 | \n",
" 0 | \n",
"
\n",
" \n",
" | True | \n",
" 605 | \n",
" 768234 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"score_image False True \n",
"score_wire \n",
"False 3584 0\n",
"True 605 768234"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# How many rows lack a wire score and/or an image score.\n",
"wire_missing = dfpred[\"score_wire\"].isnull()\n",
"image_missing = dfpred[\"score_image\"].isnull()\n",
"pd.crosstab(wire_missing, image_missing)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# The score_xgbt column is called score_gbmt from here on.\n",
"dfpred = dfpred.rename(columns={\"score_xgbt\": \"score_gbmt\"})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Add ensembled (max, avg) scores"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Missing wire scores are treated as 0.\n",
"for wire_col in ('score_wire', 'score_wire_max'):\n",
"    dfpred[wire_col] = dfpred[wire_col].fillna(0)\n",
"\n",
"# Arithmetic mean of the image score and a header-based score.\n",
"for out_col, header_col in (('score_image+glmnet', 'score_glmnet'),\n",
"                            ('score_image+gbmt', 'score_gbmt')):\n",
"    dfpred[out_col] = (dfpred['score_image'] + dfpred[header_col]) / 2\n",
"\n",
"dfpred['score_max(image;gbmt)'] = dfpred[['score_image', 'score_gbmt']].max(axis=1)\n",
"\n",
"# Geometric mean of the image score and a header-based score.\n",
"for out_col, header_col in (('score_image*glmnet', 'score_glmnet'),\n",
"                            ('score_image*gbmt', 'score_gbmt')):\n",
"    dfpred[out_col] = np.sqrt(dfpred['score_image'] * dfpred[header_col])\n",
"\n",
"# Row-wise maximum of the image score and a wire score, ignoring NaN.\n",
"for out_col, wire_col in (('score_max_image_wire', 'score_wire'),\n",
"                          ('score_max_image_wire_max', 'score_wire_max')):\n",
"    dfpred[out_col] = np.nanmax(dfpred[['score_image', wire_col]].values, axis=1)\n",
"\n",
"# Average each of those maxima with the gbmt score.\n",
"for out_col, max_col in (('score_max_image_wire+gbmt', 'score_max_image_wire'),\n",
"                         ('score_max_image_wire_max+gbmt', 'score_max_image_wire_max')):\n",
"    dfpred[out_col] = (dfpred[max_col] + dfpred['score_gbmt']) / 2\n",
"\n",
"dfpred['score_max(image;wire_max;gbmt)'] = dfpred[['score_wire_max', 'score_gbmt', 'score_image']].max(axis=1)\n",
"\n",
"# Row-wise maximum of the image/gbmt average and a wire score, ignoring NaN.\n",
"for out_col, wire_col in (('score_max_wire_image+gbmt', 'score_wire'),\n",
"                          ('score_max_wire_max_image+gbmt', 'score_wire_max')):\n",
"    dfpred[out_col] = np.nanmax(dfpred[['score_image+gbmt', wire_col]].values, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Shorten the ViewModifierCodeMeaning column name and make sure the index is named 'id'.\n",
"dfpred = dfpred.rename(columns={\"ViewModifierCodeMeaning\": \"ViewModifier\"}).rename_axis('id')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save the combined table"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"772423"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of rows in the combined table.\n",
"dfpred.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Tab-separated output, named after the image-model tag.\n",
"outfile = f'{tabledir}/all_predictions_with_images-{tag}.tab'\n",
"dfpred.to_csv(outfile, sep='\\t')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}