{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Applications/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n", " return f(*args, **kwds)\n", "/Applications/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n", " return f(*args, **kwds)\n" ] } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "import os" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read labels" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Directory (relative path) that holds the tables read and written by this notebook.\n", "tabledir = \"../tables/\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(772423, 1)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# One boolean column per id: True where the BT_case field is non-null.\n", "fn = f\"{tabledir}/2017-06-mammo_tables/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl.csv.gz\"\n", "df_bt = (\n", "    pd.read_csv(fn, usecols=[\"id\", \"BT_case\"])\n", "    .set_index(\"id\")\n", "    .notnull()\n", ")\n", "df_bt.columns = [\"digital\"]\n", "df_bt.shape" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
setlabelview
id
1013372709_1.2.840.113654.2.70.1.175625299786291545159233542096043464711_3_1testnormalN
1028995243_1.2.840.113654.2.70.1.56947963181878834591544466761404805157_45576_2testnormalN
1105112884_1.2.840.113654.2.70.1.178729598744204462442695104630823323474_8905_2testnormalN
1185125156_1.2.840.113654.2.70.1.45840593750642722243371816041014016032_2_4testnormalN
1496452586_1.2.840.113654.2.70.1.5582568668770891599992528318631583880_1351_4testnormalN
\n", "
" ], "text/plain": [ " set label view\n", "id \n", "1013372709_1.2.840.113654.2.70.1.17562529978629... test normal N\n", "1028995243_1.2.840.113654.2.70.1.56947963181878... test normal N\n", "1105112884_1.2.840.113654.2.70.1.17872959874420... test normal N\n", "1185125156_1.2.840.113654.2.70.1.45840593750642... test normal N\n", "1496452586_1.2.840.113654.2.70.1.55825686687708... test normal N" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Table with the `set`, `label` and `view` columns, indexed by id.\n", "infile = f\"{tabledir}/spotmag_predictions/train_test_split-2018-02-16-within7e5-label.csv\"\n", "dflab = pd.read_csv(infile).set_index('id')\n", "dflab.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read header-based predictions" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(772367, 1)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# glmnet predictions (tab-separated); \"predictions\" in column names becomes \"score\".\n", "infile = f\"{tabledir}/spotmag_predictions/all_predictions_glmnet.tab\"\n", "dfpred_glmnet = pd.read_csv(infile, sep=\"\\t\", index_col=0)\n", "dfpred_glmnet = dfpred_glmnet.rename(columns=lambda name: name.replace(\"predictions\", \"score\"))\n", "dfpred_glmnet.shape" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(772367, 5)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
score_gbmscore_xgbscore_rpartscore_xgbtViewModifierCodeMeaning
id
2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149405_21045560.0090050.0202070.0068820.059474NaN
2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149405_21045570.0133370.0167620.0068820.059660NaN
2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149484_21415380.0133370.0167620.0068820.061051NaN
2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149484_21415370.0133370.0167620.0068820.061051NaN
3337971863_1.2.840.113654.2.70.1.337982194343327746313656933304494759333_1_10.0315600.0591420.0068820.157488NaN
\n", "
" ], "text/plain": [ " score_gbm score_xgb \\\n", "id \n", "2454166001_1.2.840.113654.2.70.1.26994792635520... 0.009005 0.020207 \n", "2454166001_1.2.840.113654.2.70.1.26994792635520... 0.013337 0.016762 \n", "2454166001_1.2.840.113654.2.70.1.26994792635520... 0.013337 0.016762 \n", "2454166001_1.2.840.113654.2.70.1.26994792635520... 0.013337 0.016762 \n", "3337971863_1.2.840.113654.2.70.1.33798219434332... 0.031560 0.059142 \n", "\n", " score_rpart score_xgbt \\\n", "id \n", "2454166001_1.2.840.113654.2.70.1.26994792635520... 0.006882 0.059474 \n", "2454166001_1.2.840.113654.2.70.1.26994792635520... 0.006882 0.059660 \n", "2454166001_1.2.840.113654.2.70.1.26994792635520... 0.006882 0.061051 \n", "2454166001_1.2.840.113654.2.70.1.26994792635520... 0.006882 0.061051 \n", "3337971863_1.2.840.113654.2.70.1.33798219434332... 0.006882 0.157488 \n", "\n", " ViewModifierCodeMeaning \n", "id \n", "2454166001_1.2.840.113654.2.70.1.26994792635520... NaN \n", "2454166001_1.2.840.113654.2.70.1.26994792635520... NaN \n", "2454166001_1.2.840.113654.2.70.1.26994792635520... NaN \n", "2454166001_1.2.840.113654.2.70.1.26994792635520... NaN \n", "3337971863_1.2.840.113654.2.70.1.33798219434332... 
NaN " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Predictions of the remaining header-based models; \"predictions\" in column names becomes \"score\".\n", "infile = f\"{tabledir}/spotmag_predictions/all_predictions_allmodels_trained_on_train.tab\"\n", "dfpred = pd.read_csv(infile, sep=\"\\t\", index_col=0)\n", "dfpred = dfpred.rename(columns=lambda name: name.replace(\"predictions\", \"score\")).rename_axis('id')\n", "print(dfpred.shape)\n", "dfpred.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(772367, 8)\n" ] } ], "source": [ "# Left-join the dflab columns onto the predictions; the guard skips the join on re-runs.\n", "if 'set' not in dfpred.columns:\n", "    dfpred = pd.merge(dfpred, dflab, how='left', left_index=True, right_index=True)\n", "    print(dfpred.shape)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "normal 3526\n", "magn/spot 572\n", "wire loc 57\n", "stereotactic 25\n", "other 9\n", "Name: view, dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Display names for the single-letter view codes.\n", "colmap = {\n", "    \"N\": \"normal\",\n", "    \"M\": \"magn/spot\",\n", "    \"T\": \"stereotactic\",\n", "    \"W\": \"wire loc\",\n", "    \"X\": \"other\",\n", "}\n", "# Count rows per view among the rows that have a view code; an unknown code raises KeyError.\n", "view_counts = (\n", "    dfpred.loc[dfpred[\"view\"].notnull(), \"view\"]\n", "    .map(lambda code: colmap[code])\n", "    .value_counts()\n", ")\n", "view_counts" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
settraintestval
view
magn/spot3809696
normal2310612604
other432
stereotactic1744
wire loc37119
\n", "
" ], "text/plain": [ "set train test val\n", "view \n", "magn/spot 380 96 96\n", "normal 2310 612 604\n", "other 4 3 2\n", "stereotactic 17 4 4\n", "wire loc 37 11 9" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# View name versus data split, for the rows that have a view code.\n", "pd.crosstab(\n", "    dfpred.loc[dfpred[\"view\"].notnull(), \"view\"].map(lambda code: colmap[code]),\n", "    dfpred[\"set\"],\n", ")[[\"train\", \"test\", \"val\"]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read image-based predictions (general)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "../tables//spotmag_predictions/predictions_images_4189-epoch55-e5ce2d69b035975cb5336cec0da9a32a.csv\n" ] }, { "data": { "text/plain": [ "Index(['score_image', 'score_image_max'], dtype='object')" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# `tag` and `epoch` select the image-prediction file; `tag` is reused in the output file name.\n", "tag = \"e5ce2d69b035975cb5336cec0da9a32a\"\n", "epoch = 55\n", "infile = f\"{tabledir}/spotmag_predictions/predictions_images_4189-epoch{epoch}-{tag}.csv\"\n", "# infile = f\"{tabledir}/spotmag_predictions/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl-spotmag_img_prediction-{tag}.csv\"\n", "print(infile)\n", "# Keep the two score columns and average rows that share the same index value.\n", "dfpred_img = (\n", "    pd.read_csv(infile, index_col=0)[['score_image', 'score_image_max']]\n", "    .groupby(level=0)\n", "    .mean()\n", ")\n", "dfpred_img.columns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read image-based predictions (wire localization)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "infile = f\"{tabledir}/spotmag_predictions/predictions_wire_combined_e8e71fc090141d7c6fb334359152d295.csv\"\n", "\n", "dfpred_imgwire = pd.read_csv(infile, index_col=0)\n", "# score_wire_max = 1 - min(scores_0_or, scores_0_fl), computed row-wise.\n", "dfpred_imgwire[\"score_wire_max\"] = 1 - dfpred_imgwire[[\"scores_0_or\", \"scores_0_fl\"]].min(axis=1)\n", "dfpred_imgwire = dfpred_imgwire.drop([\"scores_0_or\", \"scores_0_fl\", \"label\"], axis=1)\n", "# Remaining \"scores*\" columns are renamed to \"score_wire*\".\n", "dfpred_imgwire = dfpred_imgwire.rename(columns=lambda name: name.replace(\"scores\", \"score_wire\"))" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(772367, 13)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Append each prediction table once; the column checks let the cell be re-run\n", "# after the source frames have been deleted.\n", "if 'score_image' not in dfpred.columns:\n", "    dfpred = pd.concat([dfpred, dfpred_img], axis=\"columns\").rename_axis('id')\n", "    del dfpred_img\n", "\n", "if 'score_glmnet' not in dfpred.columns:\n", "    dfpred = pd.concat([dfpred, dfpred_glmnet], axis=\"columns\").rename_axis('id')\n", "    del dfpred_glmnet\n", "\n", "if 'score_wire' not in dfpred.columns:\n", "    dfpred = pd.concat([dfpred, dfpred_imgwire], axis=\"columns\").rename_axis('id')\n", "    del dfpred_imgwire\n", "\n", "dfpred.shape" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": true }, "outputs": [], "source": [ "if 'label' not in dfpred.columns:\n", "    dfpred = pd.concat([dfpred, dflab], axis=\"columns\")\n", "# NOTE(review): concat aligns on the union of both indexes, so ids present only in\n", "# df_bt become extra rows without scores - confirm this is intended.\n", "if 'digital' not in dfpred.columns:\n", "    dfpred = pd.concat([dfpred, df_bt], axis=\"columns\")" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
score_imageFalseTrue
score_wire
False35840
True605768234
\n", "
" ], "text/plain": [ "score_image False True \n", "score_wire \n", "False 3584 0\n", "True 605 768234" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Rows: score_wire missing? Columns: score_image missing?\n", "pd.crosstab(\n", "    index=dfpred[\"score_wire\"].isnull(),\n", "    columns=dfpred[\"score_image\"].isnull(),\n", ")" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [], "source": [ "dfpred = dfpred.rename(columns={\"score_xgbt\": \"score_gbmt\"})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Add ensembled (max, avg) scores" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Missing wire scores are treated as 0.\n", "for wire_col in ('score_wire', 'score_wire_max'):\n", "    dfpred[wire_col] = dfpred[wire_col].fillna(0)\n", "\n", "# Arithmetic mean of the image score and a header-based score.\n", "dfpred['score_image+glmnet'] = dfpred['score_image'].add(dfpred['score_glmnet']).div(2)\n", "dfpred['score_image+gbmt'] = dfpred['score_image'].add(dfpred['score_gbmt']).div(2)\n", "\n", "# Row-wise maximum (NaN entries are skipped).\n", "dfpred['score_max(image;gbmt)'] = dfpred[['score_image', 'score_gbmt']].max(axis=1)\n", "\n", "# Geometric mean of the image score and a header-based score.\n", "dfpred['score_image*glmnet'] = np.sqrt(dfpred['score_image'].mul(dfpred['score_glmnet']))\n", "dfpred['score_image*gbmt'] = np.sqrt(dfpred['score_image'].mul(dfpred['score_gbmt']))\n", "\n", "# NaN-ignoring maximum of the image score and each wire score.\n", "dfpred['score_max_image_wire'] = np.nanmax(dfpred[['score_image', 'score_wire']].values, axis=1)\n", "dfpred['score_max_image_wire_max'] = np.nanmax(dfpred[['score_image', 'score_wire_max']].values, axis=1)\n", "\n", "# Average of each image/wire maximum with the gbmt score.\n", "dfpred['score_max_image_wire+gbmt'] = dfpred['score_max_image_wire'].add(dfpred['score_gbmt']).div(2)\n", "dfpred['score_max_image_wire_max+gbmt'] = dfpred['score_max_image_wire_max'].add(dfpred['score_gbmt']).div(2)\n", "\n", "dfpred['score_max(image;wire_max;gbmt)'] = dfpred[['score_wire_max', 'score_gbmt', 'score_image']].max(axis=1)\n", "\n", "# NaN-ignoring maximum of the averaged image+gbmt score and each wire score.\n", "dfpred['score_max_wire_image+gbmt'] = np.nanmax(dfpred[['score_image+gbmt', 'score_wire']].values, axis=1)\n", "dfpred['score_max_wire_max_image+gbmt'] = np.nanmax(dfpred[['score_image+gbmt', 'score_wire_max']].values, axis=1)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true }, "outputs": [], "source": [ "dfpred = dfpred.rename(columns={\"ViewModifierCodeMeaning\": \"ViewModifier\"}).rename_axis('id')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Save the combined table" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "772423" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfpred.shape[0]" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Tab-separated dump; the file name carries the `tag` of the image predictions.\n", "outfile = f'{tabledir}/all_predictions_with_images-{tag}.tab'\n", "dfpred.to_csv(outfile, sep='\\t')" ] } ], "metadata": { "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }