This commit is contained in:
Dmytro S Lituiev
2018-10-12 17:38:36 -07:00
commit 0fede818d7
26 changed files with 12311 additions and 0 deletions
+3
View File
@@ -0,0 +1,3 @@
**/*.hdf5
**/*.csv
+42
View File
@@ -0,0 +1,42 @@
# Code for automatic labeling of special diagnostic mammography views from images and DICOM headers
## DICOM
### Extract selected fields from DICOM headers
dicom_header_extraction/extract_dicom_headers_w_generator_150K.py
### Normalize / expand data
dicom_header_extraction/normalize_selected_dcm_headers.py
### Machine learning on DICOM headers
caret_on_headers.R # most methods
caret_on_headers_nona.R # GLMNET
## Image pipeline
### General image model
- scripts and config files: `image_classifiers/e5ce2d69b035975cb5336cec0da9a32a`
- weight files:
### Wire localization model
- scripts and config files: `image_classifiers/e8e71fc090141d7c6fb334359152d295`
- weight files:
## Visualization of performance metrics
Scripts used to generate Fig. 1
combine_predictions_hdr_and_img.ipynb
visualize_predictions_hdr_and_img.ipynb
## Significance tests
Scripts used to generate Supplementary Figures S1 & S2
calc_auroc_confidence_intervals.R
plot_auroc_difference_pvalue.ipynb
+169
View File
@@ -0,0 +1,169 @@
rm(list=ls())
library(pROC)
library(ggplot2)
library(ggsignif)
library(dplyr)
library(data.table)
# Read a gzip-compressed delimited text file into a plain data.frame.
#
# The file is decompressed on the fly by piping it through `zcat`; the
# resulting stream is parsed by fread().
#
# Args:
#   filename: path to the .gz file.
#   ...: further arguments forwarded to fread() (e.g. `sep`, `select`).
# Returns:
#   The parsed table as a data.frame (fread is called with header = TRUE and
#   fill = TRUE).
read.gz <- function(filename, ...) {
  # Expand "~" ourselves and quote the path, so that names containing spaces
  # or shell metacharacters reach zcat as one literal argument.
  decompress_cmd <- paste("zcat <", shQuote(path.expand(filename)))
  as.data.frame(fread(decompress_cmd, header = TRUE, fill = TRUE, ...))
}
# Load the combined header- and image-based predictions ------------------------
# Identifier that is part of the predictions file name.
tag <- "e5ce2d69b035975cb5336cec0da9a32a"
fnall <- paste0("../tables/all_predictions_with_images-", tag, ".tab")
# `header` and `fill` are fread() options; they used to be handed to
# as.data.frame(), which silently ignored them.
predictions <- as.data.frame(fread(fnall, sep = '\t', header = TRUE, fill = TRUE))

# Keep only rows that carry a label. Depending on the data.table version and
# settings an empty field may arrive as "" or as NA, so both count as
# "unlabelled"; an NA inside a logical row index would produce all-NA rows.
labelled <- !is.na(predictions$label) & nchar(predictions$label) > 0
predictions <- predictions[labelled, ]
print(nrow(predictions))

# 0/1 indicator: is the ViewModifier field filled in? (a missing value counts as "not filled in")
predictions[, 'ViewModifier'] <- as.numeric(!is.na(predictions[, 'ViewModifier']) &
                                              predictions[, 'ViewModifier'] != '')
predictions[, "label"] <- factor(predictions[, "label"], c('normal', 'special'))
predictions[, "view"] <- factor(predictions[, "view"], c('N','M','T','W','X'))
head(predictions)

# Rows of the 'val' split. This definition had been commented out although
# the plot below uses `holdout`, which stopped the script with
# "object 'holdout' not found". %in% never yields NA, unlike ==.
holdout <- predictions[predictions$set %in% 'val', ]
ggplot(holdout, aes(view, `score_max_wire_image+gbmt`)) + geom_point()

# Rows of the 'test' split; all statistics below are computed on these.
validation <- predictions[predictions$set %in% 'test', ]
clmns <- colnames(predictions)
# Non-score columns that are kept next to the model scores.
othercols <- c('id', 'set', 'view', 'label')
# Score columns to evaluate (names without the "score_" prefix).
modelnames <- c('ViewModifier', 'rpart', 'gbm', 'glmnet','xgb', 'gbmt',
                'image',
                'image_max',
                'wire',
                'wire_max',
                'max_image_wire_max',
                'image+gbmt',
                'max_wire_max_image+gbmt',
                'max_image_wire',
                'max_wire_image+gbmt')
# Remove every literal "score_" substring from the given column name(s).
#
# Args:
#   x: character vector of column names.
# Returns:
#   Character vector of the same length with "score_" stripped.
# (An earlier variant that dropped the first "_"-separated token instead is
#  not used.)
clean_score_names <- function(x) {
  gsub('score_', '', x, fixed = TRUE)
}
# Column names of `predictions` with the "score_" prefix removed
# (not referenced again in the visible part of the script).
clmns_clean <- vapply(clmns, clean_score_names, '')
# The cleaned names as a factor whose levels are the metadata columns followed
# by the model names; a column that is in neither list becomes NA.
cols_ <- factor(vapply(colnames(predictions) , clean_score_names, ''),
c(othercols,modelnames))
# Rename the validation columns to the cleaned names, then drop the columns
# whose name is NA, i.e. those not listed in othercols/modelnames.
colnames(validation) <- cols_
validation <- validation[,!is.na(colnames(validation))]
# Sort the retained names by factor level (othercols first, then modelnames)
# and reorder the validation columns to match.
cols_ <- cols_[!is.na(cols_)]
cols_ <- cols_[order(cols_)]
validation <- validation[,as.character(cols_)]
colnames(validation)
# clmns <-clmns[vapply(clmns, function(x) strsplit(x, '_')[[1]][1]=='score', TRUE)]
## Perform McNemar's test for prediction difference ---------------------------------------------------
# Both scores are thresholded at 0.5 and cross-tabulated over the same
# validation rows before the test is applied.
# NOTE(review): if one of the thresholded vectors is constant the table is not
# 2x2 and mcnemar.test() will presumably fail - confirm on the real data.
mcnemar.test(table(validation$`max_wire_max_image+gbmt`>0.5, validation$max_image_wire_max>0.5))
mcnemar.test(table(validation$`max_wire_max_image+gbmt`>0.5, validation$gbmt>0.5))
## Calculate significance of pairwise auROC differences -----------------------------------------------
# Results are collected by name: `cis` holds the AUC confidence interval and
# `rocobjects` the ROC object, which is reused for the pairwise tests below.
cis <- list()
rocobjects <- list()
ii <- 0

# One ROC curve per model score, with the normal/special label as response.
for (model_name in modelnames) {
  print('====================')
  print(model_name)
  roc_curve <- plot.roc(
    validation[, "label"],
    validation[, model_name],
    levels = (levels(validation[, "label"])),
    xlim = c(100, 0),
    ylim = c(0, 100),
    percent = TRUE,
    print.auc = TRUE
  )
  cis[[model_name]] <- ci(roc_curve, of = "auc", thresholds = "best")
  rocobjects[[model_name]] <- roc_curve
}

## Wire model on wire cases
# Same scores, but the response is now "view is W" versus every other view.
# NOTE(review): the confidence intervals get their own "(vs other views)" key,
# whereas the ROC objects are stored under the plain model name and therefore
# replace the label-based 'wire' / 'wire_max' curves from the loop above.
# Confirm that the pairwise comparison is meant to use these curves.
for (wire_model in c('wire', 'wire_max')) {
  print('====================')
  print(wire_model)
  roc_curve <- plot.roc(
    validation[, "view"] == 'W',
    validation[, wire_model],
    xlim = c(100, 0),
    ylim = c(0, 100),
    percent = TRUE,
    print.auc = TRUE
  )
  cis[[paste0(wire_model, ' (vs other views)')]] <- ci(roc_curve, of = "auc", thresholds = "best")
  rocobjects[[wire_model]] <- roc_curve
}
###
# Models to report, in display order. Compared with the list used for the ROC
# loops this adds the two "(vs other views)" entries and omits
# 'max_image_wire' and 'max_wire_image+gbmt'.
modelnames <- c('ViewModifier', 'rpart', 'gbm', 'glmnet','xgb', 'gbmt',
'image', "image_max",
'wire', 'wire_max',
'wire (vs other views)', 'wire_max (vs other views)',
'max_image_wire_max',
'image+gbmt',
'max_wire_max_image+gbmt')
##
# Stack the confidence intervals into a data.frame with one row per entry of
# `cis`; the three values of each interval are labelled lower / auROC / upper.
dfcis <- as.data.frame(t(do.call(cbind.data.frame, lapply(cis, as.vector))))
colnames(dfcis) <- c('lower', 'auROC', 'upper')
# Model name as a factor with the display order as levels; rows whose name is
# not in `modelnames` become NA and are removed on the next line.
dfcis[,"model"] <- factor(rownames(dfcis),
modelnames)
dfcis <- dfcis[!is.na(dfcis[,"model"]),]
# Index the rows by model name and put them into display order.
rownames(dfcis) <- dfcis[,"model"]
dfcis <- dfcis[modelnames,]
# dfcis <-dfcis %>% mutate(model = factor(model, levels=rev(levels(model))))
# Variant of the table without the 'wire' and 'wire_max' rows (the
# "(vs other views)" rows are kept); unused factor levels are dropped.
dfcis_nowire <- dfcis[!(rownames(dfcis) %in% c('wire','wire_max')),]
dfcis_nowire$model <- factor(dfcis_nowire$model)
#
#
# annotation_df <- data.frame(color=c("E", "H"),
# start=c("Good", "Fair"),
# end=c("Very Good", "Good"),
# y=c(3.6, 4.7),
# label=c("Comp. 1", "Comp. 2"))
# Single comparison of the ViewModifier and gbmt ROC curves.
roc.test(rocobjects[["ViewModifier"]], rocobjects[["gbmt"]])
## Format Pairwise comparisons
# Builds the lower triangle (diagonal included) of a table of p-values, one
# for each pair of ROC curves in `rocobjects`, obtained with
# roc.test(method = 'delong'). A cell stays NA when either curve has an AUC of
# exactly 100 (AUCs are in percent); the upper triangle is never filled.
keys <- names(rocobjects)
# Preallocate the full table instead of growing a data.frame cell by cell.
pvalues <- matrix(NA_real_,
                  nrow = length(keys), ncol = length(keys),
                  dimnames = list(keys, keys))
# seq_along()/seq_len() make the loops a no-op when `rocobjects` is empty;
# 1:length(x) would have iterated over c(1, 0).
for (a in seq_along(keys)) {
  for (b in seq_len(a)) {
    name_a <- keys[a]
    name_b <- keys[b]
    perfect_auc <- (as.numeric(rocobjects[[name_a]]$auc) == 100) ||
      (as.numeric(rocobjects[[name_b]]$auc) == 100)
    if (!perfect_auc) {
      pvalues[name_a, name_b] <- roc.test(rocobjects[[name_a]],
                                          rocobjects[[name_b]],
                                          method = 'delong')$p.value
    }
  }
}
# Same layout as before: rows and columns in the order of `keys`.
dfcompar <- as.data.frame(pvalues)
fn.comparison <- paste0("../tables/auroc_delong_comparison-", tag,".csv")
write.csv(dfcompar, file=fn.comparison)
+284
View File
@@ -0,0 +1,284 @@
# coding: utf-8
rm(list=ls())
library(caret)
library(gbm3)
library(data.table)
library(ggplot2)
library(fastmatch)
# Read a gzip-compressed delimited text file into a plain data.frame.
#
# The file is decompressed on the fly by piping it through `zcat`; the
# resulting stream is parsed by fread().
#
# Args:
#   filename: path to the .gz file.
#   ...: further arguments forwarded to fread() (e.g. `sep`, `select`).
# Returns:
#   The parsed table as a data.frame (fread is called with header = TRUE and
#   fill = TRUE).
read.gz <- function(filename, ...) {
  # Expand "~" ourselves and quote the path, so that names containing spaces
  # or shell metacharacters reach zcat as one literal argument.
  decompress_cmd <- paste("zcat <", shQuote(path.expand(filename)))
  as.data.frame(fread(decompress_cmd, header = TRUE, fill = TRUE, ...))
}
# Input tables ------------------------------------------------
TABLEDIR <- "../tables/"

# Ids of the images to keep (only the "id" column of the table is read).
fn_ids <- paste(
  TABLEDIR,
  "2017-06-mammo_tables/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl.csv.gz",
  sep = '/'
)
ids <- read.gz(fn_ids, select = "id")$id

# Selected and expanded DICOM header fields, one row per file.
fn_features <- paste(
  TABLEDIR,
  "mammo_dicom_headers/df_all_mammos_dicom_headers_selected_expanded.tab.gz",
  sep = '/'
)
dffeatures <- read.gz(fn_features, sep = '\t')

print(nrow(dffeatures))
print(length(ids))

# Restrict the header table to the requested ids, in id order; ids without a
# header row give an all-NA row, which the second step removes again.
matched_rows <- fmatch(unique(ids), dffeatures$filename)
dffeatures <- dffeatures[matched_rows, ]
dffeatures <- dffeatures[!is.na(dffeatures$filename), ]
rm(ids, matched_rows)
# Data formatting -----------------------------------------
# Header fields that are converted to numeric.
collist <- c("BodyPartThickness", "XRayTubeCurrentInuA", "ContentTime",
             "DetectorTemperature", "WindowCenter", "FieldOfViewRotation")
dffeatures[collist] <- lapply(dffeatures[collist], as.numeric)

# Class of every column, and the names of the columns that are still character.
dtypes <- sapply(dffeatures, class)
names(dtypes[dtypes == 'character'])

# Index the rows by file name so that they can be selected by id later on.
rownames(dffeatures) <- dffeatures$filename

# Columns that are removed before modelling.
excludeCols <- c(
  "filename",
  "CollimatorLeftVerticalEdge",
  "CollimatorLowerHorizontalEdge",
  "DistanceSourceToEntrance",
  "ExposuresOnDetectorSinceLastCalibration",
  "ExposuresOnDetectorSinceManufactured",
  "ShutterLowerHorizontalEdge",
  "ShutterRightVerticalEdge",
  "XRayTubeCurrentInuA"
  # "ManufacturerModelName"
)
is_kept <- !(colnames(dffeatures) %in% excludeCols)
dffeatures <- dffeatures[, is_kept]
rm(is_kept)

# Categorical header fields are turned into factors.
catcols <- c('ViewModifierCodeMeaning',
             'ViewCodeValue',
             'DetectorActiveDimensionsMissing',
             'FieldOfViewOriginMissing',
             'Grid',
             'Manufacturer',
             'ManufacturerModelName')
dffeatures[catcols] <- lapply(dffeatures[catcols], as.factor)
#cell#
# Number of missing values per column.
colSums(sapply(dffeatures, is.na))
# Read labels --------------------------------
# Table of the labelled images with their id, label and train/val/test split.
fn.labelledset <- paste(
  TABLEDIR,
  "spotmag_predictions/train_test_split-2018-02-15-within7e5.csv",
  sep = '/'
)
# filelist.labelled = read.table(fn.labelledset, )
df.labelled <- as.data.frame(fread(fn.labelledset))
rownames(df.labelled) <- df.labelled$id
vec.labelled <- df.labelled$id
df.labelled$label <- as.factor(df.labelled$label)
#cell#
# Ids belonging to each split.
vec.labelled.valset <- rownames(df.labelled[df.labelled$set == 'val', ])
vec.labelled.tr_set <- rownames(df.labelled[df.labelled$set == 'train', ])
vec.labelled.ts_set <- rownames(df.labelled[df.labelled$set == 'test', ])
############################################################
# Header features of the labelled images, in the row order of df.labelled so
# that the label column can be attached positionally.
dffeatures.labelled <- dffeatures[vec.labelled, ]
dffeatures.labelled$label <- df.labelled$label
#cell#
# devset = every labelled row that is not in the 'val' split.
in_valset <- rownames(dffeatures.labelled) %in% vec.labelled.valset
dffeatures.labelled.devset <- dffeatures.labelled[!in_valset, ]
rm(in_valset)
dffeatures.labelled.tr_set <- dffeatures.labelled[vec.labelled.tr_set, ]
dffeatures.labelled.ts_set <- dffeatures.labelled[vec.labelled.ts_set, ]
colnames(dffeatures.labelled.tr_set)

# Report, for every factor column, the values that occur in the test split
# but not in the training split.
for (cc in colnames(dffeatures.labelled.tr_set)) {
  if (!is.factor(dffeatures.labelled.tr_set[, cc])) {
    next
  }
  setdiff_ <- setdiff(dffeatures.labelled.ts_set[, cc],
                      dffeatures.labelled.tr_set[, cc])
  if (length(setdiff_) > 0) {
    print(cc)
    print(setdiff_)
  }
}
# GBM3 ----------------------------------------
# Boosted model fitted with gbmt() on the training split, using 10
# cross-validation folds and 4 threads.
# NOTE(review): no seed is set before fitting, so the cross-validation result
# is presumably not reproducible from run to run.
par_detail <- gbmParallel(num_threads = 4) # Pass to par_details in gbmt
gbmt_fit <- gbmt(label ~ .,
                 data = dffeatures.labelled.tr_set,
                 cv_folds = 10,
                 # training_params = training_params(num_trees = 100,
                 # interaction_depth = 1,
                 # min_num_obs_in_node = 10,
                 # shrinkage = 0.005,
                 # bag_fraction = 0.5,
                 # num_features = 2),
                 keep_gbm_data = TRUE,
                 par_detail=par_detail)

# Best number of trees according to cross-validation and to the out-of-bag
# estimate; the cross-validation value is the one used below.
best_iter_cv <- gbmt_performance(gbmt_fit, method='cv')
plot(best_iter_cv)
best.iter.oob <- gbmt_performance(gbmt_fit,method="OOB") # returns out-of-bag estimated best number of trees
plot(best.iter.oob)
saveRDS(gbmt_fit, sprintf("gbm3_ntrees_%d_%s.rds", best_iter_cv, Sys.Date()))

## Feature Importance Plotting ----------------
# Rescaled relative influence per variable; variables without influence are
# dropped because the plot uses a log axis.
infl_gbmt <- (as.data.frame(relative_influence(gbmt_fit, best_iter_cv, rescale=TRUE)))
colnames(infl_gbmt) <- "relative influence"
infl_gbmt[,"variable"] <- rownames(infl_gbmt)
infl_gbmt <- infl_gbmt[infl_gbmt$`relative influence` >0,]

# One horizontal bar per variable, running from 2e-6 to its relative
# influence, ordered by influence.
plimp <- ggplot(data=infl_gbmt) +
  geom_segment(size=5, colour='blue') +
  aes(x=reorder(variable,`relative influence`),
      xend = variable,
      y = 2e-6,
      yend=`relative influence`,
      label=`relative influence`) +
  scale_y_log10() +
  # coord_cartesian(ylim= c(0.8e-6, 1.05)) +
  ylab("relative influence") + xlab("") +
  coord_flip() +
  theme(axis.text.y = element_text(colour="black",size=16,angle=0,face="plain"),
        axis.text.x = element_text(colour="black",size=16,angle=0,face="plain"),
        axis.title.x = element_text(colour="black",size=16,angle=0,face="plain"),
        # panel.background = element_rect(fill = "transparent"), # bg of the panel
        #plot.background = element_rect(fill = "transparent"), # bg of the plot
        # panel.grid.major = element_blank(), # get rid of major grid
        # , panel.grid.minor = element_blank() # get rid of minor grid
        , legend.background = element_rect(fill = "transparent") # get rid of legend bg
        , legend.box.background = element_rect(fill = "transparent") # get rid of legend panel bg
  )
plimp + coord_trans(limy= c(0.5e-6, 1.05)) + coord_flip()

# Save the figure. The plot is passed explicitly: the former
# `plimp + ggsave(...)` form saved whatever last_plot() happened to be and is
# rejected by current ggplot2 releases, where ggsave() no longer returns NULL.
dir.create("img", showWarnings = FALSE)
ggsave("img/xgbt_importances.eps", plot = plimp, device = 'eps', bg = "transparent",
       width = 8, height = 6, dpi = 300, units = "in" )
ggsave("img/xgbt_importances.png", plot = plimp, device = 'png', bg = "transparent",
       width = 8, height = 6, dpi = 300, units = "in" )

# Score every row of the header table, labelled or not, with the fitted model.
dffeatures[,"predictions_gbmt"] <- predict(gbmt_fit, newdata = dffeatures,
                                           n.trees = best_iter_cv,
                                           type = "response", na.action = na.pass)
# GBM-CARET ---------------------------------------------------
# Resampling set-up shared with the rpart model below: 10-fold
# cross-validation with class probabilities, summarised by twoClassSummary.
control <- trainControl(
  method = "cv",
  number = 10,
  p = .8,
  savePredictions = TRUE,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

# Hyper-parameter grid explored by train().
tuneGrid <- expand.grid(
  n.trees = c(80, 100, 120, 140, 160),
  shrinkage = c(0.025, 0.05, 0.1, 0.2),
  interaction.depth = c(1, 2),
  n.minobsinnode = c(10, 15)
)

gbmFit1 <- train(
  label ~ .,
  data = dffeatures.labelled.tr_set,
  method = "gbm",
  na.action = na.pass,
  tuneGrid = tuneGrid,
  metric = "ROC",
  trControl = control,
  # importance = TRUE,
  ## This last option is actually one
  ## for gbm() that passes through
  verbose = FALSE
)
gbmFit1

## Feature Importance Plotting ---------------------------------------------
# Normalised relative influence of the selected model; variables without
# influence are dropped because the plot uses a log axis.
gbmsmmry <- summary(gbmFit1, normalize = TRUE, plotit = FALSE)
gbmsmmry <- gbmsmmry[gbmsmmry$rel.inf > 0, ]
ggplot(data = gbmsmmry) +
  geom_segment(size = 3, colour = 'red') +
  aes(x = reorder(var, rel.inf, sum),
      xend = var,
      y = 0.002,
      yend = (rel.inf),
      label = rel.inf) +
  scale_y_log10() +
  ylab("relative influence") + xlab("") +
  coord_flip()

# NOTE(review): the hyper-parameters in this file name are hard-coded and are
# not derived from gbmFit1$bestTune - confirm they match the selected model.
saveRDS(gbmFit1, "gbm_ntrees80_interactiondepth2_shrinkage0.2_nminobsinnode15_trainset_2018-02-18.rds")

# Probability of the 'special' class for every row of the header table.
gbm_probs <- predict(gbmFit1, newdata = dffeatures, type = "prob", na.action = na.pass)
dffeatures[, "predictions_gbm"] <- gbm_probs$special
rm(gbm_probs)
# RPART -----------------------------------------------------------------
# Single decision tree tuned over the complexity parameter `cp`, using the
# `control` object that is in effect at this point of the script.
tuneGrid <- expand.grid(cp = c(0.0, 0.0125, 0.025, 0.05, 0.1, 0.2))
rpartFit1 <- train(
  label ~ .,
  data = dffeatures.labelled.tr_set,
  method = "rpart",
  na.action = na.pass,
  tuneGrid = tuneGrid,
  metric = "ROC",
  trControl = control
)
varImp(rpartFit1)

# Class probabilities on the test split.
predictions.ts_set <- predict(
  rpartFit1,
  newdata = dffeatures.labelled.ts_set,
  type = 'prob',
  na.action = na.pass
)

# Probability of the 'special' class for every row of the header table.
rpart_probs <- predict(rpartFit1, newdata = dffeatures, type = "prob", na.action = na.pass)
dffeatures[, "predictions_rpart"] <- rpart_probs$special
rm(rpart_probs)
# XGB ---------------------------------------------------------------------
# Plain 10-fold cross-validation; this replaces the `control` object used for
# the models above, and the model is selected on accuracy.
control <- trainControl(method = "cv", number = 10)
#classProbs = TRUE
#tuneGrid <- expand.grid(cp=c(0.0, 0.0125, 0.025, 0.05, 0.1, 0.2))
xgbFit <- train(
  label ~ .,
  data = dffeatures.labelled.tr_set,
  method = "xgbTree",
  na.action = na.pass,
  #tuneGrid=tuneGrid,
  metric = "Accuracy",
  trControl = control
)
varImp(xgbFit, scale = TRUE)
as.data.frame(xgbFit$finalModel$params)
xgbFit$bestTune

# NOTE(review): the hyper-parameters in this file name are hard-coded and are
# not derived from xgbFit$bestTune - confirm they match the selected model.
saveRDS(xgbFit, sprintf("xgbtree_maxdepth1_subsample1_eta0.3_%s.rds", Sys.Date()))

# Class probabilities on the test split.
predictions.ts_set <- predict(
  xgbFit,
  newdata = dffeatures.labelled.ts_set,
  type = 'prob',
  na.action = na.pass
)

## Save all predictions ---------------------------------------------------------
# Probability of the 'special' class for every row of the header table.
xgb_probs <- predict(xgbFit, newdata = dffeatures, type = "prob", na.action = na.pass)
dffeatures[, "predictions_xgb"] <- xgb_probs$special
rm(xgb_probs)

# Write every prediction column plus the two view-code columns.
prediction_cols <- grep('prediction', colnames(dffeatures), value = TRUE)
write.table(
  dffeatures[, c(prediction_cols, "ViewModifierCodeMeaning", "ViewCodeValue")],
  file = "all_predictions_allmodels_trained_on_train.tab",
  quote = FALSE,
  sep = '\t'
)
rm(prediction_cols)
+170
View File
@@ -0,0 +1,170 @@
# coding: utf-8
############################################################################
# stratify by BT column: those are 100% sure digital, others can be either
############################################################################
rm(list=ls())
setwd(dir = "~/repos/mammo/learn_spotmag_from_dicom_headers")
#cell#
library(caret)
library(data.table)
library(pROC)
# install.packages(c("pROC"))
library(ggplot2)
library(fastmatch)
# Read a gzip-compressed delimited text file into a plain data.frame.
#
# The file is decompressed on the fly by piping it through `zcat`; the
# resulting stream is parsed by fread().
#
# Args:
#   filename: path to the .gz file.
#   ...: further arguments forwarded to fread() (e.g. `sep`, `select`).
# Returns:
#   The parsed table as a data.frame (fread is called with header = TRUE and
#   fill = TRUE).
read.gz <- function(filename, ...) {
  # Expand "~" ourselves and quote the path, so that names containing spaces
  # or shell metacharacters reach zcat as one literal argument.
  decompress_cmd <- paste("zcat <", shQuote(path.expand(filename)))
  as.data.frame(fread(decompress_cmd, header = TRUE, fill = TRUE, ...))
}
# Input tables ------------------------------------------------
# Ids of the images to keep (only the "id" column of the table is read).
fn_ids <- "../tables/2017-06-mammo_tables/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl.csv.gz"
ids <- read.gz(fn_ids, select = "id")$id

# Selected DICOM header fields ("nona" variant of the table), one row per file.
fn_features <- "../tables/mammo_dicom_headers/df_all_mammos_dicom_headers_selected_nona.tab.gz"
dffeatures <- read.gz(fn_features, sep = '\t')
# rownames(dffeatures) <- dffeatures$filename

print(nrow(dffeatures))
print(length(ids))

# Restrict the header table to the requested ids, in id order; ids without a
# header row give an all-NA row, which the second step removes again.
matched_rows <- fmatch(unique(ids), dffeatures$filename)
dffeatures <- dffeatures[matched_rows, ]
dffeatures <- dffeatures[!is.na(dffeatures$filename), ]
rm(ids, matched_rows)
# Data formatting -----------------------------------------
# Header fields that are converted to numeric.
collist <- c("BodyPartThickness", "XRayTubeCurrentInuA", "ContentTime",
             "DetectorTemperature", "WindowCenter", "FieldOfViewRotation")
dffeatures[collist] <- lapply(dffeatures[collist], as.numeric)
# (head(as.numeric(dffeatures$BodyPartThickness)))

# Class of every column.
dtypes <- sapply(dffeatures, class)

# Index the rows by file name so that they can be selected by id later on.
rownames(dffeatures) <- dffeatures$filename

# Columns that are removed before modelling.
excludeCols <- c(
  "filename",
  "CollimatorLeftVerticalEdge",
  "CollimatorLowerHorizontalEdge",
  "DistanceSourceToEntrance",
  "ExposuresOnDetectorSinceLastCalibration",
  "ExposuresOnDetectorSinceManufactured",
  "ShutterLowerHorizontalEdge",
  "ShutterRightVerticalEdge",
  "XRayTubeCurrentInuA"
  # "ManufacturerModelName"
)
is_kept <- !(colnames(dffeatures) %in% excludeCols)
dffeatures <- dffeatures[, is_kept]
rm(is_kept)

# Categorical header fields: every value is prefixed with "=" and the column
# is turned into a factor.
catcols <- c('ViewModifierCodeMeaning',
             'ViewCodeValue',
             'DetectorActiveDimensionsMissing',
             'FieldOfViewOriginMissing',
             'Grid',
             'Manufacturer',
             'ManufacturerModelName')
dffeatures[catcols] <- lapply(
  dffeatures[catcols],
  function(values) as.factor(paste0("=", values))
)
dffeatures[, "HighBit"] <- as.numeric(dffeatures[, "HighBit"])
# Number of missing values per column.
colSums(sapply(dffeatures, is.na))
# Read labels ---------------------------------
# Table of the labelled images with their id, label and train/val/test split.
fn.labelledset <- "../tables/spotmag_predictions/train_test_split-2018-02-15-within7e5.csv"
# filelist.labelled = read.table(fn.labelledset, )
df.labelled <- as.data.frame(fread(fn.labelledset))
rownames(df.labelled) <- df.labelled$id
vec.labelled <- df.labelled$id
df.labelled$label <- as.factor(df.labelled$label)
#cell#
# Ids belonging to each split.
vec.labelled.valset <- rownames(df.labelled[df.labelled$set == 'val', ])
vec.labelled.tr_set <- rownames(df.labelled[df.labelled$set == 'train', ])
vec.labelled.ts_set <- rownames(df.labelled[df.labelled$set == 'test', ])
############################################################
# Header features of the labelled images, in the row order of df.labelled so
# that the label column can be attached positionally.
dffeatures.labelled <- dffeatures[vec.labelled, ]
dffeatures.labelled$label <- df.labelled$label
# devset = every labelled row that is not in the 'val' split.
in_valset <- rownames(dffeatures.labelled) %in% vec.labelled.valset
dffeatures.labelled.devset <- dffeatures.labelled[!in_valset, ]
rm(in_valset)
dffeatures.labelled.tr_set <- dffeatures.labelled[vec.labelled.tr_set, ]
dffeatures.labelled.ts_set <- dffeatures.labelled[vec.labelled.ts_set, ]
table(dffeatures.labelled.tr_set$label)

# Fraction of non-missing training values per column, and the columns where
# fewer than 10% of the values are present.
goodrows <- 1 - colSums(sapply(dffeatures.labelled.tr_set, is.na)) / nrow(dffeatures.labelled.tr_set)
names(goodrows[goodrows < 0.1])

# Report, for every factor column, the values that occur in the test split
# but not in the training split.
for (cc in colnames(dffeatures.labelled.tr_set)) {
  if (!is.factor(dffeatures.labelled.tr_set[, cc])) {
    next
  }
  setdiff_ <- setdiff(dffeatures.labelled.ts_set[, cc],
                      dffeatures.labelled.tr_set[, cc])
  if (length(setdiff_) > 0) {
    print(cc)
    print(setdiff_)
  }
}
# GLMNET ---------------------------------------------------------------------
library(glmnet)
# Using glmnet to directly perform CV
set.seed(0)
# Design matrix of all predictors (everything except the label), without an
# intercept column.
# NOTE(review): model.matrix() presumably drops rows with missing values, in
# which case x_train and the label vector would differ in length - confirm
# that the "nona" table really has no NAs.
x_train <- model.matrix( ~ .-1, dffeatures.labelled.tr_set[,!(colnames(dffeatures.labelled.tr_set) %in% c("label"))])
dim(x_train)
cvob1 <- cv.glmnet(x=x_train,
                   y=dffeatures.labelled.tr_set[,"label"],
                   family="binomial",alpha=1,
                   type.measure="auc", nfolds = 5, lambda = seq(0.001,0.1,by = 0.001),
                   standardize=FALSE)
plot(cvob1)

# The same model through caret: 5-fold cross-validation over an alpha/lambda
# grid, selected on the "ROC" metric.
control <- trainControl(method="cv", number=5, returnResamp="all",
                        classProbs=TRUE, summaryFunction=twoClassSummary)
#classProbs = TRUE
tuneGrid <- expand.grid(alpha=c(0.00, 0.25, 0.50, 0.75, 0.99, 1.00), lambda = 10^seq(-5,-2,0.5))

# The tuning is repeated five times; every fit, its selected parameters and
# its best ROC value are kept.
tune <- list()
fits <- list()
rocs <- list()
for (ii in 1:5){
  glmnetFit <- train(label ~ ., data = dffeatures.labelled.tr_set,
                     method = "glmnet",
                     na.action = na.pass,
                     tuneGrid=tuneGrid,
                     metric = "ROC",
                     trControl = control)
  fits[[ii]] <- glmnetFit
  tune[[ii]] <- glmnetFit$bestTune
  rocs[[ii]] <- max(glmnetFit$results$ROC)
}
tune
# NOTE(review): everything below uses the fit of the last repetition, not the
# repetition with the highest ROC in `rocs` - confirm this is intended.
varImp(glmnetFit, scale=TRUE)
as.data.frame(glmnetFit$bestTune)
# The format string had no "%s", so the date argument was silently dropped;
# the date is now part of the file name, as for the models saved by the
# sibling script.
saveRDS(glmnetFit, sprintf("glmnet_%s.rds", Sys.Date()))
## Save predictions ---------------------------------------------------------
# Probability of the 'special' class for every row of the header table.
dffeatures[,"predictions_glmnet"] <- predict(glmnetFit, newdata = dffeatures, type = "prob", na.action = na.pass)$special
write.table(dffeatures[,c("predictions_glmnet"), drop=FALSE],
            file="all_predictions_glmnet.tab", quote=FALSE, sep='\t')
+763
View File
@@ -0,0 +1,763 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Applications/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
" return f(*args, **kwds)\n",
"/Applications/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
" return f(*args, **kwds)\n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read labels"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"tabledir = \"../tables/\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(772423, 1)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fn = f\"{tabledir}/2017-06-mammo_tables/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl.csv.gz\"\n",
"df_bt = pd.read_csv(fn, usecols=[\"id\", \"BT_case\"])\n",
"df_bt.set_index(\"id\", inplace=True)\n",
"df_bt = ~df_bt.isnull()\n",
"df_bt.columns = [\"digital\"]\n",
"df_bt.shape"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>set</th>\n",
" <th>label</th>\n",
" <th>view</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1013372709_1.2.840.113654.2.70.1.175625299786291545159233542096043464711_3_1</th>\n",
" <td>test</td>\n",
" <td>normal</td>\n",
" <td>N</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1028995243_1.2.840.113654.2.70.1.56947963181878834591544466761404805157_45576_2</th>\n",
" <td>test</td>\n",
" <td>normal</td>\n",
" <td>N</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1105112884_1.2.840.113654.2.70.1.178729598744204462442695104630823323474_8905_2</th>\n",
" <td>test</td>\n",
" <td>normal</td>\n",
" <td>N</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1185125156_1.2.840.113654.2.70.1.45840593750642722243371816041014016032_2_4</th>\n",
" <td>test</td>\n",
" <td>normal</td>\n",
" <td>N</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1496452586_1.2.840.113654.2.70.1.5582568668770891599992528318631583880_1351_4</th>\n",
" <td>test</td>\n",
" <td>normal</td>\n",
" <td>N</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" set label view\n",
"id \n",
"1013372709_1.2.840.113654.2.70.1.17562529978629... test normal N\n",
"1028995243_1.2.840.113654.2.70.1.56947963181878... test normal N\n",
"1105112884_1.2.840.113654.2.70.1.17872959874420... test normal N\n",
"1185125156_1.2.840.113654.2.70.1.45840593750642... test normal N\n",
"1496452586_1.2.840.113654.2.70.1.55825686687708... test normal N"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"infile = f\"{tabledir}/spotmag_predictions/train_test_split-2018-02-16-within7e5-label.csv\"\n",
"dflab = pd.read_csv(infile, index_col='id')\n",
"dflab[:5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read header-based predictions"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(772367, 1)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"infile = f\"{tabledir}/spotmag_predictions/all_predictions_glmnet.tab\"\n",
"dfpred_glmnet = pd.read_table(infile, index_col=0)\n",
"dfpred_glmnet.columns = [cc.replace(\"predictions\", \"score\") for cc in dfpred_glmnet.columns]\n",
"dfpred_glmnet.shape"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(772367, 5)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>score_gbm</th>\n",
" <th>score_xgb</th>\n",
" <th>score_rpart</th>\n",
" <th>score_xgbt</th>\n",
" <th>ViewModifierCodeMeaning</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149405_2104556</th>\n",
" <td>0.009005</td>\n",
" <td>0.020207</td>\n",
" <td>0.006882</td>\n",
" <td>0.059474</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149405_2104557</th>\n",
" <td>0.013337</td>\n",
" <td>0.016762</td>\n",
" <td>0.006882</td>\n",
" <td>0.059660</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149484_2141538</th>\n",
" <td>0.013337</td>\n",
" <td>0.016762</td>\n",
" <td>0.006882</td>\n",
" <td>0.061051</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149484_2141537</th>\n",
" <td>0.013337</td>\n",
" <td>0.016762</td>\n",
" <td>0.006882</td>\n",
" <td>0.061051</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3337971863_1.2.840.113654.2.70.1.337982194343327746313656933304494759333_1_1</th>\n",
" <td>0.031560</td>\n",
" <td>0.059142</td>\n",
" <td>0.006882</td>\n",
" <td>0.157488</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" score_gbm score_xgb \\\n",
"id \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.009005 0.020207 \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.013337 0.016762 \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.013337 0.016762 \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.013337 0.016762 \n",
"3337971863_1.2.840.113654.2.70.1.33798219434332... 0.031560 0.059142 \n",
"\n",
" score_rpart score_xgbt \\\n",
"id \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.006882 0.059474 \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.006882 0.059660 \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.006882 0.061051 \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.006882 0.061051 \n",
"3337971863_1.2.840.113654.2.70.1.33798219434332... 0.006882 0.157488 \n",
"\n",
" ViewModifierCodeMeaning \n",
"id \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... NaN \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... NaN \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... NaN \n",
"2454166001_1.2.840.113654.2.70.1.26994792635520... NaN \n",
"3337971863_1.2.840.113654.2.70.1.33798219434332... NaN "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"infile = f\"{tabledir}/spotmag_predictions/all_predictions_allmodels_trained_on_train.tab\"\n",
"dfpred = pd.read_table(infile, index_col=0)\n",
"dfpred.columns = [cc.replace(\"predictions\", \"score\") for cc in dfpred.columns]\n",
"dfpred.index.name = 'id'\n",
"print(dfpred.shape)\n",
"dfpred[:5]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(772367, 8)\n"
]
}
],
"source": [
"if 'set' not in dfpred.columns:\n",
" dfpred = dfpred.merge(dflab, left_index=True, right_index=True, how='left')\n",
" print(dfpred.shape)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"normal 3526\n",
"magn/spot 572\n",
"wire loc 57\n",
"stereotactic 25\n",
"other 9\n",
"Name: view, dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"colmap = {\"N\":\"normal\", \"M\": \"magn/spot\",\n",
" \"T\":\"stereotactic\", \"W\":\"wire loc\", \"X\":\"other\"}\n",
"view_counts = dfpred[~dfpred.view.isnull()].view.map(lambda x: colmap[x]).value_counts()\n",
"view_counts"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>set</th>\n",
" <th>train</th>\n",
" <th>test</th>\n",
" <th>val</th>\n",
" </tr>\n",
" <tr>\n",
" <th>view</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>magn/spot</th>\n",
" <td>380</td>\n",
" <td>96</td>\n",
" <td>96</td>\n",
" </tr>\n",
" <tr>\n",
" <th>normal</th>\n",
" <td>2310</td>\n",
" <td>612</td>\n",
" <td>604</td>\n",
" </tr>\n",
" <tr>\n",
" <th>other</th>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>stereotactic</th>\n",
" <td>17</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>wire loc</th>\n",
" <td>37</td>\n",
" <td>11</td>\n",
" <td>9</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"set train test val\n",
"view \n",
"magn/spot 380 96 96\n",
"normal 2310 612 604\n",
"other 4 3 2\n",
"stereotactic 17 4 4\n",
"wire loc 37 11 9"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.crosstab(dfpred[~dfpred.view.isnull()].view.map(lambda x: colmap[x]), dfpred.set)[[\"train\", \"test\", \"val\"]]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read image-based predictions (general)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"../tables//spotmag_predictions/predictions_images_4189-epoch55-e5ce2d69b035975cb5336cec0da9a32a.csv\n"
]
},
{
"data": {
"text/plain": [
"Index(['score_image', 'score_image_max'], dtype='object')"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tag = \"e5ce2d69b035975cb5336cec0da9a32a\"\n",
"epoch = 55\n",
"infile = f\"{tabledir}/spotmag_predictions/predictions_images_4189-epoch{epoch}-{tag}.csv\"\n",
"# infile = f\"{tabledir}/spotmag_predictions/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl-spotmag_img_prediction-{tag}.csv\"\n",
"print(infile)\n",
"dfpred_img = pd.read_csv(infile, index_col=0)\n",
"dfpred_img = dfpred_img[['score_image', 'score_image_max']]\n",
"dfpred_img = dfpred_img.groupby(level=0).mean()\n",
"dfpred_img.columns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read image-based predictions (wire localization)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"infile = f\"{tabledir}/spotmag_predictions/predictions_wire_combined_e8e71fc090141d7c6fb334359152d295.csv\"\n",
"\n",
"dfpred_imgwire = pd.read_csv(infile, index_col=0)\n",
"dfpred_imgwire[\"score_wire_max\"] = 1-dfpred_imgwire[[\"scores_0_or\",\"scores_0_fl\"]].min(1)\n",
"dfpred_imgwire = dfpred_imgwire.drop([\"scores_0_or\",\"scores_0_fl\", \"label\"], axis=1)\n",
"dfpred_imgwire.columns = [cc.replace(\"scores\", \"score_wire\") for cc in dfpred_imgwire.columns]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(772367, 13)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"if 'score_image' not in dfpred.columns:\n",
" dfpred = pd.concat([dfpred, dfpred_img], axis=1)\n",
" dfpred.index.name = 'id'\n",
" del dfpred_img\n",
" \n",
"if 'score_glmnet' not in dfpred.columns:\n",
" dfpred = pd.concat([dfpred, dfpred_glmnet], axis=1)\n",
" dfpred.index.name = 'id'\n",
" del dfpred_glmnet\n",
" \n",
"if 'score_wire' not in dfpred.columns:\n",
" dfpred = pd.concat([dfpred, dfpred_imgwire], axis=1)\n",
" dfpred.index.name = 'id'\n",
" del dfpred_imgwire\n",
"\n",
"dfpred.shape"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"if 'label' not in dfpred.columns:\n",
" dfpred = pd.concat([dfpred, dflab], axis=1)\n",
"if 'digital' not in dfpred.columns:\n",
" dfpred = pd.concat([dfpred, df_bt], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>score_image</th>\n",
" <th>False</th>\n",
" <th>True</th>\n",
" </tr>\n",
" <tr>\n",
" <th>score_wire</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>False</th>\n",
" <td>3584</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>True</th>\n",
" <td>605</td>\n",
" <td>768234</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"score_image False True \n",
"score_wire \n",
"False 3584 0\n",
"True 605 768234"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.crosstab(dfpred[\"score_wire\"].isnull(), dfpred[\"score_image\"].isnull())"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"dfpred.rename(columns={\"score_xgbt\":\"score_gbmt\"}, inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Add ensembled (max, avg) scores"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"dfpred['score_wire'] = dfpred['score_wire'].fillna(0)\n",
"dfpred['score_wire_max'] = dfpred['score_wire_max'].fillna(0)\n",
"dfpred['score_image+glmnet'] = (dfpred['score_image'] + dfpred['score_glmnet'])/2\n",
"dfpred['score_image+gbmt'] = (dfpred['score_image'] + dfpred['score_gbmt'])/2\n",
"\n",
"dfpred['score_max(image;gbmt)'] = dfpred[['score_image','score_gbmt']].max(1)\n",
"\n",
"dfpred['score_image*glmnet'] = np.sqrt(dfpred['score_image'] * dfpred['score_glmnet'])\n",
"dfpred['score_image*gbmt'] = np.sqrt(dfpred['score_image'] * dfpred['score_gbmt'])\n",
"dfpred['score_max_image_wire'] = np.nanmax(dfpred[['score_image','score_wire']].values, axis=1)\n",
"dfpred['score_max_image_wire_max'] = np.nanmax(dfpred[['score_image','score_wire_max']].values, axis=1)\n",
"# dfpred['score_wire'].isnull()\n",
"dfpred['score_max_image_wire+gbmt'] =(dfpred['score_max_image_wire'] + dfpred['score_gbmt'])/2\n",
"\n",
"dfpred['score_max_image_wire_max+gbmt'] =(dfpred['score_max_image_wire_max'] + dfpred['score_gbmt'])/2\n",
"\n",
"dfpred['score_max(image;wire_max;gbmt)'] = dfpred[['score_wire_max','score_gbmt', 'score_image']].max(1)\n",
"\n",
"dfpred['score_max_wire_image+gbmt'] = np.nanmax(dfpred[['score_image+gbmt','score_wire']].values, axis=1)\n",
"\n",
"dfpred['score_max_wire_max_image+gbmt'] = np.nanmax(dfpred[['score_image+gbmt','score_wire_max']].values, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"dfpred.rename(columns={\"ViewModifierCodeMeaning\":\"ViewModifier\"}, inplace=True)\n",
"dfpred.index.name = 'id'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save the combined table"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"772423"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(dfpred)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"dfpred.to_csv(f'{tabledir}/all_predictions_with_images-{tag}.tab', sep='\\t')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@@ -0,0 +1,98 @@
# coding: utf-8
import numpy as np
import pandas as pd
import dicom
from warnings import warn
def get_tuples(plan, outlist = None, key = ""):
    """Flatten a DICOM dataset into a list of ``(field_name, value)`` tuples.

    Parameters
    ----------
    plan : object exposing ``dir()`` and attribute access for each listed name
        (the script passes the result of ``dicom.read_file``).
    outlist : list or None
        List to append to.  A new list is created only when ``None`` is given.
    key : str
        Prefix for the generated field names; used by the recursive calls.

    Returns
    -------
    list of (str, value) tuples; ``PixelData`` is always skipped.
    """
    if len(key) > 0:
        key = key + "_"
    # ``if not outlist`` also replaced an *empty* list handed in by the caller,
    # so the caller's list was silently left untouched.
    if outlist is None:
        outlist = []
    for aa in plan.dir():
        if not hasattr(plan, aa) or aa == 'PixelData':
            continue
        value = getattr(plan, aa)
        if type(value) is dicom.sequence.Sequence:
            for nn, ss in enumerate(list(value)):
                # ``key`` already ends with "_", so nested names carry a double
                # underscore (e.g. "0_ViewCodeSequence__0_...").  Other scripts
                # look columns up by exactly these names, so this is kept.
                if len(key):
                    newkey = "_".join([key, ("%d" % nn), aa])
                else:
                    newkey = "_".join([("%d" % nn), aa])
                outlist.extend(get_tuples(ss, outlist = None, key = newkey))
        else:
            # Convert DICOM value wrappers into plain Python values.
            if type(value) is dicom.valuerep.DSfloat:
                value = float(value)
            elif type(value) is dicom.valuerep.IS:
                value = str(value)
            elif type(value) is dicom.valuerep.MultiValue:
                value = tuple(value)
            elif type(value) is dicom.UID.UID:
                value = str(value)
            outlist.append((key + aa, value))
    return outlist
def filter_row_common_field(row, common_fields):
    """Remove from ``row`` (in place) every key absent from ``common_fields``.

    Returns the same dict object that was passed in.
    """
    unwanted = [name for name in row.keys() if name not in common_fields]
    for name in unwanted:
        row.pop(name)
    return row
# The triple-quoted block below is a bare string literal, i.e. disabled code:
# it derived `valid_fields` as the columns populated in more than 5% of rows.
"""
fn_allheaders = '/home/dlituiev/data_dlituiev/manuallabeller/filelist/dicom_headers_all_fields_filelist_nonscreening_4000_seed42.csv'
df_allheaders = pd.read_csv(fn_allheaders, index_col=0)
"at least 5% of rows are there"
thr = 0.05
valid_fields = (~df_allheaders.isnull()).mean() > thr
valid_fields = valid_fields[valid_fields].index.tolist()
print(len(valid_fields))
"""
# Names of the header fields to export, read from a header-less table.
# NOTE(review): `squeeze=` was removed from pandas readers in 2.0 -- confirm the pandas version in use.
valid_fields = pd.read_table("/data/dlituiev/learn_spotmag_from_dicom_headers/LogisticRegression_common_fields_names.tab",
header=None,
squeeze=True).values
#filelist_fn = '/home/dlituiev/data_dlituiev/tables/df_newest_mammos.pickle'
filelist_fn = "/home/dlituiev/data_dlituiev/tables/2017-06-mammo_tables/df_original_mammos.pickle"
# Unique DICOM file paths taken from the "Filename" column of the pickled table.
filelist = pd.read_pickle(filelist_fn, )["Filename"].unique().tolist()
# No-op outside an interactive session (value is discarded).
len(filelist)
# Rows are buffered in memory and appended to the output every BUFFER_N_LINES files.
BUFFER_N_LINES = 100
SEP = '\t'
outpath = filelist_fn.replace('.pickle','') + '_dicom_headers_selected.tab'
final_columns = ['filename'] + list(valid_fields)
print("len(final_columns)", len(final_columns) )
print('saving to %s' % outpath)
with open(outpath, 'w+') as outfh:
# Header line is written once; later chunks are appended without a header.
outfh.write(SEP.join(final_columns) + '\n')
headerlist = []
for nn, ff in enumerate(filelist):
# Flush the buffered rows before handling file number nn (every BUFFER_N_LINES-th file).
if nn% BUFFER_N_LINES == (BUFFER_N_LINES-1):
df_hl = pd.DataFrame( headerlist, columns=final_columns)
df_hl.to_csv(outfh, sep=SEP, header=None, index=None, mode = 'a')
outfh.flush()
del df_hl
print(nn+1)
headerlist = []
try:
plan = dicom.read_file(ff)
row = get_tuples(plan)
row = dict(row)
# One output row: file name followed by each selected field (NaN when the field is absent).
row = tuple([ff] + [(row[kk] if (kk in row) else np.nan) for kk in valid_fields ])
# NOTE(review): prints once per file -- looks like leftover debugging output.
print("len(row)", len(row))
headerlist.append(row)
except Exception as ex:
# Best-effort: a failing file is reported with a warning and skipped.
# raise ex
warn('header extraction failed on #\t%s\t%s\t%s' % (nn, ff, ex))
# in the end, print the rest:
df_hl = pd.DataFrame( headerlist, columns=final_columns)
df_hl.to_csv(outfh, sep=SEP, header=None, index=None, mode = 'a')
outfh.flush()
print("DONE")
+798
View File
@@ -0,0 +1,798 @@
# coding: utf-8
import numpy as np
import pandas as pd
import os
from functools import partial
from itertools import chain
def entropy(x):
    """Return ``sum(n * log2(n))`` over the counts ``n`` of each distinct value of ``x``.

    Missing values are not counted (``value_counts`` default).  Note that this
    is *not* the normalised Shannon entropy: the score is 0 when every value is
    unique and grows as values repeat.
    """
    counts = x.value_counts()
    # f.loc["nan"] = x.isnull().sum()
    weighted = counts * np.log2(counts)
    return weighted.sum()
def select_text_fields(df_allheaders):
    """Return the names of object-dtype columns that look usable as text features.

    A column is kept when
      * more than 5% of its rows are non-null, and
      * its ``entropy`` score is at least 1000, and
      * it has more than one distinct value, and
      * it does not have more distinct values than 75% of the number of rows.

    Parameters
    ----------
    df_allheaders : pandas.DataFrame

    Returns
    -------
    list of str
    """
    # `pd.np` no longer exists in current pandas; compare dtypes by equality.
    is_text = df_allheaders.dtypes.map(lambda x: x == np.dtype(object))
    text_fields = is_text[is_text].index.tolist()
    populated = (~df_allheaders[text_fields].isnull()).mean() > 0.05
    text_fields = populated[populated].index.tolist()

    # The original compared against shape[1] (number of columns); a "too many
    # distinct values" test only makes sense relative to the number of rows.
    n_rows = df_allheaders.shape[0]
    kept = []
    for tt in text_fields:
        numunique = len(df_allheaders[tt].unique())
        entr = entropy(df_allheaders[tt])
        # The original `entr<1000 | a | b` parsed as `entr < (1000 | a | b)`
        # because `|` binds tighter than `<`, so the last two tests never applied.
        if (entr < 1000) or (numunique == 1) or (numunique > 0.75 * n_rows):
            continue
        kept.append(tt)
    return kept
def get_good_numeric_fields(df_allheaders, thr_stderr = 1e-6):
    """Return numeric columns whose relative spread ``|std / mean|`` exceeds ``thr_stderr``.

    Parameters
    ----------
    df_allheaders : pandas.DataFrame
    thr_stderr : float
        Minimum coefficient of variation for a column to be kept.

    Returns
    -------
    list of column names, in the frame's column order.
    """
    # numeric_only=True: recent pandas raises on text columns instead of
    # silently skipping them.
    spread = df_allheaders.std(numeric_only=True)
    centre = df_allheaders.mean(numeric_only=True)
    # abs(): without it every column with a negative mean was rejected
    # regardless of how much it varies.
    rel_spread = (spread / centre).abs()
    return rel_spread[rel_spread > thr_stderr].index.tolist()
def get_index_from_int_tuple(x, ind):
    """Return element ``ind`` of a tuple given as text, converted to ``int``.

    Strings such as ``"(12, 34)"`` or ``"('12', '34')"`` are parsed; any
    non-string input is returned unchanged.
    """
    if type(x) is str:
        import ast
        # literal_eval only accepts Python literals, so header text can no
        # longer execute arbitrary code the way eval() allowed.
        parsed = ast.literal_eval(x)
        return int(float(parsed[ind]))
    return x
def clean_up_field_list(field_list,
                        prefices_remove = ["date", "accession", "number",
                                           "Filename",
                                           "ImageLaterality",
                                           "GantryID",
                                           #"0_ViewCodeSequence_CodeMeaning",
                                           "ViewCodeSequence_CodeMeaning",
                                           "ViewModifierCodeSequence_CodeValue",
                                           "EthnicGroup",
                                           "BodyPartExamined",
                                           "LossyImageCompression",
                                           "DeidentificationMethodCodeSequence",
                                           "UID",
                                           'EntranceDoseInmGy',
                                           'ProcedureCodeSequence_CodeMeaning',
                                           'CommentsOnRadiationDose',
                                           'DetectorID',
                                           'SeriesDescription', # potentially informative but too many values
                                           'SoftwareVersions',
                                           'PatientAge',
                                           ],
                        fields_remove = [ 'PatientID', 'PatientName', "BitsStored",
                                          'AcquisitionTime',
                                          'AdmittingTime',
                                          'ScheduledStudyStartTime',
                                          'InstanceCreationTime',
                                          'PerformedProcedureStepStartTime',
                                          'PregnancyStatus',
                                          'StudyArrivalTime',
                                          'StudyCompletionTime',
                                          'StudyTime',
                                          'TimeOfLastCalibration',
                                          'TimeOfLastDetectorCalibration',
                                          'TimeOfSecondaryCapture',]):
    """Remove unwanted field names from ``field_list`` (in place) and return it.

    Parameters
    ----------
    field_list : list of str
        Candidate field names; modified in place.
    prefices_remove : list of str
        A field is dropped when any of these occurs anywhere in its name
        (case-insensitive substring match).
    fields_remove : list of str
        Exact field names to drop.

    Returns
    -------
    The same ``field_list`` object, with the unwanted names removed.
    """
    lowered = [x.lower() for x in prefices_remove]
    # Work on a private set: the original appended matches to the default
    # `fields_remove` list itself, so removals leaked from one call to the next.
    to_remove = set(fields_remove)
    for ff in field_list:
        name = ff.lower()
        if any(pp in name for pp in lowered):
            to_remove.add(ff)
    # Slice assignment keeps the caller's list object (in-place contract).
    field_list[:] = [ff for ff in field_list if ff not in to_remove]
    return field_list
def make_lowercase_text_fields(df_allheaders):
    """Lower-case the string values of every object-dtype column, in place.

    The first column is skipped.  Non-string values inside object columns
    (tuples, numbers, NaN) are left untouched.

    Returns the same DataFrame.
    """
    for cname in df_allheaders.columns[1:]:
        cc = df_allheaders[cname]
        if cc.dtype == np.dtype(object):
            # `.str.lower()` turned every non-string element into NaN, wiping
            # e.g. tuple-valued columns; lower-case only real strings.
            df_allheaders[cname] = cc.map(
                lambda v: v.lower() if isinstance(v, str) else v)
    return df_allheaders
def format_PixelSpacing(x):
    """Reduce a pixel-spacing header value to a single number.

    Numeric input is returned unchanged.  Text such as ``"('0.07', '0.07')"``
    is split on commas and the smallest component is returned (NaN when the
    text holds no component at all).
    """
    import numbers
    # `type(x) is float` missed ints and numpy scalars, which then crashed on
    # the string methods below.
    if isinstance(x, numbers.Real):
        return x
    cleaned = str(x).strip("()[]").replace("'", "").replace(" ", "")
    # Skip empty tokens so one-element tuples like "(0.07,)" parse.
    parts = [float(tok) for tok in cleaned.split(",") if tok]
    if not parts:
        return np.nan
    # np.unique sorts, so element 0 is the minimum.
    return np.unique(parts)[0]
def parse_float(x):
    """Clean the text form of ``x`` so that ``float()`` can be applied to it.

    Removes single quotes and the letter "b", and rewrites "None" as "nan".
    Returns the cleaned *string* (not a float), or ``np.nan`` when nothing is
    left after cleaning.
    """
    text = str(x)
    for old, new in (("'", ""), ("b", ""), ("None", "nan")):
        text = text.replace(old, new)
    return np.nan if text == "" else text
def parse_float_tuples(x, to_int=False):
    """Extract every number found in the text form of ``x``.

    Parameters
    ----------
    x : any
        Converted with ``str``; e.g. ``"('12', '34')"`` or ``"0.1\\0.2"``.
    to_int : bool
        When true, each number is truncated to ``int``.

    Returns
    -------
    tuple of float (or int); empty when ``x`` contains no number.
    """
    import re
    # The original treated every non-digit, non-"." character as a separator,
    # so "-1.5" lost its sign and "1e-05" became (1.0, 5.0).  A sign counts
    # only when it does not directly follow a digit ("10-20" -> 10, 20).
    number = r"(?<!\d)[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"
    tokens = re.findall(number, str(x))
    if to_int:
        return tuple(int(float(tok)) for tok in tokens)
    return tuple(float(tok) for tok in tokens)
def parse_float_tuples_prod(x):
    """Return the product of all numbers found in ``x`` (e.g. width * height).

    ``None``, NaN, empty input, and text without any number yield ``np.nan``.
    """
    if x is None:
        return np.nan
    # The original relied on `x in (None, np.nan)`, which only recognises the
    # np.nan object itself, and then called len() on floats (TypeError).
    if isinstance(x, float) and x != x:
        return np.nan
    if hasattr(x, "__len__") and len(x) == 0:
        return np.nan
    values = parse_float_tuples(str(x))
    if len(values) == 0:
        # np.prod(()) is 1.0, which would look like a real measurement.
        return np.nan
    return np.prod(values)
def parse_int_tuples_median(x):
    """Return the median of the numbers that ``parse_float_tuples`` finds in ``x``."""
    return np.median(parse_float_tuples(x))
"""
def parse_float_tuples(x):
x = eval(x) if type(x) is str else x
if type(x) in [tuple, list]:
x = tuple([float(y) for y in x])
return x
"""
def parse_str_tuples(x):
    """Turn the text form of a tuple back into a tuple.

    Strings holding a Python literal (``"('A', 'B')"``) are parsed as such;
    any other string is split on single spaces.  Non-string input is returned
    unchanged.
    """
    if type(x) is not str:
        return x
    import ast
    try:
        # literal_eval instead of eval: header text must never be executed,
        # and bare words such as "os" no longer resolve to module globals.
        return ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return tuple(x.split(" "))
#############################33
def extract_list_text_field(df_allheaders, colprefix = "ViewModifierCodeSequence_CodeMeaning"):
    """Replace the ``*colprefix*`` text columns by one boolean column per value.

    Every column whose name contains ``colprefix`` (but is not exactly
    ``colprefix``) is scanned for its distinct string values.  For each value
    ``v`` a column ``colprefix + "_" + v`` (spaces removed) is created that is
    True on rows where ``v`` occurs as a substring in any of the scanned
    columns.  The scanned columns are dropped from ``df_allheaders`` in place.

    Returns
    -------
    pandas.DataFrame : the remaining columns followed by the indicator columns
    (sorted by value, so the column order is reproducible).
    """
    cols = [cc for cc in df_allheaders.columns
            if colprefix in cc and cc != colprefix]

    # Only string values can name an indicator; the original removed just
    # True/False and crashed on any other non-string value.
    values = set()
    for cc in cols:
        values.update(vv for vv in df_allheaders[cc].dropna().unique()
                      if isinstance(vv, str))

    indicators = {}
    # sorted(): iterating a set of strings gave a column order that could
    # change from run to run.
    for kk in sorted(values):
        hit = pd.Series(False, index=df_allheaders.index)
        for cc in cols:
            found = df_allheaders[cc].map(
                lambda x, kk=kk: kk in x if type(x) is str else False)
            hit = hit | found.astype(bool)
        name = colprefix + "_" + kk.replace(" ", "")
        # Two values that differ only in spaces share a name: merge them.
        indicators[name] = (indicators[name] | hit) if name in indicators else hit

    indicator_frame = pd.DataFrame(indicators, index=df_allheaders.index)
    for cc in cols:
        df_allheaders.drop(cc, axis=1, inplace=True)
    return pd.concat([df_allheaders, indicator_frame], axis=1)
#############################33
def normalize_fields(df_allheaders):
    """Clean up individual DICOM header columns so they can be used as features.

    Each step runs only when its column is present.  The frame passed in is
    modified in place and the (possibly re-created) frame is returned.

    Summary of the conversions
      * PatientAge ("045Y") -> integer years, NaN when unparsable
      * DetectorActiveDimensions -> product of its components
      * PixelSpacing / ImagerPixelSpacing -> single number
      * ModalitiesInStudy, ViewPosition, BreastImplantPresent, PartialView -> bool
      * ViewModifierCodeSequence_CodeMeaning columns -> boolean indicators
      * WindowWidth / WindowCenter -> median of the listed values
      * FieldOfViewOrigin -> FieldOfViewOrigin_x / _y
      * ImageType -> one boolean column per keyword
      * missing FocalSpots -> mode; a few numeric columns -> median
    """
    columns = df_allheaders.columns

    if "PatientAge" in columns:
        def _age_in_years(value):
            # The original called value.lower() directly and crashed on NaN.
            try:
                return int(float(str(value).lower().rstrip('y')))
            except (ValueError, OverflowError):
                return np.nan
        df_allheaders["PatientAge"] = df_allheaders["PatientAge"].map(_age_in_years)

    if "DetectorActiveDimensions" in columns:
        df_allheaders["DetectorActiveDimensions"] = \
            df_allheaders["DetectorActiveDimensions"].map(parse_float_tuples_prod)

    for kk in ["PixelSpacing", "ImagerPixelSpacing"]:
        if kk in columns:
            df_allheaders[kk] = df_allheaders[kk].map(format_PixelSpacing)

    # Text tests below are case-insensitive: get_features() lower-cases text
    # only *after* this function ran, so raw values such as "MG", "CC" or
    # "YES" never matched the lower-case literals.
    if "ModalitiesInStudy" in columns:
        df_allheaders["ModalitiesInStudy"] = \
            df_allheaders["ModalitiesInStudy"].map(lambda x: "mg" in str(x).lower())

    if "HalfValueLayer" in columns:
        df_allheaders["HalfValueLayer"] = df_allheaders["HalfValueLayer"].map(
            lambda x: x if isinstance(x, float)
            else float(str(x).replace('b', '').replace("'", '')))

    # ### FieldOfViewDimensions
    # computing area and filling in the gaps with the mode **worsens** the FNR

    if "ViewPosition" in columns:
        df_allheaders["ViewPosition"] = \
            df_allheaders["ViewPosition"].map(lambda x: str(x).lower() in ['cc', 'mlo'])

    df_allheaders = extract_list_text_field(
        df_allheaders, colprefix = "ViewModifierCodeSequence_CodeMeaning")

    if "BreastImplantPresent" in df_allheaders.columns:
        df_allheaders['BreastImplantPresent'] = \
            df_allheaders["BreastImplantPresent"].map(lambda x: "yes" in str(x).lower())

    if "PartialView" in df_allheaders:
        df_allheaders["PartialView"] = df_allheaders["PartialView"].map(
            lambda x: "yes" in x.lower() if type(x) is str else False)

    for kk in ["WindowWidth", "WindowCenter"]:
        if kk in df_allheaders.columns:
            df_allheaders[kk] = df_allheaders[kk].map(parse_int_tuples_median)

    if "PatientOrientation" in df_allheaders.columns:
        df_allheaders["PatientOrientation"] = \
            df_allheaders["PatientOrientation"].map(parse_str_tuples)

    if "DetectorElementPhysicalSize" in df_allheaders.columns:
        df_allheaders["DetectorElementPhysicalSize"] = \
            df_allheaders["DetectorElementPhysicalSize"].map(parse_float_tuples)

    # ### Grid
    if "Grid" in df_allheaders.columns:
        # Strip tuple punctuation and repair the vendor typo in either case.
        # (The original followed this with a comparison against the
        # un-stripped text, which could never match once this had run.)
        df_allheaders["Grid"] = (df_allheaders["Grid"]
                                 .map(str)
                                 .map(lambda x: x.replace('(', '')
                                                 .replace(')', '')
                                                 .replace("'", "")
                                                 .replace(',', '')
                                                 .replace("parrallel", "parallel")
                                                 .replace("PARRALLEL", "PARALLEL")))

    if "FieldOfViewOrigin" in df_allheaders.columns:
        origin = df_allheaders["FieldOfViewOrigin"]
        df_allheaders["FieldOfViewOrigin_x"] = origin.map(lambda x: get_index_from_int_tuple(x, 0))
        df_allheaders["FieldOfViewOrigin_y"] = origin.map(lambda x: get_index_from_int_tuple(x, 1))
        df_allheaders.drop("FieldOfViewOrigin", axis=1, inplace=True)

    if "FocalSpots" in df_allheaders.columns:
        counts = df_allheaders["FocalSpots"].value_counts()
        if len(counts) > 0:
            # idxmax() returns the most frequent *value*; argmax() returns its
            # integer position in current pandas, which is not a focal spot.
            df_allheaders.loc[df_allheaders["FocalSpots"].isnull(), "FocalSpots"] = counts.idxmax()

    for kk in ["PixelSpacing", "EstimatedRadiographicMagnificationFactor",
               "XRayTubeCurrent", "DistanceSourceToPatient"]:
        if kk in df_allheaders.columns:
            df_allheaders.loc[df_allheaders[kk].isnull(), kk] = df_allheaders[kk].median()

    if "ImageType" in df_allheaders.columns:
        keywords = set()
        for item in df_allheaders["ImageType"].map(parse_str_tuples):
            # Missing rows come back as NaN; they crashed chain(*...) before.
            if isinstance(item, str):
                keywords.add(item)
            elif isinstance(item, (tuple, list, set)):
                keywords.update(word for word in item if isinstance(word, str))
        # discard(): remove("") raised KeyError when no empty keyword existed.
        keywords.discard("")
        for kk in sorted(keywords):
            df_allheaders["ImageType" + "_" + kk] = df_allheaders["ImageType"].map(
                lambda x, kk=kk: kk in x if isinstance(x, (str, tuple, list)) else False)
        df_allheaders.drop("ImageType", axis=1, inplace=True)
    return df_allheaders
def move_digits_back(allcolumns):
    """Move a leading index token to the end of each column name.

    Names starting with a digit are split on "_" and their first token is
    appended last ("0_A_B" -> "A_B_0"); other names are copied unchanged.
    Returns a new list.
    """
    digits = frozenset("0123456789")
    renamed = []
    for name in allcolumns:
        if name[0] in digits:
            tokens = name.split("_")
            name = "_".join(tokens[1:] + tokens[:1])
        renamed.append(name)
    return renamed
def get_features(df_allheaders, thr_stderr = 1e-6):
    """Build the model feature matrix from a table of raw DICOM headers.

    Steps: normalise the fields, select text and numeric columns, remove
    identifying or uninformative fields, one-hot encode the categorical
    columns, fill missing values with the column median, and drop one-hot
    columns that are almost constant.

    Parameters
    ----------
    df_allheaders : pandas.DataFrame
        Raw header table; a copy is processed, the input is not modified.
    thr_stderr : float
        Minimum relative spread for numeric columns.  When not positive the
        spread filter is skipped and all numeric/boolean columns are used
        (the original raised NameError here -- NOTE(review): confirm intent).

    Returns
    -------
    pandas.DataFrame with one row per input row and no missing values.
    """
    # df_allheaders.columns = move_digits_back(df_allheaders.columns)
    df_allheaders = normalize_fields(df_allheaders.copy())
    text_fields = select_text_fields(df_allheaders)
    if thr_stderr > 0:
        numeric_fields = get_good_numeric_fields(df_allheaders, thr_stderr=thr_stderr)
    else:
        numeric_fields = df_allheaders.select_dtypes(
            include=[np.number, bool]).columns.tolist()
    # sorted() keeps the resulting column order reproducible between runs.
    feature_columns = sorted(set(clean_up_field_list(numeric_fields + text_fields)))
    df_allheaders = make_lowercase_text_fields(df_allheaders)

    noncategorical = ['ContentTime',
                      'FieldOfViewOrigin_x',
                      'FieldOfViewOrigin_y',
                      'HalfValueLayer',
                      'WindowWidth',
                      'CompressionForce',
                      'DetectorActiveDimensions',
                      'RelativeXRayExposure',
                      'ExposureTime',
                      'Exposure',
                      'BodyPartThickness',
                      'FieldOfViewOrigin_y',
                      'CollimatorLowerHorizontalEdge',
                      'WindowCenter',
                      'FieldOfViewRotation',
                      'KVP',
                      'DistanceSourceToDetector',
                      'DistanceSourceToEntrance',
                      'CollimatorLeftVerticalEdge',
                      'DetectorTemperature',
                      'HighBit']
    categorical = ['Manufacturer',
                   'ManufacturerModelName',
                   'Grid_htc',
                   'ViewModifierCodeSequence_CodeMeaning',
                   'ViewModifierCodeSequence_CodeMeaning']
    noncategorical = sorted(set(feature_columns) & set(noncategorical))
    potentially_categorical = set(feature_columns) - set(noncategorical)
    potentially_categorical |= set(categorical) & set(df_allheaders.columns)
    potentially_categorical = sorted(potentially_categorical)
    print("potentially_categorical", len(potentially_categorical))
    print("non_categorical", len(noncategorical))

    # Continuous fields that were read as text are converted to float.
    for cc in noncategorical:
        if str(df_allheaders[cc].dtype) == 'object':
            df_allheaders[cc] = df_allheaders[cc].map(parse_float).astype(float)

    if len(potentially_categorical) > 0:
        df_allheaders[potentially_categorical] = \
            df_allheaders[potentially_categorical].fillna('unknown')
        features_onehot = pd.get_dummies(df_allheaders[potentially_categorical],
                                         drop_first=True, prefix_sep='=')
        # Remember which columns get_dummies created.  The original detected
        # them by dtype uint8, but recent pandas emits bool dummies, which
        # silently disabled the rare-feature filter below.
        dummy_cols = [cc for cc in features_onehot.columns
                      if cc not in potentially_categorical]
        features_onehot = pd.concat([features_onehot, df_allheaders[noncategorical]], axis=1)
    else:
        print("no features to binarise!")
        dummy_cols = []
        # The original referenced the undefined name `non_categorical` here.
        features_onehot = df_allheaders[noncategorical].copy()

    for cc in features_onehot.columns[features_onehot.isnull().any()]:
        print("filling in with median:\t%s" % cc)
        features_onehot.loc[features_onehot[cc].isnull(), cc] = \
            features_onehot[cc].median()
    # Columns that are still incomplete (e.g. entirely empty) are dropped.
    features_onehot = features_onehot.loc[:, ~features_onehot.isnull().any()]

    # Drop one-hot columns that are (almost) always off or always on.
    thr_frac = 0.01
    dummy_cols = [cc for cc in dummy_cols if cc in features_onehot.columns]
    if dummy_cols:
        counts = features_onehot[dummy_cols].sum(0)
        fracs = features_onehot[dummy_cols].mean(0)
        bad = (counts < 5) | (fracs < thr_frac) | (fracs > (1 - thr_frac))
        features_onehot = features_onehot.drop(bad[bad].index.tolist(), axis=1)

    if "FocalSpots" in features_onehot:
        missing = features_onehot["FocalSpots"].isnull()
        if missing.any():
            # idxmax() gives the most frequent value; argmax() is a position.
            features_onehot.loc[missing, "FocalSpots"] = \
                features_onehot["FocalSpots"].value_counts().idxmax()
    return features_onehot
#############################
# Exploratory driver that reads like an exported notebook: it loads the header
# table, builds features, fits a gradient-boosting classifier for
# "special view" and saves PR/ROC/feature-importance figures.
# NOTE(review): several statements below use names that are not defined at that
# point in the visible code (marked individually).
if __name__ == '__main__':
PREFIX="allfeatures"
# !sudo pip3 install dicom
# # read a table of DICOM headers
filelist_fn = '/home/dlituiev/data_dlituiev/manuallabeller/filelist/filelist_nonscreening_4000_seed42.csv'
# Despite its name, `outpath` is the *input* table read on the next statement.
outpath = os.path.join(os.path.dirname(filelist_fn), "dicom_headers_all_fields_" + os.path.basename(filelist_fn))
print(outpath)
df_allheaders = pd.read_csv(outpath, index_col=0)
features_onehot = get_features(df_allheaders)
# ## Read labels
fn_man_labels = "/data/dlituiev/tables/cleaned_manual_labels_valset_4000.txt"
df = pd.read_table(fn_man_labels, index_col=0)
# Keep only the last path component so the index can be joined with the features.
df.index = df.index.map(lambda x : x.split("/")[-1])
# process labels
df["special_view"] = df["regular_view"].map(lambda x: not x)
# Left join on the index: labelled rows without features keep NaN features.
dfm = pd.merge(df[["special_view"]], features_onehot, how='left', left_index=True, right_index=True)
dfm.shape
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic
plt.matplotlib.rcParams["hatch.color"] = [0.7]*3
dfm.var()
dfm.isnull().sum()
dfm.plot(x='special_view', y='XRayTubeCurrent', kind='scatter', alpha=0.05)
dfm.plot(x='special_view', y='DistanceSourceToPatient', kind='scatter', alpha=0.05)
dfm["special_view"].isnull().sum()
target = dfm["special_view"]
features = dfm.drop("special_view", axis=1)
from sklearn.utils import shuffle
# for building and visualizing the decision tree
from sklearn.naive_bayes import GaussianNB, BernoulliNB
# from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
# visualization
from vis_tree import visualize_tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (accuracy_score, auc, confusion_matrix, f1_score,
precision_score, roc_curve, precision_recall_curve)
# Two-stage split: 1/6 held out as validation, then 1/5 of the rest as test.
y_dev, y_val, X_dev, X_val = train_test_split(target, features, random_state=0, test_size=1/6)
y_tr, y_ts, X_tr, X_ts = train_test_split(y_dev, X_dev, random_state=0, test_size=1/5)
# dtree = DecisionTreeClassifier(max_depth=10, min_samples_leaf=5, criterion="entropy")
# dtree = RandomForestClassifier(min_samples_split=10, min_samples_leaf=5)
# dtree = AdaBoostClassifier(base_estimator=dtree, n_estimators=60, learning_rate=0.01)
# dtree = AdaBoostClassifier(base_estimator=GaussianNB(), n_estimators=50, learning_rate=0.01)
dtree = GradientBoostingClassifier(max_depth=8, n_estimators=40, learning_rate=0.05, min_samples_leaf=12)
# Class name of the estimator, used in the output file names.
modelname = str((dtree).__class__).split(".")[-1].rstrip(""" "'> """).lstrip('"')
dtree.fit(X_tr, y_tr)
pred_y_ts = dtree.predict(X_ts)
pred_yscore_ts = dtree.predict_proba(X_ts)
# NOTE(review): get_ipython() exists only inside IPython; this fails in a plain Python run.
get_ipython().magic('pinfo auc')
pr_, rec_, thresholds = precision_recall_curve(y_ts.tolist(), pred_yscore_ts[:,1], pos_label=1)
# auc_pr = auc(pr_, rec_)
plt.plot(pr_, rec_)
plt.xlabel('Precision')
plt.ylabel('Recall')
# plt.title('auPRC = {0:.2f}%'.format(auc_pr))
plt.xlim([0,1])
plt.ylim([0,1])
plt.axis('equal')
plt.axis('square')
# NOTE(review): `auc_` is first assigned further down, so this line raises NameError on a fresh run.
print("%.2f" % (100*auc_))
frmt = 'png'
# NOTE(review): same file name as the ROC figure saved below, so this image gets overwritten.
plt.savefig("{}_{}_auc.{}".format(PREFIX, modelname, frmt), dpi=300, format=frmt)
fpr_, tpr_, thresholds = roc_curve(y_ts.tolist(), pred_yscore_ts[:,1], pos_label=1)
# NOTE(review): `fnr_` is not defined anywhere visible; roc_curve returned `fpr_`.
auc_ = auc(fnr_, tpr_)
plt.plot(fpr_, tpr_)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# NOTE(review): the title appends "%" without multiplying by 100 (the print below does multiply).
plt.title('AUC = {0:.2f}%'.format(auc_))
plt.axis('equal')
plt.axis('square')
print("%.2f" % (100*auc_))
frmt = 'png'
plt.savefig("{}_{}_auc.{}".format(PREFIX, modelname, frmt), dpi=300, format=frmt)
# pd.DataFrame(dict(FNR=fnr_, TPR=tpr_, threshold=thresholds))
features.plot(x="EstimatedRadiographicMagnificationFactor", y="PixelSpacing", kind='scatter')
fig,ax = plt.subplots(1, figsize=(6,14))
# Non-zero feature importances, plotted with the largest bar on top.
feat_imp = pd.Series(dtree.feature_importances_, index=features.columns)
feat_imp = feat_imp[feat_imp>0.0].sort_values()[::-1]
feat_imp[::-1].plot(kind='barh', ax=ax)
print(feat_imp)
# plt.xlim([0,0.5])
# plt.tight_layout()
frmt = 'png'
plt.savefig("{}_{}_feature_importances.{}".format(PREFIX, modelname, frmt), dpi=300, format=frmt)
len(thresholds)
# pd.DataFrame(dict(
# FNR=fnr_,
# TPR=tpr_,
# threshold = thresholds))
# NOTE(review): Series.as_matrix() no longer exists in current pandas (use .to_numpy()) -- confirm target version.
df_confusion = pd.crosstab(pd.Series(y_ts.as_matrix(), name="observed"), pd.Series(pred_y_ts, name="predicted"))
df_confusion
confusion_matrix(y_ts, pred_y_ts)
cm = confusion_matrix(y_ts, pred_y_ts)
# False-negative rate on the test split: missed positives / all positives.
cm[1,0]/cm[1,:].sum()
def fnr(dtree, X_val, y_val, thr = None):
    """False-negative rate of ``dtree`` on ``(X_val, y_val)``.

    Parameters
    ----------
    dtree : fitted classifier with ``predict`` and ``predict_proba``
    X_val, y_val : features and true binary labels
    thr : float or None
        When given, a sample is predicted positive if its positive-class
        probability exceeds ``thr``; otherwise ``dtree.predict`` is used.

    Returns
    -------
    FN / (FN + TP), or 0.0 when there are no positive samples.
    """
    # `if not thr` treated thr=0.0 like "no threshold given".
    if thr is None:
        pred_y_val = dtree.predict(X_val)
    else:
        pred_y_val = dtree.predict_proba(X_val)[:,1] > thr
    cm = confusion_matrix(y_val, pred_y_val)
    n_positive = cm[1,:].sum()
    # Same guard as fpr(): avoid 0/0 when a fold holds no positive sample.
    if n_positive != 0:
        out = cm[1,0]/n_positive
    else:
        out = 0.0
    return out
def fpr(dtree, X_val, y_val, thr = None):
    """False-positive rate of ``dtree`` on ``(X_val, y_val)``.

    Parameters
    ----------
    dtree : fitted classifier with ``predict`` and ``predict_proba``
    X_val, y_val : features and true binary labels
    thr : float or None
        When given, a sample is predicted positive if its positive-class
        probability exceeds ``thr``; otherwise ``dtree.predict`` is used.

    Returns
    -------
    FP / (FP + TN), or 0.0 when there are no negative samples.
    """
    # `if not thr` treated thr=0.0 like "no threshold given".
    if thr is None:
        pred_y_val = dtree.predict(X_val)
    else:
        pred_y_val = dtree.predict_proba(X_val)[:,1] > thr
    cm = confusion_matrix(y_val, pred_y_val)
    n_negative = cm[0,:].sum()
    if n_negative != 0:
        out = cm[0,1]/n_negative
    else:
        out = 0.0
    return out
# Evaluation at fixed probability thresholds, followed by a manual review of
# the false negatives.
THR = 0.15
# True | False
# True TP | FN
# False FP | TN
#
#
# FPR = FP / (FP + TN)
#
pred_y_ts = dtree.predict_proba(X_ts)[:,1] > THR
# NOTE(review): Series.as_matrix() no longer exists in current pandas (use .to_numpy()) -- confirm target version.
df_confusion = pd.crosstab(pd.Series(y_ts.as_matrix(), name="observed"), pd.Series(pred_y_ts, name="predicted"))
print(df_confusion.to_csv(sep='|'))
# Report at THR = 0.05: test-split rates plus 5-fold cross-validated means on the development split.
THR = 0.05
modelname = str((dtree).__class__).split(".")[-1].rstrip(""" "'> """).lstrip('"')
cv_fnr = cross_val_score(dtree, X_dev, y_dev, groups=None, scoring=partial(fnr, thr=THR), cv=5, n_jobs=1, pre_dispatch='2*n_jobs')
cv_fpr = cross_val_score(dtree, X_dev, y_dev, groups=None, scoring=partial(fpr, thr=THR), cv=5, n_jobs=1, pre_dispatch='2*n_jobs')
tmpstr = """model: {}
threshold = {}
+ on the hold-out set:\tFNR = {:.2f}%, FPR = {:.2f}%
+ in 5-fold cross-validation (mean):\tFNR = {:.2f}%, FPR = {:.2f}%""".format(
modelname, THR,
100*fnr(dtree, X_ts, y_ts, thr = THR), 100*fpr(dtree, X_ts, y_ts, thr = THR),
100*cv_fnr.mean(), 100*cv_fpr.mean())
print(tmpstr)
# Same report at THR = 0.5.
THR = 0.5
modelname = str((dtree).__class__).split(".")[-1].rstrip(""" "'> """).lstrip('"')
cv_fnr = cross_val_score(dtree, X_dev, y_dev, groups=None, scoring=partial(fnr, thr=THR), cv=5, n_jobs=1, pre_dispatch='2*n_jobs')
cv_fpr = cross_val_score(dtree, X_dev, y_dev, groups=None, scoring=partial(fpr, thr=THR), cv=5, n_jobs=1, pre_dispatch='2*n_jobs')
tmpstr = """model: {}
threshold = {}
+ on the hold-out set:\tFNR = {:.2f}%, FPR = {:.2f}%
+ in 5-fold cross-validation (mean):\tFNR = {:.2f}%, FPR = {:.2f}%""".format(
modelname, THR,
100*fnr(dtree, X_ts, y_ts, thr = THR), 100*fpr(dtree, X_ts, y_ts, thr = THR),
100*cv_fnr.mean(), 100*cv_fpr.mean())
print(tmpstr)
# Stray scratch arithmetic; the value is discarded.
6/72
# ## fnr
# 0.1443 -- AdaBoostClassifier(50, lr=0.1) with:
#
#
# DecisionTreeClassifier(max_depth=7, min_samples_leaf=5, criterion="entropy")
# GaussianNB()
#
# 0.1134 -- AdaBoostClassifier(50, lr=0.01) with:
# GaussianNB()
# NOTE(review): `pred_y_val` is only a local inside fnr()/fpr(); no module-level assignment is visible, so these lines raise NameError.
accuracy_score(y_true=y_val, y_pred=pred_y_val)
f1_score(y_true=y_val, y_pred=pred_y_val)
confusion_matrix(y_true=y_val, y_pred=pred_y_val)
# NOTE(review): `pred_yscore_dev` is not assigned anywhere in the visible code.
df_confusion = pd.crosstab(pd.Series(y_val.as_matrix(), name="observed"),
pd.Series(pred_yscore_dev[:,1]>0.15, name="predicted"))
df_confusion
df_confusion[False][True] / (df_confusion[False][True] + df_confusion[True][True])
df_confusion[True][False] / (df_confusion[False][False] + df_confusion[True][False])
109/(385+109)
# ## Misclassified: examples and comments
# pred_false = (pd.Series(pred_y_val, name="predicted")==False)
# NOTE(review): `(score < 0.15) == False` selects score >= 0.15, i.e. predicted *positive*, although the variable is named pred_false -- confirm.
pred_false = (pd.Series(pred_yscore_dev[:,1]<0.15, name="predicted")==False)
false_negatives = (pd.Series(y_val.as_matrix(), name="observed")) & pred_false
false_negatives.index=y_val.index
false_negatives.shape, df.shape
# y_val[false_negatives.tolist()].shape
# Hand-written review notes, one "<png file name> -- <comment>" entry per line.
xstr = """1805162996_1.2.840.113654.2.70.1.75424722723272471565664976911416714890_2_37.png -- implant?
1433463766_1.2.840.113654.2.70.1.243422935316700791950696878743366703411_6_6.png -- male?
3395322213_1.2.840.113654.2.70.1.161905211577383187509354224390811944382_1161_7.png -- overexposed with scale grid
1383662805_1.2.840.113654.2.70.1.194667288082835549565211946781626641146_1_88.png -- mag? bars in the image
5717508670_1.2.840.113654.2.70.1.135196805563780165444562848954663016070_2_6.png -- spot
1582554801_1.2.840.113654.2.70.1.202883517655342643705007475928329105895_1_1.png -- strange shape; plate
3248534628_1.2.840.113654.2.70.1.153327658320065917717726871735320153117_14_8.png -- RLMID, implant
1050998385_1.2.840.113654.2.70.1.294672228525412928579179278566440354700_168_12.png -- RMLO, underexposed, plate
2431514667_1.2.840.113654.2.70.1.132697486450403983700631264913146412468_1_1.png -- regular CC
2836025574_1.2.840.113654.2.70.1.94728406891527814842052605970255602447_31728_4.png -- regular CC, wire?
2774547752_1.2.840.113654.2.70.1.152335331945150793610356395498084601027_47428_6.png -- poor exposure?
6784971236_1.2.840.113654.2.70.1.276140387730485551768768734852859745761_21705_2.png -- regular CC
6120027884_1.2.840.113654.2.70.1.202389441802705593488291262945242015864_28128_3.png -- spot
2127109953_1.2.840.113654.2.70.1.136443797025605972119376095795980286524_5_26.png -- RML, scar
5015120217_1.2.840.113654.2.70.1.8576402180164318136049174781190805706_19615_3.png -- regular MLO, underexposure
2915273528_1.2.840.113654.2.70.1.50904067248781976561131370015339684052_3_51.png -- RLM
2859796079_1.2.840.113654.2.70.1.248757700026158935826319533755178408586_3_51.png -- LMLO, scar""".split("\n")
# Series of comments indexed by file name.
df_misclassified_comments = pd.DataFrame([x.split(" -- ") for x in xstr], columns=["Filename", "comment"]).applymap(lambda x: x.rstrip().lstrip()).set_index("Filename")["comment"]
df_misclassified_comments
# NOTE(review): the next two statements are identical; a 'ViewModifierCodeSequence' column is assumed to exist in X_val -- confirm.
df_misclassified_comments[false_negatives & X_val[false_negatives]['ViewPosition'] & ~X_val[false_negatives]['ViewModifierCodeSequence'] ]
df_misclassified_comments[false_negatives & X_val[false_negatives]['ViewPosition'] & ~X_val[false_negatives]['ViewModifierCodeSequence'] ]
X_val.columns
# X_val[false_negatives][['ViewPosition_ccid', 'ViewPosition_lm', 'ViewPosition_lmid',
# 'ViewPosition_ml', 'ViewPosition_mlo', 'ViewPosition_mloid',
# 'ViewPosition_xccl', "FieldOfViewDimensions_('145', '105')"]]
X_val[false_negatives][['ViewPosition',
'ViewModifierCodeSequence']]
@@ -0,0 +1,97 @@
# coding: utf-8
#cell#
import pandas as pd
import sys
from header_cleaner import get_features, normalize_fields, parse_float_tuples, parse_float
#cell#
# Input table of selected DICOM header fields and the path the normalized
# table is written to at the end of the script.
fn_features = "../tables/df_all_mammos_dicom_headers_selected.tab.gz"
outfn = "../tables/df_all_mammos_dicom_headers_selected_norm.tab"
dffeatures = pd.read_table(fn_features, index_col="filename")
#cell#
def _content_time_to_float(raw):
    """Drop ':' separators, substitute '30' for '--', and cast to float."""
    return float(raw.replace(':','').replace('--',"30"))

# ContentTime entries that are not already int/float still need parsing.
content_time = dffeatures["ContentTime"]
mask_nonnumeric = ~content_time.map(lambda value: isinstance(value, (float, int)))
dffeatures.loc[mask_nonnumeric, "ContentTime"] = content_time[mask_nonnumeric].map(_content_time_to_float)
#cell#
print("shape", dffeatures.shape)
#cell#
# numpy backs the WindowCenter normalizer (np.median); the import section of
# this script does not provide it, so bring it into scope here.
import numpy as np


def _identity(x):
    """Return the value untouched (the column only needs its dtype cast)."""
    return x


def _to_float(x):
    """Run the value through header_cleaner.parse_float and cast to float."""
    return float(parse_float(x))


def _normalize_grid(x):
    """Flatten a Grid value to lower-case words.

    Removes quotes, parentheses and commas, turns '/' into a space and
    corrects the 'PARRALLEL' misspelling.
    """
    text = str(x)
    for char in ("'", "(", ")", ","):
        text = text.replace(char, "")
    return text.replace("/", " ").replace('PARRALLEL', "PARALLEL").lower()


def _normalize_high_bit(x):
    """Render non-NaN floats as integer strings; stringify anything else.

    `x*1 == x` is False for NaN, so NaN falls through to str(x).
    """
    return str(int(x)) if (isinstance(x, float) and x*1 == x) else str(x)


def _normalize_manufacturer(x):
    """Lower-case the name and strip quotes, commas, ' inc' and trailing dots."""
    return (str(x).lower()
            .replace('"', '')
            .replace(',', '')
            .replace(" inc", "")
            .rstrip('.'))


def _normalize_detector_dims(x):
    """Parse DetectorActiveDimensions; a backslash separator becomes ', ' first."""
    return parse_float_tuples(x.replace("\\", ", ") if isinstance(x, str) else x)


# Column name -> cleaning function; each is mapped over its column before the
# column is cast to the dtype listed in `dtypes`.
normalize_fun = {
    "0_ViewCodeSequence__0_ViewModifierCodeSequence_CodeMeaning": lambda x: str(x).lower(),
    "0_ViewCodeSequence_CodeValue": lambda x: str(x),
    "Grid": _normalize_grid,
    "HighBit": _normalize_high_bit,
    # Median of the parsed values (a header may carry several window centers).
    "WindowCenter": lambda x: np.median(parse_float_tuples(x)),
    "FieldOfViewOrigin": lambda x: parse_float_tuples(x),
    "EstimatedRadiographicMagnificationFactor": _identity,
    "ContentTime": _identity,
    "FieldOfViewRotation": _to_float,
    "KVP": _to_float,
    "ShutterLowerHorizontalEdge": _to_float,
    "ShutterRightVerticalEdge": _to_float,
    "XRayTubeCurrentInuA": _to_float,
    "RelativeXRayExposure": _to_float,
    "ManufacturerModelName": lambda x: str(x).lower().replace('"',''),
    "Manufacturer": _normalize_manufacturer,
    "BodyPartThickness": _to_float,
    "CollimatorLeftVerticalEdge": _to_float,
    "CollimatorLowerHorizontalEdge": _to_float,
    "DetectorActiveDimensions": _normalize_detector_dims,
    "ExposureTime": _identity,
    "ExposuresOnDetectorSinceLastCalibration": _identity,
    "ExposuresOnDetectorSinceManufactured": _identity,
    "DistanceSourceToEntrance": _identity,
    "DetectorTemperature": _to_float,
    "DistanceSourceToDetector": _identity,
}
# Target dtype for every normalized column, keyed by column name.
# 'O' keeps the parsed tuples as Python objects.
_TEXT, _REAL, _OBJECT = str, float, 'O'
dtypes = dict([
    ("0_ViewCodeSequence__0_ViewModifierCodeSequence_CodeMeaning", _TEXT),
    ("0_ViewCodeSequence_CodeValue", _TEXT),
    ("Grid", _TEXT),
    ("HighBit", _TEXT),  # int
    ("WindowCenter", int),
    ("FieldOfViewOrigin", _OBJECT),
    ("EstimatedRadiographicMagnificationFactor", _REAL),
    ("ContentTime", _REAL),  # NaN
    ("FieldOfViewRotation", _REAL),
    ("KVP", _REAL),
    ("ShutterLowerHorizontalEdge", _REAL),
    ("ShutterRightVerticalEdge", _REAL),
    ("XRayTubeCurrentInuA", _REAL),
    ("RelativeXRayExposure", _REAL),
    ("ManufacturerModelName", _TEXT),
    ("Manufacturer", _TEXT),
    ("BodyPartThickness", _REAL),
    ("CollimatorLeftVerticalEdge", _REAL),
    ("CollimatorLowerHorizontalEdge", _REAL),
    ("DetectorActiveDimensions", _OBJECT),
    ("ExposureTime", _REAL),
    ("ExposuresOnDetectorSinceLastCalibration", _REAL),  # NaNs
    ("ExposuresOnDetectorSinceManufactured", _REAL),  # NaNs
    ("DistanceSourceToEntrance", _REAL),
    ("DetectorTemperature", _REAL),
    ("DistanceSourceToDetector", _REAL),
])
#cell#
set(dffeatures.columns) - set(normalize_fun.keys())
#cell#
# Fail fast, naming every column that lacks a normalizer or a target dtype,
# instead of hitting a KeyError part-way through the loop below.
missing_spec = [col for col in dffeatures.columns
                if col not in normalize_fun or col not in dtypes]
if missing_spec:
    raise KeyError("no normalizer/dtype defined for columns: %s" % missing_spec)

for kk, vv in dffeatures.items():
    print(kk)
    # Replace the whole column so the dtype produced by astype() is kept;
    # `.loc[:, kk] = ...` writes into the existing column and may cast the
    # values back to that column's previous dtype.
    dffeatures[kk] = vv.map(normalize_fun[kk]).astype(dtypes[kk])

# NOTE(review): the file is gzip-compressed although `outfn` has no ".gz"
# suffix -- confirm downstream readers expect that.
dffeatures.to_csv(outfn, sep='\t', compression='gzip')
@@ -0,0 +1,48 @@
ReduceLROnPlateau:
cooldown: 32
epsilon: 0.001
factor: 0.5
min_lr: 1.0e-08
mode: auto
monitor: val_loss
patience: 32
verbose: 0
base_trainable: true
batch_size: 256
class_mode: binary
class_weights: null
classes:
- normal
- special
contrast: null
data_augmentation: true
data_train: /data/UCSF_MAMMO/2018-02-png/withx_valset_4000_train
data_val: /data/UCSF_MAMMO/2018-02-png/withx_valset_4000_test
dropout: 0.5
fill_mode: reflect
final_activation: sigmoid
height_shift_range: 0.125
horizontal_flip: true
init_epoch: 0
loss_weights: null
lr: 0.0001
n_classes: 1
nb_epoch: 500
ndense: 0
oversampling: false
pretrained: true
rotation_range: 15
samplewise_center: false
seed: 2
target_side: 99
target_size:
- 99
- 99
truncate_quantile: null
vertical_flip: false
weightfile: null
width_shift_range: 0.125
zoom_range:
- 0.8
- 1.2
ztransform: false
@@ -0,0 +1 @@
../inception_short.py
@@ -0,0 +1,185 @@
# coding: utf-8
import sys
import pandas as pd
sys.path.append('../..')
from inception_short import get_model, get_num_files, get_class_weights
from keras.optimizers import Adam
from image import ImageDataGenerator
# from keras.preprocessing.image import ImageDataGenerator
from keras.models import load_model
from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau
from checkpoint_utils import CSVWallClockLogger, lr_cyclic_schedule
from shutil import copy2
from functools import partial
class AttrDict(dict):
    """Dictionary whose items can also be read and written as attributes.

    The instance ``__dict__`` is bound to the dictionary itself, so
    ``d.key`` and ``d["key"]`` always refer to the same entry.
    """

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        self.__dict__ = self
import os
import yaml
import numpy as np
import keras
from hashlib import md5
# Environment configuration for this run.
# NOTE(review): keras is already imported further up in this script, so the
# KERAS_BACKEND setting below may come too late to take effect -- confirm.
os.environ["PYTHONHASHSEED"]='0'
os.environ['KERAS_BACKEND'] = 'tensorflow'
os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
# Fall back to GPU 1 when the caller has not selected a device. .get() covers
# both "unset" and "set to an empty string"; a plain os.environ[...] lookup
# raises KeyError when the variable is unset.
if not os.environ.get("CUDA_VISIBLE_DEVICES"):
    os.environ["CUDA_VISIBLE_DEVICES"] = '1'
indir = "./"
import yaml
# checkpoint.info is a plain-YAML dump of the run parameters. safe_load parses
# it without constructing arbitrary Python objects and, unlike yaml.load()
# called without a Loader argument, works on current PyYAML releases.
with open(os.path.join(indir, "checkpoint.info")) as chkpt_fh:
    prms = AttrDict(yaml.safe_load(chkpt_fh))
print("\n".join(["%s\t%s" %(kk,vv) for kk,vv in prms.items()]),)
# The weight file to evaluate comes from the WFILE environment variable
# (KeyError if it is unset) and is resolved relative to `indir`.
weightfile = os.environ["WFILE"]
#weightfile = "model.175-0.068012.hdf5"
prms['weightfile'] = os.path.join(indir, weightfile)
# In[6]:
# Loss name is derived from the generator class mode.
loss_name = '{}_crossentropy'.format( prms.class_mode )
prms["loss"] = loss_name
print("loss:", loss_name)
# CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
# Size of one pass over the training directory, in samples and full batches.
SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
banner = '='*50
print(banner)
print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
print(banner)
#########################################
# Load the complete saved model (architecture and weights) from the weight file.
if prms.weightfile:
    print("LOADING WEIGHTS FROM:\t%s" % prms.weightfile)
    # model.load_weights(prms.weightfile)
    model = load_model(prms.weightfile)
# In[22]:
# Keyword arguments shared by every flow_from_directory() call below.
flowfromdir_params = {
    # "color_mode": "grayscale",
    "target_size": prms.target_size,
    "batch_size": prms.batch_size,
    "class_mode": prms.class_mode,
    "classes": prms.classes,
    "seed": prms.seed,
}
# Normalisation switches handed to every ImageDataGenerator.
# NOTE(review): samplewise_std_normalization is driven by
# prms.samplewise_center -- confirm this coupling is intended.
norm_params = {
    # "rescale": prms.scaleup,
    "samplewise_center": prms.samplewise_center,
    "samplewise_std_normalization": prms.samplewise_center,
    "featurewise_center": False,
    "featurewise_std_normalization": False,
    "zca_whitening": False,
}
# In[23]:
# Iterator over the training directory in fixed (unshuffled) order.
train_datagen = ImageDataGenerator(**norm_params)
# Reverse the second-to-last axis of each image (this script's 'fliplr').
train_datagen.preprocessing_function = lambda x: x[...,::-1,:]#*2**-8
train_flow_kwargs = dict(flowfromdir_params, shuffle=False)
datagen_train_output = train_datagen.flow_from_directory(
    prms.data_train,
    #stratify = prms.oversampling,
    #sampling_factor=prms.sampling_factor,
    #oversampling=prms.oversampling,
    **train_flow_kwargs)
# Recompute the epoch size from the files the iterator actually found;
# the step count is rounded up to include a final partial batch.
SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
##########################################
def get_predictions(data_dir,
                    preprocessing_function = lambda x:x,
                    model=model):
    """Score every image under `data_dir` and return the results as a table.

    Parameters
    ----------
    data_dir : directory passed to ImageDataGenerator.flow_from_directory
        (read unshuffled, with the module-level `flowfromdir_params`).
    preprocessing_function : callable, or one of the names 'fliplr',
        'identity', 'orig'; installed as the generator's preprocessing
        function.
    model : object providing predict_generator(); defaults to the model
        loaded at module level when this function was defined.

    Returns
    -------
    pandas.DataFrame with one `scores_<k>` column per model output, plus
    `files` (the iterator's filenames) and `label` (its class indices).

    Raises
    ------
    ValueError if `preprocessing_function` is an unrecognised name.
    """
    if isinstance(preprocessing_function, str):
        named_transforms = {
            'fliplr': lambda x: x[...,::-1,:],
            'identity': lambda x:x,
            'orig': lambda x:x,
        }
        if preprocessing_function not in named_transforms:
            raise ValueError('unknown preprocessing_function:\t%s'
                             % preprocessing_function)
        preprocessing_function = named_transforms[preprocessing_function]

    generator = ImageDataGenerator(**norm_params)
    generator.preprocessing_function = preprocessing_function
    batches = generator.flow_from_directory(
        data_dir,
        shuffle=False, **flowfromdir_params)
    # One step per batch so that every file is scored exactly once.
    yhat = model.predict_generator(batches,
                                   steps=len(batches),
                                   verbose=1,)
    columns = {}
    for nn, yy in enumerate(yhat.T):
        columns["scores_%d"%nn] = yy
    columns["files"] = batches.filenames
    columns["label"] = batches.classes
    return pd.DataFrame(columns)
##########################################
# HOLDOUT
##########################################
data_holdout = '/data/UCSF_MAMMO/2018-02-png/withx_valset_4000_val'
# (image directory, per-image transform, output CSV), listed in the order the
# files are written: holdout, test and train sets, each scored on the
# original images and on images with the second-to-last axis reversed.
prediction_runs = [
    (data_holdout,    lambda x:x,               "predictions_val.csv"),
    (data_holdout,    lambda x: x[...,::-1,:],  "predictions_val_fliplr.csv"),
    (prms.data_val,   lambda x:x,               "predictions_test.csv"),
    (prms.data_val,   lambda x: x[...,::-1,:],  "predictions_test_fliplr.csv"),
    (prms.data_train, lambda x:x,               "predictions_train.csv"),
    (prms.data_train, lambda x: x[...,::-1,:],  "predictions_train_fliplr.csv"),
]
for image_dir, preprocessing_function, out_csv in prediction_runs:
    dfres = get_predictions(
        image_dir,
        preprocessing_function = preprocessing_function,
        model=model)
    dfres.to_csv(out_csv, index=False)
@@ -0,0 +1,239 @@
from inception_short import get_model, get_num_files, get_class_weights
from keras.optimizers import Adam
from image import ImageDataGenerator
#from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau
from checkpoint_utils import CSVWallClockLogger, lr_cyclic_schedule
from shutil import copy2
from functools import partial
class AttrDict(dict):
    """Dictionary whose items can also be read and written as attributes.

    The instance ``__dict__`` is bound to the dictionary itself, so
    ``d.key`` and ``d["key"]`` always refer to the same entry.
    """

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        self.__dict__ = self
if __name__ == '__main__':
import sys
import os
import yaml
import numpy as np
import keras
from hashlib import md5
os.environ["PYTHONHASHSEED"]='0'
os.environ['KERAS_BACKEND'] = 'tensorflow'
os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
os.environ["CUDA_VISIBLE_DEVICES"] = '1'
# Hyper-parameters and data locations for this training run.
# NOTE: the checkpoint directory name is md5(str(prms)), computed right after
# this block, so changing any key, any value, or the order of the keys here
# selects a different checkpoint directory.
prms = AttrDict(
dropout=0.5,
base_trainable=True,
horizontal_flip = True,
vertical_flip = False,
zoom_range = [0.8, 1.2],
rotation_range = 15,
fill_mode='reflect',
ndense=0,
batch_size = 128*2,
init_epoch=0,
nb_epoch = 500,
data_augmentation = True,
contrast = None, #0.8,
truncate_quantile = None,#0.001,
ztransform = False,
oversampling = False,
#sampling_factor = None, [1, 6, 16, 64, 4],
seed=2,
width_shift_range = 0.125,
height_shift_range = 0.125,
class_mode = 'binary', # 'binary', #
n_classes = 1,
final_activation = 'sigmoid',
lr = 1e-4,
samplewise_center = False, #True
target_side = 99,
weightfile = None,
pretrained = True,
data_train = '/data/UCSF_MAMMO/2018-02-png/withx_valset_4000_train',
data_val = '/data/UCSF_MAMMO/2018-02-png/withx_valset_4000_test',
classes = ['normal', 'special'],
class_weights=None,#[1, 1, 4, 8, 4],
loss_weights = None,
# Keyword arguments forwarded unchanged to the ReduceLROnPlateau callback.
ReduceLROnPlateau = dict(
monitor='val_loss',
factor=1/2,
patience=32,
verbose=0,
mode='auto', epsilon=0.001,
cooldown=32,
min_lr=1e-8,
),
# Alternative schedule; only consulted when ReduceLROnPlateau is absent or empty.
# lr_cyclic_schedule = dict(
# #lr_init = 1.0e-3,
# drop = 2/5,
# epochs_drop = 20,
# cycle_len = 200.0
# )
)
# The run is identified by a hash of the parameters as defined above;
# target_size is derived afterwards and therefore does not enter the hash.
param_text = str(prms)
paramhash = md5(param_text.encode()).hexdigest()
prms["target_size"] = [prms.target_side, prms.target_side]
CHECKPOINT_DIR = "checkpoints/" + paramhash + "/"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
print("SAVING TO:\t%s" % CHECKPOINT_DIR)
# Keep a copy of this script next to the checkpoints.
script_path = os.path.abspath(__file__)
copy2(script_path, CHECKPOINT_DIR)
# Dump the parameters as YAML; "loss" is added only after this dump.
info_path = os.path.join(CHECKPOINT_DIR, "checkpoint.info")
with open(info_path, "w+") as outfh:
    yaml.dump(dict(prms), outfh, default_flow_style=False)
loss_name = '{}_crossentropy'.format( prms.class_mode )
prms["loss"] = loss_name
print("loss:", loss_name)
# Keras fills in {epoch} and {val_loss} when it saves a checkpoint.
CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
# Size of one pass over the training directory, in samples and full batches.
SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
banner = '='*50
print(banner)
print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
print(banner)
#########################################
# Write a full-model checkpoint every epoch (not only on improvement).
checkpoint = ModelCheckpoint(
    CHECKPOINT_PATH,
    monitor='val_loss',
    verbose=1,
    save_best_only=False,
    save_weights_only=False,
    mode='auto',
    period=1)
csv_path = os.path.join(CHECKPOINT_DIR, "progresslog.csv")
csv_callback = CSVWallClockLogger(csv_path, separator=',', append=False)
callback_list = [checkpoint, csv_callback]
# Learning-rate control: ReduceLROnPlateau when it is configured and
# non-empty, otherwise the cyclic schedule if its settings are present.
plateau_kwargs = prms.get("ReduceLROnPlateau")
if plateau_kwargs:
    callback_list.append(ReduceLROnPlateau(**plateau_kwargs))
elif "lr_cyclic_schedule" in prms:
    schedule = partial(lr_cyclic_schedule,
                       lr_init = prms.lr,
                       **prms.lr_cyclic_schedule)
    callback_list.append(LearningRateScheduler(schedule))
#########################################
# Build the network from the run parameters; ImageNet weights are used for
# initialisation when prms.pretrained is set.
initial_weights = 'imagenet' if prms.pretrained else None
model_kwargs = dict(
    n_classes=prms.n_classes,
    final_activation=prms.final_activation,
    ndense=prms.ndense,
    dropout=prms.dropout,
    base_trainable=prms.base_trainable,
    weights=initial_weights,
    input_shape=prms.target_size + [3],
)
model = get_model(**model_kwargs)
#from keras.utils import plot_model
#plot_model(model, to_file='model.png')
optimizer = Adam(lr=prms.lr)
model.compile(optimizer=optimizer, loss=prms.loss,
              metrics=['accuracy'],
              )
#########################################
# Optionally continue from previously saved weights.
if prms.weightfile:
    print("loading weights from:\t%s" % prms.weightfile)
    model.load_weights(prms.weightfile)
#########################################
print('Using real-time data augmentation.')
# Keyword arguments shared by every flow_from_directory() call below.
flowfromdir_params = {
    # "color_mode": "grayscale",
    "target_size": prms.target_size,
    "batch_size": prms.batch_size,
    "class_mode": prms.class_mode,
    "classes": prms.classes,
    "seed": prms.seed,
}
# Normalisation switches handed to every ImageDataGenerator.
# NOTE(review): samplewise_std_normalization is driven by
# prms.samplewise_center -- confirm this coupling is intended.
norm_params = {
    # "rescale": prms.scaleup,
    "samplewise_center": prms.samplewise_center,
    "samplewise_std_normalization": prms.samplewise_center,
    "featurewise_center": False,
    "featurewise_std_normalization": False,
    "zca_whitening": False,
}
def _ztransform(x):
return (x-np.mean(x)) / np.std(x)
# Resolve the optional 'preprocessing_function' setting to a callable; the
# identity function is used when the setting is absent.
# NOTE(review): the resulting callable is not referenced again in this script.
if 'preprocessing_function' not in prms:
    preprocessing_function = lambda x: x
elif prms.preprocessing_function=='ztransform':
    preprocessing_function = _ztransform
elif prms.preprocessing_function=='m1p1':
    # Maps 0..256 onto -1..1.
    preprocessing_function = lambda x: x/128.0 - 1
else:
    raise ValueError("unknown preprocessing_function")
# Training images are augmented on the fly when requested; validation images
# only ever receive the shared normalisation settings.
if prms.data_augmentation:
    print('Using real-time data augmentation.')
    augmentation = dict(
        zoom_range=prms.zoom_range,
        fill_mode=prms.fill_mode,
        rotation_range=prms.rotation_range,
        width_shift_range=prms.width_shift_range,
        height_shift_range=prms.height_shift_range,
        horizontal_flip=prms.horizontal_flip,
        vertical_flip=prms.vertical_flip,
        contrast=prms.contrast,
        z_transform=prms.ztransform,
        truncate_quantile=prms.truncate_quantile,
        #histeq_alpha=prms.histeq_alpha,
    )
    train_datagen = ImageDataGenerator(**augmentation, **norm_params)
else:
    train_datagen = ImageDataGenerator(**norm_params)
val_datagen = ImageDataGenerator(**norm_params)

# prms.sampling_factor is only looked up when oversampling is enabled.
sampling_factor = prms.sampling_factor if (prms.oversampling) else None
datagen_train_output = train_datagen.flow_from_directory(
    prms.data_train,
    stratify=prms.oversampling,
    sampling_factor=sampling_factor,
    oversampling=prms.oversampling,
    shuffle=True, **flowfromdir_params)
datagen_val_output = val_datagen.flow_from_directory(
    prms.data_val, shuffle=False, **flowfromdir_params)
#VALIDATION_STEPS = get_num_files(prms.data_val) // prms.batch_size
# Number of validation batches, rounded up so that a final partial batch is
# included. Keras takes an integer step count; plain `/` produced a float
# whenever the file count was not a multiple of the batch size.
VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames) / prms['batch_size']))
print("validation steps", VALIDATION_STEPS)
#########################################
# NOTE(review): 'auto' weights are derived from the validation iterator, not
# the training one -- confirm this is intended.
class_weights = (get_class_weights(datagen_val_output)
                 if prms.class_weights == 'auto'
                 else prms.class_weights)
# Train, validating on the validation iterator after every epoch.
fit_kwargs = dict(
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=prms.nb_epoch, verbose=1,
    validation_data=datagen_val_output,
    validation_steps=VALIDATION_STEPS,
    #class_weight='auto',
    class_weight=class_weights,
    callbacks=callback_list,
    initial_epoch=prms.init_epoch,
)
model.fit_generator(datagen_train_output, **fit_kwargs)
# Final evaluation: create a fresh validation iterator and print the values
# returned by evaluate_generator. The model is compiled with
# metrics=['accuracy'], so the two values fill the loss and accuracy slots.
datagen_val_output = val_datagen.flow_from_directory(
prms.data_val, shuffle=False, **flowfromdir_params)
print("""loss\t%.4f
accuracy\t%.4f\n""" %
tuple(model.evaluate_generator(datagen_val_output,
steps=VALIDATION_STEPS,
workers=1,
pickle_safe=True)))
#model.predict()
@@ -0,0 +1,48 @@
ReduceLROnPlateau:
cooldown: 8
epsilon: 0.001
factor: 0.5
min_lr: 1.0e-12
mode: auto
monitor: val_loss
patience: 64
verbose: 0
base_trainable: false
batch_size: 16
class_mode: categorical
class_weights:
- 1
- 1
classes:
- normal
- wire
data_augmentation: true
data_train: /data/UCSF_MAMMO/2018-02-png/each_class_4189_train/
data_val: /data/UCSF_MAMMO/2018-02-png/each_class_4189_test/
dropout: 0.5
fill_mode: reflect
final_activation: softmax
height_shift_range: 0.125
horizontal_flip: true
init_epoch: 0
lr: 0.001
n_classes: 2
nb_epoch: 500
ndense: 0
oversampling: false
rescale: 1
rotation_range: 30
samplewise_center: false
seed: 1
target_side: 299
target_size:
- 299
- 299
truncate_quantile: null
vertical_flip: false
weightfile: null
width_shift_range: 0.125
zoom_range:
- 0.8
- 1.2
ztransform: true
@@ -0,0 +1,49 @@
ReduceLROnPlateau:
cooldown: 8
epsilon: 0.001
factor: 0.5
min_lr: 1.0e-12
mode: auto
monitor: val_loss
patience: 64
verbose: 0
base_trainable: false
batch_size: 16
class_mode: categorical
class_weights:
- 1
- 1
classes:
- normal
- wire
data_augmentation: true
data_holdout: /data/UCSF_MAMMO/2018-02-png/each_class_augm_xw_4189_val/
data_train: /data/UCSF_MAMMO/2018-02-png/each_class_4189_train/
data_val: /data/UCSF_MAMMO/2018-02-png/each_class_4189_test/
dropout: 0.5
fill_mode: reflect
final_activation: softmax
height_shift_range: 0.125
horizontal_flip: true
init_epoch: 0
lr: 0.001
n_classes: 2
nb_epoch: 500
ndense: 0
oversampling: false
rescale: 1
rotation_range: 30
samplewise_center: false
seed: 2
target_side: 299
target_size:
- 299
- 299
truncate_quantile: null
vertical_flip: false
weightfile: model.147-0.000774.hdf5
width_shift_range: 0.125
zoom_range:
- 0.8
- 1.2
ztransform: true
@@ -0,0 +1,315 @@
import sys
import pandas as pd
sys.path.append('../..')
from inception_short import get_model, get_num_files, get_class_weights
from keras.optimizers import Adam
from image import ImageDataGenerator
#from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from checkpoint_utils import CSVWallClockLogger
from shutil import copy2
from losses import acc_0, acc_1, acc_2, acc_3, acc_4
class AttrDict(dict):
    """Dictionary whose items can also be read and written as attributes.

    The instance ``__dict__`` is bound to the dictionary itself, so
    ``d.key`` and ``d["key"]`` always refer to the same entry.
    """

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        self.__dict__ = self
import sys
import os
import yaml
import numpy as np
import keras
from hashlib import md5
os.environ["PYTHONHASHSEED"]='0'
os.environ['KERAS_BACKEND'] = 'tensorflow'
os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Parameters of the wire model and the data sets it is scored on.
# NOTE: the checkpoint directory name is md5(str(prms)), computed right after
# this block, so changing any key, any value, or the order of the keys here
# selects a different directory.
prms = AttrDict(
dropout=0.5,
base_trainable=False,
horizontal_flip = True,
vertical_flip = False,
zoom_range = [0.8, 1.2],
rotation_range = 30,
fill_mode='reflect',
ndense=0,
batch_size = 16,
init_epoch=0,
nb_epoch = 500,
data_augmentation = True,
rescale = 1, #2**-8,
#contrast = 0.9,
truncate_quantile = None,#0.001,
ztransform = True,
oversampling = False,
#sampling_factor = [1, 4],
seed=2,
width_shift_range = 0.125,
height_shift_range = 0.125,
class_mode = 'categorical', # 'binary', #
n_classes = 2,
final_activation = "softmax", # 'sigmoid',
lr = 1e-3,
samplewise_center = False, #True
target_side = 299,
#weights = None,
# Weights loaded into the model before predictions are made.
weightfile = "model.147-0.000774.hdf5",
data_train = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_train/',
data_val = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_test/',
data_holdout = "/data/UCSF_MAMMO/2018-02-png/each_class_augm_xw_4189_val/",
classes = ["normal", "wire"],
class_weights=[1, 1],
# Keyword arguments forwarded unchanged to the ReduceLROnPlateau callback.
ReduceLROnPlateau = dict(
monitor='val_loss',
factor=1/2,
patience=32*2,
verbose=0,
mode='auto', epsilon=0.001,
cooldown=8,
min_lr=1e-12,
),
)
# The run is identified by a hash of the parameters as defined above;
# target_size is derived afterwards and therefore does not enter the hash.
param_text = str(prms)
paramhash = md5(param_text.encode()).hexdigest()
prms["target_size"] = [prms.target_side, prms.target_side]
CHECKPOINT_DIR = "checkpoints/" + paramhash + "/"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
print("SAVING TO:\t%s" % CHECKPOINT_DIR)
# Keep a copy of this script next to the checkpoints.
script_path = os.path.abspath(__file__)
copy2(script_path, CHECKPOINT_DIR)
# Dump the parameters as YAML; "loss" is added only after this dump.
info_path = os.path.join(CHECKPOINT_DIR, "checkpoint.info")
with open(info_path, "w+") as outfh:
    yaml.dump(dict(prms), outfh, default_flow_style=False)
# w_categorical_crossentropy
# Keras fills in {epoch} and {val_loss} when it saves a checkpoint.
CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
# Size of one pass over the training directory, in samples and full batches.
SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
banner = '='*50
print(banner)
print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
print(banner)
#########################################
# Checkpoint callback that keeps only models improving on val_loss.
checkpoint = ModelCheckpoint(
    CHECKPOINT_PATH,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
    period=1)
csv_path = os.path.join(CHECKPOINT_DIR, "progresslog.csv")
csv_callback = CSVWallClockLogger(csv_path, separator=',', append=False)
prms["loss"] = '{}_crossentropy'.format( prms.class_mode )
callback_list = [checkpoint, csv_callback]
# Add ReduceLROnPlateau when it is configured and non-empty.
plateau_kwargs = prms.get("ReduceLROnPlateau")
if plateau_kwargs:
    callback_list.append(ReduceLROnPlateau(**plateau_kwargs))
#########################################
# Build the network from the run parameters.
model_kwargs = dict(
    n_classes=prms.n_classes,
    final_activation=prms.final_activation,
    ndense=prms.ndense,
    #weights = prms.weights,
    dropout=prms.dropout,
    base_trainable=prms.base_trainable,
)
model = get_model(**model_kwargs)
#from keras.utils import plot_model
#plot_model(model, to_file='model.png')
if __name__ == '__main__':
# Compile with overall accuracy plus the per-class accuracies of classes 0 and 1.
tracked_metrics = ['accuracy', acc_0, acc_1,# acc_2, acc_3, acc_4
                   ]
optimizer = Adam(lr=prms.lr)
model.compile(optimizer=optimizer, loss=prms.loss,
              metrics=tracked_metrics,
              )
#########################################
# Load the trained weights that the predictions below are made with.
if prms.weightfile:
    print("loading weights from:\t%s" % prms.weightfile)
    model.load_weights(prms.weightfile)
#########################################
print('Using real-time data augmentation.')
# Keyword arguments shared by every flow_from_directory() call below.
flowfromdir_params = {
    # "color_mode": "grayscale",
    "target_size": prms.target_size,
    "batch_size": prms.batch_size,
    "class_mode": prms.class_mode,
    "classes": prms.classes,
    "seed": prms.seed,
}
# Normalisation settings handed to every ImageDataGenerator.
# NOTE(review): samplewise_std_normalization is driven by
# prms.samplewise_center -- confirm this coupling is intended.
norm_params = {
    "rescale": prms.rescale,
    "samplewise_center": prms.samplewise_center,
    "samplewise_std_normalization": prms.samplewise_center,
    "featurewise_center": False,
    "featurewise_std_normalization": False,
    "zca_whitening": False,
    "z_transform": prms.ztransform,
}
def _ztransform(x):
return (x-np.mean(x)) / np.std(x)
# Resolve the optional 'preprocessing_function' setting to a callable; the
# identity function is used when the setting is absent.
# NOTE(review): the resulting callable is not referenced again in this script.
if 'preprocessing_function' not in prms:
    preprocessing_function = lambda x: x
elif prms.preprocessing_function=='ztransform':
    preprocessing_function = _ztransform
elif prms.preprocessing_function=='m1p1':
    # Maps 0..256 onto -1..1.
    preprocessing_function = lambda x: x/128.0 - 1
else:
    raise ValueError("unknown preprocessing_function")
# Generator for the training directory. With data_augmentation enabled the
# random augmentation settings are applied here as well.
if prms.data_augmentation:
    print('Using real-time data augmentation.')
    augmentation = dict(
        zoom_range=prms.zoom_range,
        fill_mode=prms.fill_mode,
        rotation_range=prms.rotation_range,
        width_shift_range=prms.width_shift_range,
        height_shift_range=prms.height_shift_range,
        horizontal_flip=prms.horizontal_flip,
        vertical_flip=prms.vertical_flip,
        # `contrast` is optional in prms; None when it is not configured.
        contrast=prms.get("contrast"),
        truncate_quantile=prms.truncate_quantile,
        #histeq_alpha=prms.histeq_alpha,
    )
    train_datagen = ImageDataGenerator(**augmentation, **norm_params)
else:
    train_datagen = ImageDataGenerator(**norm_params)
# Unshuffled, so predictions line up with the iterator's file list.
datagen_train_output = train_datagen.flow_from_directory(
    prms.data_train,
    shuffle=False, **flowfromdir_params)
#VALIDATION_STEPS = get_num_files(prms.data_val) // prms.batch_size
##########################################
# HOLDOUT
##########################################
# (image directory, reverse second-to-last image axis?, output CSV), listed in
# the order the files are written.
prediction_runs = [
    (prms.data_holdout, False, "predictions_holdout.csv"),         # HOLDOUT
    (prms.data_holdout, True,  "predictions_holdout_fliplr.csv"),  # HOLDOUT FLIPPED
    (prms.data_val,     False, "predictions_test.csv"),            # VAL
    (prms.data_val,     True,  "predictions_test_fliplr.csv"),     # VAL FLIPPED
]
for image_dir, flipped, out_csv in prediction_runs:
    val_datagen = ImageDataGenerator(**norm_params)
    if flipped:
        val_datagen.preprocessing_function = lambda x: x[...,::-1,:]
    datagen_val_output = val_datagen.flow_from_directory(
        image_dir, shuffle=False, **flowfromdir_params)
    # Round up so the final partial batch is scored too.
    VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
    print("validation steps", VALIDATION_STEPS)
    yhat = model.predict_generator(datagen_val_output,
                                   steps=VALIDATION_STEPS,
                                   verbose=1,)
    # One scores_<k> column per model output, plus file names and class indices.
    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
    dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
    dfres = pd.DataFrame(dfdict)
    dfres.to_csv(out_csv, index=False)
#########################################
# Score the training set twice: first with the training iterator created
# above, then with a new iterator whose images have the second-to-last axis
# reversed.
# NOTE(review): train_datagen carries the random augmentation settings when
# prms.data_augmentation is set, so these scores may be computed on augmented
# images -- confirm this is intended.
# NOTE(review): "filplr" in the second file name looks like a misspelling of
# "fliplr" (the holdout/test files use "fliplr"); kept as-is because
# downstream readers may depend on the current name.
train_runs = (
    (False, "predictions_train.csv"),
    (True, "predictions_train_filplr.csv"),
)
for flipped, out_csv in train_runs:
    SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
    STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
    banner = '='*50
    print(banner)
    print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
    print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
    print(banner)
    if prms.class_weights == 'auto':
        class_weights = get_class_weights(datagen_val_output)
    else:
        class_weights = prms.class_weights
    if flipped:
        train_datagen.preprocessing_function = lambda x: x[...,::-1,:]
        datagen_train_output = train_datagen.flow_from_directory(
            prms.data_train,
            shuffle=False, **flowfromdir_params)
    yhat = model.predict_generator(datagen_train_output,
                                   steps=STEPS_PER_EPOCH,
                                   verbose=1,)
    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
    dfdict.update({"files":datagen_train_output.filenames, "label": datagen_train_output.classes})
    ##ipdb.set_trace()
    dfres = pd.DataFrame(dfdict)
    dfres.to_csv(out_csv, index=False)
@@ -0,0 +1,50 @@
ReduceLROnPlateau:
cooldown: 8
epsilon: 0.001
factor: 0.5
min_lr: 1.0e-12
mode: auto
monitor: val_loss
patience: 64
verbose: 0
base_trainable: false
batch_size: 16
class_mode: categorical
class_weights:
- 1
- 1
classes:
- normal
- wire
data_augmentation: true
data_everything: /media/exx/tron/2017-07-png-jae/
data_holdout: /data/UCSF_MAMMO/2018-02-png/each_class_augm_xw_4189_val/
data_train: /data/UCSF_MAMMO/2018-02-png/each_class_4189_train/
data_val: /data/UCSF_MAMMO/2018-02-png/each_class_4189_test/
dropout: 0.5
fill_mode: reflect
final_activation: softmax
height_shift_range: 0.125
horizontal_flip: true
init_epoch: 0
lr: 0.001
n_classes: 2
nb_epoch: 500
ndense: 0
oversampling: false
rescale: 1
rotation_range: 30
samplewise_center: false
seed: 2
target_side: 299
target_size:
- 299
- 299
truncate_quantile: null
vertical_flip: false
weightfile: model.147-0.000774.hdf5
width_shift_range: 0.125
zoom_range:
- 0.8
- 1.2
ztransform: true
@@ -0,0 +1,398 @@
import sys
import pandas as pd
sys.path.append('../..')
sys.path.append("/data/dlituiev/kerastrainutils/")
from inception_short import get_model, get_num_files, get_class_weights
from keras.optimizers import Adam
from _image import ImageDataGenerator
#from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from checkpoint_utils import CSVWallClockLogger
from shutil import copy2
from losses import acc_0, acc_1, acc_2, acc_3, acc_4
class AttrDict(dict):
    """Dictionary whose items can also be read and written as attributes.

    The instance ``__dict__`` is bound to the dictionary itself, so
    ``d.key`` and ``d["key"]`` always refer to the same entry.
    """

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        self.__dict__ = self
import sys
import os
import yaml
import numpy as np
import keras
from hashlib import md5
os.environ["PYTHONHASHSEED"]='0'
os.environ['KERAS_BACKEND'] = 'tensorflow'
os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
os.environ["CUDA_VISIBLE_DEVICES"]="3"
# Parameters of the wire model and the data sets it is scored on.
# NOTE: the checkpoint directory name is md5(str(prms)), computed right after
# this block, so changing any key, any value, or the order of the keys here
# selects a different directory.
prms = AttrDict(
dropout=0.5,
base_trainable=False,
horizontal_flip = True,
vertical_flip = False,
zoom_range = [0.8, 1.2],
rotation_range = 30,
fill_mode='reflect',
ndense=0,
batch_size = 16,
init_epoch=0,
nb_epoch = 500,
data_augmentation = True,
rescale = 1, #2**-8,
#contrast = 0.9,
truncate_quantile = None,#0.001,
ztransform = True,
oversampling = False,
#sampling_factor = [1, 4],
seed=2,
width_shift_range = 0.125,
height_shift_range = 0.125,
class_mode = 'categorical', # 'binary', #
n_classes = 2,
final_activation = "softmax", # 'sigmoid',
lr = 1e-3,
samplewise_center = False, #True
target_side = 299,
#weights = None,
# Weights loaded into the model further down.
weightfile = "model.147-0.000774.hdf5",
data_train = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_train/',
data_val = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_test/',
data_holdout = "/data/UCSF_MAMMO/2018-02-png/each_class_augm_xw_4189_val/",
data_everything = "/media/exx/tron/2017-07-png-jae/",
classes = ["normal", "wire"],
class_weights=[1, 1],
# Keyword arguments forwarded unchanged to the ReduceLROnPlateau callback.
ReduceLROnPlateau = dict(
monitor='val_loss',
factor=1/2,
patience=32*2,
verbose=0,
mode='auto', epsilon=0.001,
cooldown=8,
min_lr=1e-12,
),
)
# The run is identified by a hash of the parameters as defined above;
# target_size is derived afterwards and therefore does not enter the hash.
param_text = str(prms)
paramhash = md5(param_text.encode()).hexdigest()
prms["target_size"] = [prms.target_side, prms.target_side]
CHECKPOINT_DIR = "checkpoints/" + paramhash + "/"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
print("SAVING TO:\t%s" % CHECKPOINT_DIR)
# Keep a copy of this script next to the checkpoints.
script_path = os.path.abspath(__file__)
copy2(script_path, CHECKPOINT_DIR)
# Dump the parameters as YAML; "loss" is added only after this dump.
info_path = os.path.join(CHECKPOINT_DIR, "checkpoint.info")
with open(info_path, "w+") as outfh:
    yaml.dump(dict(prms), outfh, default_flow_style=False)
# w_categorical_crossentropy
# Keras fills in {epoch} and {val_loss} when it saves a checkpoint.
CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
# Size of one pass over the training directory, in samples and full batches.
SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
banner = '='*50
print(banner)
print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
print(banner)
#########################################
# Checkpoint callback that keeps only models improving on val_loss.
checkpoint = ModelCheckpoint(
    CHECKPOINT_PATH,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
    period=1)
csv_path = os.path.join(CHECKPOINT_DIR, "progresslog.csv")
csv_callback = CSVWallClockLogger(csv_path, separator=',', append=False)
prms["loss"] = '{}_crossentropy'.format( prms.class_mode )
callback_list = [checkpoint, csv_callback]
# Add ReduceLROnPlateau when it is configured and non-empty.
plateau_kwargs = prms.get("ReduceLROnPlateau")
if plateau_kwargs:
    callback_list.append(ReduceLROnPlateau(**plateau_kwargs))
#########################################
# Build the network from the run parameters.
model_kwargs = dict(
    n_classes=prms.n_classes,
    final_activation=prms.final_activation,
    ndense=prms.ndense,
    #weights = prms.weights,
    dropout=prms.dropout,
    base_trainable=prms.base_trainable,
)
model = get_model(**model_kwargs)
#from keras.utils import plot_model
#plot_model(model, to_file='model.png')
if __name__ == '__main__':
model.compile(optimizer=Adam(lr=prms.lr), loss=prms.loss,
metrics=['accuracy', acc_0, acc_1,# acc_2, acc_3, acc_4
],
)
#########################################
if prms.weightfile:
print("loading weights from:\t%s" % prms.weightfile)
model.load_weights(prms.weightfile)
#########################################
print('Using real-time data augmentation.')
flowfromdir_params = dict(
#color_mode = "grayscale",
target_size=prms.target_size,
batch_size=prms.batch_size,
class_mode=prms.class_mode,
classes=prms.classes,
seed=prms.seed)
norm_params = dict(
rescale=prms.rescale,
samplewise_center=prms.samplewise_center,
samplewise_std_normalization=prms.samplewise_center,
featurewise_center=False,
featurewise_std_normalization=False,
zca_whitening=False,
z_transform = prms.ztransform,
)
def _ztransform(x):
return (x-np.mean(x)) / np.std(x)
if 'preprocessing_function' in prms:
if prms.preprocessing_function=='ztransform':
preprocessing_function = _ztransform
elif prms.preprocessing_function=='m1p1':
preprocessing_function = lambda x: x/128.0 - 1
else:
raise ValueError("unknown preprocessing_function")
else:
preprocessing_function = lambda x: x
if prms.data_augmentation:
print('Using real-time data augmentation.')
train_datagen = ImageDataGenerator(
zoom_range=prms.zoom_range,
fill_mode=prms.fill_mode,
rotation_range = prms.rotation_range,
width_shift_range = prms.width_shift_range,
height_shift_range = prms.height_shift_range,
horizontal_flip=prms.horizontal_flip,
vertical_flip=prms.vertical_flip,
#contrast = prms.contrast if "contrast" in prms else None,
#truncate_quantile = prms.truncate_quantile,
#histeq_alpha=prms.histeq_alpha,
**norm_params)
else:
train_datagen = ImageDataGenerator(**norm_params)
##########################################
# Everything
##########################################
def _predict_everything_to_csv(datagen, out_csv):
    """Score every image under ``prms.data_everything`` and write the scores to a CSV.

    The header is ``files,scores_0,scores_1``; each following row holds one
    file name and the values the model returned for it.  Images are read with
    ``shuffle=False`` and rows are paired with ``filenames`` by position.

    Parameters
    ----------
    datagen : generator object providing ``flow_from_directory``
    out_csv : str
        Path of the CSV file to (over)write.

    Returns
    -------
    The directory iterator that was consumed.
    """
    flow = datagen.flow_from_directory(
        os.path.dirname(prms.data_everything.rstrip('/')),
        shuffle=False, **flowfromdir_params)
    batches = iter(flow)
    with open(out_csv, 'w+') as fh:
        print("files", *["scores_%d" % ii for ii in range(2)], sep=',', file=fh)
        # Walk the file list in batch-sized slices and fetch exactly one batch
        # per slice.  The earlier guard (``ii > VALIDATION_STEPS``) admitted one
        # batch too many: it was scored and then dropped because no file names
        # were left to pair it with.
        for start in range(0, len(flow.filenames), prms.batch_size):
            filenames = flow.filenames[start:start + prms.batch_size]
            yhat = model.predict_on_batch(next(batches)[0])
            for fnimg, yhat_ in zip(filenames, yhat):
                print(fnimg, *yhat_, sep=',', file=fh)
    return flow


val_datagen = ImageDataGenerator(**norm_params)
# The data directory's own name becomes the only entry of `classes`, and its
# parent is the directory that gets scanned (presumably so that all images in
# it are listed as one class -- confirm against flow_from_directory).
flowfromdir_params['classes'] = [os.path.basename(prms.data_everything.rstrip('/'))]
datagen_val_output = _predict_everything_to_csv(
    val_datagen, "predictions_everything.csv")

# Same pass with reversed image axes; the suffix is the one used in the file name.
for _suffix, _flip in (
        ("fliplr", lambda x: x[..., ::-1, :]),         # reverse axis -2
        ("flipud", lambda x: x[..., ::-1, :, :]),      # reverse axis -3
        ("fliplrud", lambda x: x[..., ::-1, ::-1, :])  # reverse axes -3 and -2
):
    val_datagen.preprocessing_function = _flip
    datagen_val_output = _predict_everything_to_csv(
        val_datagen, "predictions_everything_%s.csv" % _suffix)

##########################################
# DONE
##########################################
# NOTE(review): the script stops here with exit status 1 even though the
# predictions succeeded, and everything below this call is unreachable.
# Status kept as-is because callers may depend on it -- confirm intent.
sys.exit(1)
datagen_train_output = train_datagen.flow_from_directory(
prms.data_train,
shuffle=False, **flowfromdir_params)
#VALIDATION_STEPS = get_num_files(prms.data_val) // prms.batch_size
##########################################
# HOLDOUT
##########################################
val_datagen = ImageDataGenerator(**norm_params)
datagen_val_output = val_datagen.flow_from_directory(
prms.data_holdout, shuffle=False, **flowfromdir_params)
VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
print("validation steps", VALIDATION_STEPS)
yhat = model.predict_generator(datagen_val_output,
steps=VALIDATION_STEPS,
verbose=1,)
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
dfres = pd.DataFrame(dfdict)
dfres.to_csv("predictions_holdout.csv", index=False)
##########################################
# HOLDOUT FLIPPED
##########################################
val_datagen = ImageDataGenerator(**norm_params, )
val_datagen.preprocessing_function = lambda x: x[...,::-1,:]
datagen_val_output = val_datagen.flow_from_directory(
prms.data_holdout, shuffle=False, **flowfromdir_params)
VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
print("validation steps", VALIDATION_STEPS)
yhat = model.predict_generator(datagen_val_output,
steps=VALIDATION_STEPS,
verbose=1,)
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
dfres = pd.DataFrame(dfdict)
dfres.to_csv("predictions_holdout_fliplr.csv", index=False)
#########################################
# VAL
##########################################
val_datagen = ImageDataGenerator(**norm_params, )
datagen_val_output = val_datagen.flow_from_directory(
prms.data_val, shuffle=False, **flowfromdir_params)
VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
print("validation steps", VALIDATION_STEPS)
yhat = model.predict_generator(datagen_val_output,
steps=VALIDATION_STEPS,
verbose=1,)
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
dfres = pd.DataFrame(dfdict)
dfres.to_csv("predictions_test.csv", index=False)
#########################################
# VAL FLIPPED
##########################################
val_datagen = ImageDataGenerator(**norm_params, )
val_datagen.preprocessing_function = lambda x: x[...,::-1,:]
datagen_val_output = val_datagen.flow_from_directory(
prms.data_val, shuffle=False, **flowfromdir_params)
VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
print("validation steps", VALIDATION_STEPS)
yhat = model.predict_generator(datagen_val_output,
steps=VALIDATION_STEPS,
verbose=1,)
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
dfres = pd.DataFrame(dfdict)
dfres.to_csv("predictions_test_fliplr.csv", index=False)
#########################################
SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
print('='*50)
print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
print('='*50)
if prms.class_weights == 'auto':
class_weights = get_class_weights(datagen_val_output)
else:
class_weights = prms.class_weights
yhat = model.predict_generator(datagen_train_output,
steps=STEPS_PER_EPOCH,
verbose=1,)
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
dfdict.update({"files":datagen_train_output.filenames, "label": datagen_train_output.classes})
##ipdb.set_trace()
dfres = pd.DataFrame(dfdict)
dfres.to_csv("predictions_train.csv", index=False)
#########################################
SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
print('='*50)
print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
print('='*50)
if prms.class_weights == 'auto':
class_weights = get_class_weights(datagen_val_output)
else:
class_weights = prms.class_weights
train_datagen.preprocessing_function = lambda x: x[...,::-1,:]
datagen_train_output = train_datagen.flow_from_directory(
prms.data_train,
shuffle=False, **flowfromdir_params)
yhat = model.predict_generator(datagen_train_output,
steps=STEPS_PER_EPOCH,
verbose=1,)
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
dfdict.update({"files":datagen_train_output.filenames, "label": datagen_train_output.classes})
##ipdb.set_trace()
dfres = pd.DataFrame(dfdict)
dfres.to_csv("predictions_train_fliplr.csv", index=False)
@@ -0,0 +1 @@
../inception_short.py
@@ -0,0 +1,398 @@
import sys
import pandas as pd
sys.path.append('../..')
sys.path.append("/data/dlituiev/kerastrainutils/")
from inception_short import get_model, get_num_files, get_class_weights
from keras.optimizers import Adam
from _image import ImageDataGenerator
#from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from checkpoint_utils import CSVWallClockLogger
from shutil import copy2
from losses import acc_0, acc_1, acc_2, acc_3, acc_4
class AttrDict(dict):
    """A ``dict`` whose items can also be read and written as attributes.

    ``d.key`` and ``d["key"]`` refer to the same entry, in both directions.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Use the mapping itself as the instance namespace, so attribute
        # access and item access share one storage.
        self.__dict__ = self
import sys
import os
import yaml
import numpy as np
import keras
from hashlib import md5
os.environ["PYTHONHASHSEED"]='0'
os.environ['KERAS_BACKEND'] = 'tensorflow'
os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
os.environ["CUDA_VISIBLE_DEVICES"]="3"
# Run configuration, readable both as prms["key"] and as prms.key (AttrDict).
# The md5 of str(prms), computed right after this literal, names the
# checkpoint directory: editing a value or reordering keys changes that name.
prms = AttrDict(
# --- model head (forwarded to get_model) ---
dropout=0.5,
base_trainable=False,
# --- augmentation (forwarded to the training ImageDataGenerator) ---
horizontal_flip = True,
vertical_flip = False,
zoom_range = [0.8, 1.2],
rotation_range = 30,
fill_mode='reflect',
ndense=0,
batch_size = 16,
# init_epoch / nb_epoch are not read by the prediction code in this script
init_epoch=0,
nb_epoch = 500,
data_augmentation = True,
rescale = 1, #2**-8,
#contrast = 0.9,
truncate_quantile = None,#0.001,
# passed to the generators as z_transform
ztransform = True,
oversampling = False,
#sampling_factor = [1, 4],
seed=2,
width_shift_range = 0.125,
height_shift_range = 0.125,
# class_mode also determines the loss name: '<class_mode>_crossentropy'
class_mode = 'categorical', # 'binary', #
n_classes = 2,
final_activation = "softmax", # 'sigmoid',
lr = 1e-3,
# also used as the value of samplewise_std_normalization further down
samplewise_center = False, #True
# images are resized to target_side x target_side
target_side = 299,
#weights = None,
# weights file loaded into the model before predicting (relative path)
weightfile = "model.147-0.000774.hdf5",
# --- data locations ---
data_train = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_train/',
data_val = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_test/',
data_holdout = "/data/UCSF_MAMMO/2018-02-png/each_class_augm_xw_4189_val/",
data_everything = "/media/exx/tron/2017-07-png-jae/",
classes = ["normal", "wire"],
# a list here is used as-is; the string 'auto' would trigger get_class_weights
class_weights=[1, 1],
# keyword arguments for the ReduceLROnPlateau callback; a falsy value disables it
ReduceLROnPlateau = dict(
monitor='val_loss',
factor=1/2,
patience=32*2,
verbose=0,
mode='auto', epsilon=0.001,
cooldown=8,
min_lr=1e-12,
),
)
# Hash the configuration; the digest names this run's checkpoint directory.
paramhash = md5(str(prms).encode()).hexdigest()
# Added after hashing, so target_size does not influence paramhash.
prms["target_size"] = [ prms.target_side ]*2
CHECKPOINT_DIR = "checkpoints/" + paramhash + "/"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
print("SAVING TO:\t%s" % CHECKPOINT_DIR)
# copy the script to the checkpoint directory
copy2(os.path.abspath(__file__), CHECKPOINT_DIR)
# Dump the configuration next to the checkpoints. prms["loss"] is only set
# further down, so it is not part of this dump.
with open(os.path.join(CHECKPOINT_DIR, "checkpoint.info"), "w+") as outfh:
yaml.dump(dict(prms), outfh, default_flow_style=False)
# w_categorical_crossentropy
# File name template filled in by ModelCheckpoint. `{val_loss:2f}` is a
# minimum width of 2 with the default six decimals, not two decimals.
CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
# Floor division: a final partial batch is not counted as a step.
STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
print('='*50)
print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
print('='*50)
#########################################
# Callbacks: checkpoint on improved val_loss, CSV progress log, and
# optionally ReduceLROnPlateau.
# NOTE(review): callback_list is not passed to any call in this script.
checkpoint = ModelCheckpoint(CHECKPOINT_PATH, monitor='val_loss', verbose=1,
save_best_only=True, save_weights_only=False, mode='auto', period=1)
csv_path = os.path.join(CHECKPOINT_DIR, "progresslog.csv")
csv_callback = CSVWallClockLogger(csv_path, separator=',', append=False)
# Loss name derived from the class mode, e.g. 'categorical_crossentropy'.
prms["loss"] = '{}_crossentropy'.format( prms.class_mode )
callback_list = [checkpoint, csv_callback]
if ("ReduceLROnPlateau" in prms) and prms["ReduceLROnPlateau"]:
callback_list.append(ReduceLROnPlateau(**prms["ReduceLROnPlateau"]))
#########################################
# Build the network from the configured head parameters.
model = get_model(n_classes=prms.n_classes,
final_activation=prms.final_activation,
ndense=prms.ndense,
#weights = prms.weights,
dropout=prms.dropout,
base_trainable=prms.base_trainable)
#from keras.utils import plot_model
#plot_model(model, to_file='model.png')
# NOTE(review): indentation is not visible in this view, so it cannot be
# determined how many of the following statements belong to this guard.
if __name__ == '__main__':
# Compile with Adam and the loss named in prms["loss"]; metrics are overall
# accuracy plus acc_0 and acc_1 imported from `losses`.
model.compile(optimizer=Adam(lr=prms.lr), loss=prms.loss,
metrics=['accuracy', acc_0, acc_1,# acc_2, acc_3, acc_4
],
)
#########################################
# Load previously saved weights when a weight file is configured.
if prms.weightfile:
print("loading weights from:\t%s" % prms.weightfile)
model.load_weights(prms.weightfile)
#########################################
# NOTE(review): printed unconditionally, before prms.data_augmentation is checked.
print('Using real-time data augmentation.')
# Keyword arguments shared by every flow_from_directory call below.
flowfromdir_params = dict(
#color_mode = "grayscale",
target_size=prms.target_size,
batch_size=prms.batch_size,
class_mode=prms.class_mode,
classes=prms.classes,
seed=prms.seed)
# Normalisation arguments shared by all ImageDataGenerator instances.
norm_params = dict(
rescale=prms.rescale,
samplewise_center=prms.samplewise_center,
# NOTE(review): fed from prms.samplewise_center, and prms has no
# samplewise_std_normalization key -- confirm this coupling is intended.
samplewise_std_normalization=prms.samplewise_center,
featurewise_center=False,
featurewise_std_normalization=False,
zca_whitening=False,
# presumably handled by the project-local `_image` ImageDataGenerator -- confirm
z_transform = prms.ztransform,
)
def _ztransform(x):
return (x-np.mean(x)) / np.std(x)
# Resolve the optional per-image preprocessing function named in the config.
# NOTE(review): `preprocessing_function` is not passed to any generator in
# this script -- confirm whether it is still needed.
if 'preprocessing_function' not in prms:
    preprocessing_function = lambda x: x
elif prms.preprocessing_function == 'ztransform':
    preprocessing_function = _ztransform
elif prms.preprocessing_function == 'm1p1':
    preprocessing_function = lambda x: x/128.0 - 1
else:
    raise ValueError("unknown preprocessing_function")

# The training generator always receives the normalisation arguments; the
# augmentation arguments are added only when augmentation is switched on.
_train_datagen_kwargs = dict(norm_params)
if prms.data_augmentation:
    print('Using real-time data augmentation.')
    _train_datagen_kwargs.update(
        zoom_range=prms.zoom_range,
        fill_mode=prms.fill_mode,
        rotation_range=prms.rotation_range,
        width_shift_range=prms.width_shift_range,
        height_shift_range=prms.height_shift_range,
        horizontal_flip=prms.horizontal_flip,
        vertical_flip=prms.vertical_flip,
        #contrast = prms.contrast if "contrast" in prms else None,
        #truncate_quantile = prms.truncate_quantile,
        #histeq_alpha=prms.histeq_alpha,
    )
train_datagen = ImageDataGenerator(**_train_datagen_kwargs)
##########################################
# Everything
##########################################
def _predict_everything_to_csv(datagen, out_csv):
    """Score every image under ``prms.data_everything`` and write the scores to a CSV.

    The header is ``files,scores_0,scores_1``; each following row holds one
    file name and the values the model returned for it.  Images are read with
    ``shuffle=False`` and rows are paired with ``filenames`` by position.

    Parameters
    ----------
    datagen : generator object providing ``flow_from_directory``
    out_csv : str
        Path of the CSV file to (over)write.

    Returns
    -------
    The directory iterator that was consumed.
    """
    flow = datagen.flow_from_directory(
        os.path.dirname(prms.data_everything.rstrip('/')),
        shuffle=False, **flowfromdir_params)
    batches = iter(flow)
    with open(out_csv, 'w+') as fh:
        print("files", *["scores_%d" % ii for ii in range(2)], sep=',', file=fh)
        # Walk the file list in batch-sized slices and fetch exactly one batch
        # per slice.  The earlier guard (``ii > VALIDATION_STEPS``) admitted one
        # batch too many: it was scored and then dropped because no file names
        # were left to pair it with.
        for start in range(0, len(flow.filenames), prms.batch_size):
            filenames = flow.filenames[start:start + prms.batch_size]
            yhat = model.predict_on_batch(next(batches)[0])
            for fnimg, yhat_ in zip(filenames, yhat):
                print(fnimg, *yhat_, sep=',', file=fh)
    return flow


val_datagen = ImageDataGenerator(**norm_params)
# The data directory's own name becomes the only entry of `classes`, and its
# parent is the directory that gets scanned (presumably so that all images in
# it are listed as one class -- confirm against flow_from_directory).
flowfromdir_params['classes'] = [os.path.basename(prms.data_everything.rstrip('/'))]
datagen_val_output = _predict_everything_to_csv(
    val_datagen, "predictions_everything.csv")

# Same pass with reversed image axes; the suffix is the one used in the file name.
for _suffix, _flip in (
        ("fliplr", lambda x: x[..., ::-1, :]),         # reverse axis -2
        ("flipud", lambda x: x[..., ::-1, :, :]),      # reverse axis -3
        ("fliplrud", lambda x: x[..., ::-1, ::-1, :])  # reverse axes -3 and -2
):
    val_datagen.preprocessing_function = _flip
    datagen_val_output = _predict_everything_to_csv(
        val_datagen, "predictions_everything_%s.csv" % _suffix)

##########################################
# DONE
##########################################
# NOTE(review): the script stops here with exit status 1 even though the
# predictions succeeded, and everything below this call is unreachable.
# Status kept as-is because callers may depend on it -- confirm intent.
sys.exit(1)
# NOTE(review): this section follows an unconditional sys.exit(1) earlier in
# the script, so it does not run as the file stands.  It also reuses
# flowfromdir_params after its 'classes' entry was overwritten with the
# data_everything directory name -- confirm before re-enabling.

def _flip_lr(x):
    """Reverse the second-to-last axis of ``x`` (used for the *_fliplr outputs)."""
    return x[..., ::-1, :]


def _dump_generator_predictions(flow, steps, out_csv):
    """Run ``model.predict_generator`` over ``flow`` and save the result as CSV.

    The table has one ``scores_<k>`` column per model output, plus ``files``
    and ``label`` taken from the iterator.
    """
    yhat = model.predict_generator(flow,
                                   steps=steps,
                                   verbose=1,)
    dfdict = {"scores_%d" % nn: yy for nn, yy in enumerate(yhat.T)}
    dfdict.update({"files": flow.filenames, "label": flow.classes})
    dfres = pd.DataFrame(dfdict)
    dfres.to_csv(out_csv, index=False)


datagen_train_output = train_datagen.flow_from_directory(
    prms.data_train,
    shuffle=False, **flowfromdir_params)
#VALIDATION_STEPS = get_num_files(prms.data_val) // prms.batch_size

##########################################
# HOLDOUT and VAL, each as-is and flipped
##########################################
for _directory, _stem in ((prms.data_holdout, "predictions_holdout"),
                          (prms.data_val, "predictions_test")):
    for _flipped in (False, True):
        val_datagen = ImageDataGenerator(**norm_params)
        if _flipped:
            val_datagen.preprocessing_function = _flip_lr
        datagen_val_output = val_datagen.flow_from_directory(
            _directory, shuffle=False, **flowfromdir_params)
        VALIDATION_STEPS = int(np.ceil(
            len(datagen_val_output.filenames) / prms['batch_size']))
        print("validation steps", VALIDATION_STEPS)
        _dump_generator_predictions(
            datagen_val_output, VALIDATION_STEPS,
            _stem + ("_fliplr.csv" if _flipped else ".csv"))

##########################################
# TRAIN, as-is and flipped
##########################################
for _flipped in (False, True):
    SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
    STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
    print('='*50)
    print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
    print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
    print('='*50)
    # class_weights is computed but not read by the prediction calls below
    if prms.class_weights == 'auto':
        class_weights = get_class_weights(datagen_val_output)
    else:
        class_weights = prms.class_weights
    if _flipped:
        # the step count above comes from the previous iterator, as before
        train_datagen.preprocessing_function = _flip_lr
        datagen_train_output = train_datagen.flow_from_directory(
            prms.data_train,
            shuffle=False, **flowfromdir_params)
    ##ipdb.set_trace()
    _dump_generator_predictions(
        datagen_train_output, STEPS_PER_EPOCH,
        "predictions_train_fliplr.csv" if _flipped else "predictions_train.csv")
@@ -0,0 +1,224 @@
from inception_short import get_model, get_num_files, get_class_weights
from keras.optimizers import Adam
from image import ImageDataGenerator
#from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from checkpoint_utils import CSVWallClockLogger
from shutil import copy2
class AttrDict(dict):
    """A ``dict`` whose items can also be read and written as attributes.

    ``d.key`` and ``d["key"]`` refer to the same entry, in both directions.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Use the mapping itself as the instance namespace, so attribute
        # access and item access share one storage.
        self.__dict__ = self
import sys
import os
import yaml
import numpy as np
import keras
from hashlib import md5
os.environ["PYTHONHASHSEED"]='0'
os.environ['KERAS_BACKEND'] = 'tensorflow'
os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
os.environ["CUDA_VISIBLE_DEVICES"]="1"
# Training configuration, readable both as prms["key"] and as prms.key.
# The md5 of str(prms), computed right after this literal, names the
# checkpoint directory: editing a value or reordering keys changes that name.
prms = AttrDict(
# --- model head (forwarded to get_model) ---
dropout=0.5,
base_trainable=False,
# --- augmentation (forwarded to the training ImageDataGenerator) ---
horizontal_flip = True,
vertical_flip = False,
zoom_range = [0.8, 1.2],
rotation_range = 30,
fill_mode='reflect',
ndense=0,
batch_size = 16,
# passed to fit_generator as initial_epoch / epochs
init_epoch=0,
nb_epoch = 500,
data_augmentation = True,
rescale = 1, #2**-8,
#contrast = 0.9,
truncate_quantile = None,#0.001,
# passed to the generators as z_transform
ztransform = True,
# NOTE(review): when set truthy, prms.sampling_factor is read further down,
# but that key is commented out here.
oversampling = False,
#sampling_factor = [1, 4],
seed=1,
width_shift_range = 0.125,
height_shift_range = 0.125,
# class_mode also determines the loss name: '<class_mode>_crossentropy'
class_mode = 'categorical', # 'binary', #
n_classes = 2,
final_activation = "softmax", # 'sigmoid',
lr = 1e-3,
# also used as the value of samplewise_std_normalization further down
samplewise_center = False, #True
# images are resized to target_side x target_side
target_side = 299,
#weights = None,
# None: no saved weights are loaded before training
weightfile = None, #"checkpoints/6a1a17e4bcaabe458c145fd64dec0322/model.31-1.290145.hdf5",
#"checkpoints/6a1a17e4bcaabe458c145fd64dec0322/model.59-1.676424.hdf5",
# --- data locations ---
data_train = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_train/',
data_val = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_test/',
classes = ["normal", "wire"],
# a list here is passed to fit_generator as-is; 'auto' would trigger get_class_weights
class_weights=[1, 1],
# keyword arguments for the ReduceLROnPlateau callback; a falsy value disables it
ReduceLROnPlateau = dict(
monitor='val_loss',
factor=1/2,
patience=32*2,
verbose=0,
mode='auto', epsilon=0.001,
cooldown=8,
min_lr=1e-12,
),
)
# Hash the configuration; the digest names this run's checkpoint directory.
paramhash = md5(str(prms).encode()).hexdigest()
# Added after hashing, so target_size does not influence paramhash.
prms["target_size"] = [ prms.target_side ]*2
CHECKPOINT_DIR = "checkpoints/" + paramhash + "/"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
print("SAVING TO:\t%s" % CHECKPOINT_DIR)
# copy the script to the checkpoint directory
copy2(os.path.abspath(__file__), CHECKPOINT_DIR)
# Dump the configuration next to the checkpoints. prms["loss"] is set just
# after this dump, so it is not part of it.
with open(os.path.join(CHECKPOINT_DIR, "checkpoint.info"), "w+") as outfh:
yaml.dump(dict(prms), outfh, default_flow_style=False)
# Loss name derived from the class mode, e.g. 'categorical_crossentropy'.
prms["loss"] = '{}_crossentropy'.format( prms.class_mode )
# File name template filled in by ModelCheckpoint. `{val_loss:2f}` is a
# minimum width of 2 with the default six decimals, not two decimals.
CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
# Floor division: a final partial batch is not counted as a step.
STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
print('='*50)
print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
print('='*50)
#########################################
# Callbacks handed to fit_generator: checkpoint on improved val_loss, CSV
# progress log, and optionally ReduceLROnPlateau.
checkpoint = ModelCheckpoint(CHECKPOINT_PATH, monitor='val_loss', verbose=1,
save_best_only=True, save_weights_only=False, mode='auto', period=1)
csv_path = os.path.join(CHECKPOINT_DIR, "progresslog.csv")
csv_callback = CSVWallClockLogger(csv_path, separator=',', append=False)
callback_list = [checkpoint, csv_callback]
if ("ReduceLROnPlateau" in prms) and prms["ReduceLROnPlateau"]:
callback_list.append(ReduceLROnPlateau(**prms["ReduceLROnPlateau"]))
#########################################
# Build the network from the configured head parameters.
model = get_model(n_classes=prms.n_classes,
final_activation=prms.final_activation,
ndense=prms.ndense,
#weights = prms.weights,
dropout=prms.dropout,
base_trainable=prms.base_trainable)
#from keras.utils import plot_model
#plot_model(model, to_file='model.png')
# NOTE(review): indentation is not visible in this view, so it cannot be
# determined how many of the following statements belong to this guard.
if __name__ == '__main__':
# Compile with Adam and the loss named in prms["loss"]; only overall
# accuracy is tracked (the per-class metrics are commented out).
model.compile(optimizer=Adam(lr=prms.lr), loss=prms.loss,
metrics=['accuracy', #acc_0, acc_1,# acc_2, acc_3, acc_4
],
)
#########################################
# Load previously saved weights when a weight file is configured.
if prms.weightfile:
print("loading weights from:\t%s" % prms.weightfile)
model.load_weights(prms.weightfile)
#########################################
# NOTE(review): printed unconditionally, before prms.data_augmentation is checked.
print('Using real-time data augmentation.')
# Keyword arguments shared by every flow_from_directory call below.
flowfromdir_params = dict(
#color_mode = "grayscale",
target_size=prms.target_size,
batch_size=prms.batch_size,
class_mode=prms.class_mode,
classes=prms.classes,
seed=prms.seed)
# Normalisation arguments shared by the training and validation generators.
norm_params = dict(
rescale=prms.rescale,
samplewise_center=prms.samplewise_center,
# NOTE(review): fed from prms.samplewise_center, and prms has no
# samplewise_std_normalization key -- confirm this coupling is intended.
samplewise_std_normalization=prms.samplewise_center,
featurewise_center=False,
featurewise_std_normalization=False,
zca_whitening=False,
# presumably handled by the project-local `image` ImageDataGenerator -- confirm
z_transform = prms.ztransform,
)
def _ztransform(x):
return (x-np.mean(x)) / np.std(x)
# Resolve the optional per-image preprocessing function named in the config.
# NOTE(review): `preprocessing_function` is not passed to any generator in
# this script -- confirm whether it is still needed.
if 'preprocessing_function' not in prms:
    preprocessing_function = lambda x: x
elif prms.preprocessing_function == 'ztransform':
    preprocessing_function = _ztransform
elif prms.preprocessing_function == 'm1p1':
    preprocessing_function = lambda x: x/128.0 - 1
else:
    raise ValueError("unknown preprocessing_function")

# The training generator always receives the normalisation arguments; the
# augmentation arguments are added only when augmentation is switched on.
_train_datagen_kwargs = dict(norm_params)
if prms.data_augmentation:
    print('Using real-time data augmentation.')
    _train_datagen_kwargs.update(
        zoom_range=prms.zoom_range,
        fill_mode=prms.fill_mode,
        rotation_range=prms.rotation_range,
        width_shift_range=prms.width_shift_range,
        height_shift_range=prms.height_shift_range,
        horizontal_flip=prms.horizontal_flip,
        vertical_flip=prms.vertical_flip,
        contrast=prms.contrast if "contrast" in prms else None,
        truncate_quantile=prms.truncate_quantile,
        #histeq_alpha=prms.histeq_alpha,
    )
train_datagen = ImageDataGenerator(**_train_datagen_kwargs)
# Validation images get the normalisation only, no augmentation.
val_datagen = ImageDataGenerator(**norm_params)

# Shuffled training stream; the oversampling arguments are only meaningful
# (and prms.sampling_factor is only read) when prms.oversampling is truthy.
datagen_train_output = train_datagen.flow_from_directory(
    prms.data_train,
    stratify=prms.oversampling,
    sampling_factor=prms.sampling_factor if prms.oversampling else None,
    oversampling=prms.oversampling,
    shuffle=True, **flowfromdir_params)

datagen_val_output = val_datagen.flow_from_directory(
    prms.data_val, shuffle=False, **flowfromdir_params)

#VALIDATION_STEPS = get_num_files(prms.data_val) // prms.batch_size
# Enough steps to cover every validation file once, rounding the last partial
# batch up.  np.ceil returns a NumPy float; it is cast to int because a step
# count is a whole number and the prediction scripts do the same.
VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames) / prms['batch_size']))
print("validation steps", VALIDATION_STEPS)
#########################################

# NOTE(review): in 'auto' mode the weights are derived from the validation
# iterator, not the training one -- confirm this is intended.
if prms.class_weights == 'auto':
    class_weights = get_class_weights(datagen_val_output)
else:
    class_weights = prms.class_weights

model.fit_generator(datagen_train_output,
                    steps_per_epoch=STEPS_PER_EPOCH,
                    epochs=prms.nb_epoch, verbose=1,
                    validation_data=datagen_val_output,
                    validation_steps=VALIDATION_STEPS,
                    #class_weight='auto',
                    class_weight=class_weights,
                    callbacks=callback_list,
                    initial_epoch=prms.init_epoch)

# Fresh, unshuffled validation iterator for the final evaluation.
datagen_val_output = val_datagen.flow_from_directory(
    prms.data_val, shuffle=False, **flowfromdir_params)

# evaluate_generator yields one value per compiled quantity (loss, accuracy),
# matching the two placeholders of the format string.
print("""loss\t%.4f
accuracy\t%.4f\n""" %
      tuple(model.evaluate_generator(datagen_val_output,
                                     steps=VALIDATION_STEPS,
                                     workers=1,
                                     pickle_safe=True)))
#model.predict()
File diff suppressed because it is too large Load Diff
+245
View File
@@ -0,0 +1,245 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 9 11:00:55 2017
@author: dlituiev
"""
import os
from collections import Counter
from functools import partial
from itertools import product
import keras
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D, GaussianNoise, Input
from keras import backend as K
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping
from keras.layers import Dense, Dropout, Activation, Flatten, Lambda, BatchNormalization, Input
from keras.optimizers import Adam
#########################################
def get_num_files(parentdir):
    """Count the entries found one level below ``parentdir``.

    Every immediate subdirectory of ``parentdir`` is listed and all of its
    entries are counted, whatever their kind.  Entries that sit directly in
    ``parentdir`` are not counted.

    Parameters
    ----------
    parentdir : str
        Directory whose subdirectories are scanned; may be relative or absolute.

    Returns
    -------
    int
        Total number of entries across the immediate subdirectories.
    """
    numfiles = 0
    for name in os.listdir(parentdir):
        # Join the bare entry name.  Joining a scandir entry (whose path
        # already contains `parentdir`) doubled the prefix for relative
        # paths, so no subdirectory was ever recognised.
        subdir = os.path.join(parentdir, name)
        if os.path.isdir(subdir):
            numfiles += len(os.listdir(subdir))
    return numfiles
#########################################
#########################################
# SET UP THE NETWORK
#########################################
# Build a classifier on top of a truncated InceptionV3.
#
# Arguments, as used in the body:
#   n_classes            -- number of units of the final Dense layer
#   final_activation     -- activation of the final Dense layer
#   ndense               -- units of an optional ReLU Dense layer placed before
#                           the output; added only when > 0
#   dropout              -- rate given to the Dropout layer after pooling
#   weights              -- forwarded to InceptionV3(weights=...)
#   input_shape          -- when truthy, a fresh Input of this shape replaces
#                           `input_tensor`
#   gaussian_noise_sigma -- when not None, a GaussianNoise layer with this value
#                           is applied to the input tensor
#   input_tensor         -- used as given only when `input_shape` is falsy
#   base_trainable       -- when falsy, the base model's layers are frozen
# Returns the assembled keras Model; it is not compiled here.
def get_model(n_classes, final_activation,
ndense=512, dropout=0.5,
weights='imagenet',
input_shape = [None, None, 3],
gaussian_noise_sigma = None,
input_tensor = None,
base_trainable=False):
# NOTE(review): the default input_shape is truthy, so a caller-supplied
# input_tensor is overwritten unless input_shape is passed as None/empty.
if input_shape:
input_tensor = Input(shape = input_shape)
if gaussian_noise_sigma is not None:
input_tensor = GaussianNoise(gaussian_noise_sigma)(input_tensor)
# create the base pre-trained model
base_model = InceptionV3(weights=weights, include_top=False,
input_tensor = input_tensor,
)
# Walk the layers and stop at the Concatenate layer seen when the counter
# equals 3, i.e. the fourth one encountered; its output becomes `x`.
# NOTE(review): the earlier comment said "third" -- confirm which is meant.
cc=0
# `poptherest` is not read anywhere in this function
poptherest = False
for nn, la in enumerate(base_model.layers):
if type(la) is keras.layers.Concatenate:
if cc==3:
x = la.output
break
cc+=1
# keep only the layers up to and including the selected Concatenate
base_model.layers = base_model.layers[:nn+1]
#x = [la.output for la in base_model.layers if type(la) is keras.layers.Concatenate][3]
x = GlobalAveragePooling2D()(x)
# dropout on the pooled features, then an optional fully-connected layer
x = Dropout(dropout)(x)
if ndense>0:
x = Dense(ndense, activation='relu')(x)
# output layer with n_classes units and the requested activation
predictions = Dense(n_classes, activation=final_activation)(x)
# this is the model we will train
model = Model(inputs=base_model.input, outputs=predictions)
# first: train only the top layers (which were randomly initialized)
# i.e. freeze all convolutional InceptionV3 layers
if not base_trainable:
for layer in base_model.layers:
layer.trainable = False
# Re-enable training from the second-to-last Concatenate of the final model
# onwards.
# NOTE(review): indentation is not visible in this view, so it is unclear
# whether this re-enabling is conditional on `not base_trainable`.
last_module_index = [nn for nn,la in enumerate(model.layers) if type(la) is keras.layers.Concatenate][-2]
for layer in model.layers[last_module_index:]:
layer.trainable = True
return model
def get_class_weights(datagen_val_output):
    """Derive per-class loss weights from a directory iterator's labels.

    The label frequencies found in ``datagen_val_output.classes`` are printed,
    incremented by one each, and turned into weights equal to the largest
    incremented count divided by the class's own incremented count.

    Parameters
    ----------
    datagen_val_output : object
        Anything exposing ``classes`` (iterable of hashable class ids) and
        ``directory`` (used only in the printed message).

    Returns
    -------
    dict
        Mapping ``class_id -> weight``; the most frequent class gets 1.0.
    """
    label_counts = Counter(datagen_val_output.classes)
    print("distribution of labels in {}:\n{}".format(datagen_val_output.directory, str(label_counts)))
    # add-one smoothing of the raw counts
    smoothed = {class_id: count + 1 for class_id, count in label_counts.items()}
    largest = float(max(smoothed.values()))
    class_weights = {}
    for class_id, num_images in smoothed.items():
        class_weights[class_id] = largest / num_images
    return class_weights
def w_categorical_crossentropy(weights):
    """Build a categorical cross-entropy loss scaled by a cost matrix.

    For every sample the plain categorical cross-entropy is multiplied by
    ``weights[c_t, c_p]``, where ``c_t`` is the class marked in ``y_true``
    and ``c_p`` is the class holding the maximum of ``y_pred``.

    Parameters
    ----------
    weights : 2-D array-like
        Indexed as ``weights[true_class, predicted_class]``; ``len(weights)``
        is taken as the number of classes.

    Returns
    -------
    callable
        A ``(y_true, y_pred)`` loss function named
        ``'w_categorical_crossentropy'``.
    """
    def _w_categorical_crossentropy(y_true, y_pred, weights):
        nb_cl = len(weights)
        final_mask = K.zeros_like(y_pred[:, 0])
        # boolean matrix flagging, per row, the position(s) of the max score
        y_pred_max = K.expand_dims(K.max(y_pred, axis=1), 1)
        y_pred_max_mat = K.equal(y_pred, y_pred_max)
        for c_p, c_t in product(range(nb_cl), range(nb_cl)):
            final_mask += (K.cast(weights[c_t, c_p], K.floatx()) *
                           K.cast(y_pred_max_mat[:, c_p], K.floatx()) *
                           K.cast(y_true[:, c_t], K.floatx()))
        # The backend signature is (target, output) in Keras >= 2.0.7;
        # passing (y_pred, y_true) would swap labels and predictions.
        return K.categorical_crossentropy(y_true, y_pred) * final_mask

    ncce = partial(_w_categorical_crossentropy, weights=weights)
    # Keras reports losses by __name__, which partial objects lack by default
    ncce.__name__ = 'w_categorical_crossentropy'
    return ncce
if __name__ == '__main__':
    # Training entry point: configures the run, builds and compiles the
    # truncated-InceptionV3 classifier, and fits it on PNG images read from
    # class-labelled directories, saving a checkpoint after every epoch.
    import numpy as np
    import keras
    #csv_path = CHECKPOINTS_BASE + ".log.csv"
    #csv_callback = keras.callbacks.CSVLogger(csv_path, separator=',', append=False)

    # NOTE(review): keras is already imported at the top of the file, so
    # KERAS_BACKEND is presumably set too late to influence backend
    # selection -- confirm and move before the imports if it matters.
    os.environ['KERAS_BACKEND'] = 'tensorflow'
    os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
    os.environ["CUDA_VISIBLE_DEVICES"] = '2'

    # ---- hyper-parameters -------------------------------------------------
    NDENSE=256 #512
    BATCH_SIZE = 128
    NB_EPOCH = 20
    DATA_AUGMENTATION = True
    SEED=0
    CLASS_MODE = 'binary' # 'categorical'
    LOSS = '{}_crossentropy'.format(CLASS_MODE)
    N_CLASSES = 1
    FINAL_ACTIVATION = 'sigmoid'
    LR = 0.0001
    SAMPLEWISE_CENTER = False #True
    TARGET_SIDE = 99
    TARGET_SIZE = [TARGET_SIDE]*2
    BASE_TRAINABLE=False

    # ---- checkpoint locations ---------------------------------------------
    CHECKPOINT_DIR = "./modelstate_withx_negloglr{:d}_ndense{:d}_imsize{:d}{}/" .format(
        int(-np.log10(LR)),
        NDENSE,
        TARGET_SIDE,
        "" if not BASE_TRAINABLE else "_base_trainable"
    )
    CHECKPOINT_PATH = CHECKPOINT_DIR + 'model.{epoch:02d}-{val_loss:2f}.hdf5'
    WEIGHTFILE = None # "./modelstate_withx_negloglr4_ndense256/model.39-0.060567.hdf5" # None # "./modelstate_withx/model.03-0.067136.hdf5"
    # "modelstate_laplace_inv_weights_2/model.10-0.014968.hdf5" #CHECKPOINT_DIR + "model.10-0.019602.hdf5"
    INIT_EPOCH=0
    # indir = "/data/dlituiev/learn_spotmag_from_images/modelstate/"
    # find_min_loss_checkpoint(indir)

    # ---- data locations and epoch sizing ----------------------------------
    DATA_TRAIN = '/data/UCSF_MAMMO/2017-07-png/withx_valset_4000_train/'
    DATA_VAL = '/data/UCSF_MAMMO/2017-07-png/withx_valset_4000_test/'
    SAMPLES_PER_EPOCH = get_num_files(DATA_TRAIN)
    STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // BATCH_SIZE
    CLASSES = ["normal", "special"]
    VALIDATION_STEPS = get_num_files(DATA_VAL) // BATCH_SIZE
    print('='*50)
    print("validation steps", VALIDATION_STEPS)
    print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
    print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
    print('='*50)
    #########################################
    os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)
    checkpoint = ModelCheckpoint(CHECKPOINT_PATH, monitor='val_loss', verbose=1,
                                 save_best_only=False, save_weights_only=False,
                                 mode='auto', period=1)
    callbacks_list =[checkpoint]
    #########################################
    model = get_model(n_classes=N_CLASSES,
                      final_activation=FINAL_ACTIVATION,
                      ndense=NDENSE,
                      dropout=0.5,
                      base_trainable=BASE_TRAINABLE)
    #from keras.utils import plot_model
    #plot_model(model, to_file='model.png')

    # Callbacks belong to fit_generator(), not compile(); the former
    # `callbacks=[csv_callback]` argument referenced a name that is only
    # defined in the commented-out lines above and raised NameError.
    model.compile(optimizer=Adam(lr=LR), loss=LOSS, metrics=['accuracy'])
    #########################################
    if WEIGHTFILE:
        print("loading weights from:\t%s" % WEIGHTFILE)
        model.load_weights(WEIGHTFILE)

    print('Using real-time data augmentation.')
    flowfromdir_params = dict(
        #color_mode = "grayscale",
        target_size=TARGET_SIZE,
        batch_size=BATCH_SIZE,
        class_mode=CLASS_MODE,
        classes=CLASSES,
        seed=SEED)
    train_datagen = ImageDataGenerator(
        samplewise_center=SAMPLEWISE_CENTER,
        samplewise_std_normalization=SAMPLEWISE_CENTER,
        featurewise_center=False,
        featurewise_std_normalization=False,
        zca_whitening=False,
        rotation_range=10,
        width_shift_range=0.125,
        height_shift_range=0.125,
        horizontal_flip=True,
        vertical_flip=False)
    # Validation images get no augmentation, but must share the sample-wise
    # normalisation of the training images so both see the same value range.
    val_datagen = ImageDataGenerator(
        samplewise_center=SAMPLEWISE_CENTER,
        samplewise_std_normalization=SAMPLEWISE_CENTER)
    datagen_train_output = train_datagen.flow_from_directory(
        DATA_TRAIN, shuffle=True, **flowfromdir_params)
    datagen_val_output = val_datagen.flow_from_directory(
        DATA_VAL, shuffle=False, **flowfromdir_params)

    # NOTE(review): the class weights are derived from the validation
    # iterator, not the training one -- confirm this is intentional.
    class_weights = get_class_weights(datagen_val_output)
    model.fit_generator(datagen_train_output,
                        steps_per_epoch=STEPS_PER_EPOCH,
                        epochs=NB_EPOCH, verbose=1,
                        validation_data=datagen_val_output,
                        validation_steps=VALIDATION_STEPS,
                        #class_weight='auto',
                        class_weight=class_weights,
                        callbacks=callbacks_list,
                        initial_epoch=INIT_EPOCH)
    #model.predict()
File diff suppressed because one or more lines are too long
+23
View File
@@ -0,0 +1,23 @@
Cython==0.27.3
h5py==2.7.0
imgaug==0.2.5
Keras==2.0.8
-e git+https://github.com/raghakot/keras-vis@40b27dfa3ecb84cdde5ec6b44251923c3266cc40#egg=keras_vis
lime==0.1.1.29
matplotlib==2.0.2
mudicom==0.1.2
numpy==1.14.0
opencv-python==3.3.0.10
pandas==0.20.2
Pillow==4.1.1
pyaml==17.7.2
-e git+https://github.com/cocodataset/cocoapi/@727b546dd9fa4e4bb113213c98a3925829fac0bf#egg=pycocotools&subdirectory=PythonAPI
pydicom==0.9.9
PyYAML==3.12
scikit-image==0.13.0
scikit-learn==0.18.1
scipy==0.19.1
seaborn==0.7.1
sklearn==0.0
tensorflow-gpu==1.4.1
tensorflow-tensorboard==0.4.0rc3
File diff suppressed because one or more lines are too long