mirror of
https://github.com/wassname/mammoviews.git
synced 2026-06-27 16:10:25 +08:00
initial
This commit is contained in:
@@ -0,0 +1,3 @@
|
|||||||
|
|
||||||
|
**/*.hdf5
|
||||||
|
**/*.csv
|
||||||
@@ -0,0 +1,42 @@
|
|||||||
|
# Code for automatic labeling of special diagnostic mammography views from images and DICOM headers
|
||||||
|
|
||||||
|
## DICOM
|
||||||
|
### Extract selected fields from DICOM headers
|
||||||
|
|
||||||
|
dicom_header_extraction/extract_dicom_headers_w_generator_150K.py
|
||||||
|
|
||||||
|
### Normalize / expand data
|
||||||
|
|
||||||
|
dicom_header_extraction/normalize_selected_dcm_headers.py
|
||||||
|
|
||||||
|
### Machine learning on DICOM headers
|
||||||
|
|
||||||
|
caret_on_headers.R # most methods
|
||||||
|
caret_on_headers_nona.R # GLMNET
|
||||||
|
|
||||||
|
## Image pipeline
|
||||||
|
|
||||||
|
### General image model
|
||||||
|
- scripts and config files: `image_classifiers/e5ce2d69b035975cb5336cec0da9a32a`
|
||||||
|
|
||||||
|
- weight files:
|
||||||
|
|
||||||
|
### Wire localization model
|
||||||
|
|
||||||
|
- scripts and config files: `image_classifiers/e8e71fc090141d7c6fb334359152d295`
|
||||||
|
|
||||||
|
- weight files:
|
||||||
|
|
||||||
|
|
||||||
|
## Visualization of performance metrics
|
||||||
|
Scripts used to generate Fig. 1
|
||||||
|
|
||||||
|
combine_predictions_hdr_and_img.ipynb
|
||||||
|
visualize_predictions_hdr_and_img.ipynb
|
||||||
|
|
||||||
|
|
||||||
|
## Significance tests
|
||||||
|
Scripts used to generate Supplementary Figures S1 & S2
|
||||||
|
|
||||||
|
calc_auroc_confidence_intervals.R
|
||||||
|
plot_auroc_difference_pvalue.ipynb
|
||||||
@@ -0,0 +1,169 @@
|
|||||||
|
rm(list=ls())
|
||||||
|
library(pROC)
|
||||||
|
library(ggplot2)
|
||||||
|
library(ggsignif)
|
||||||
|
library(dplyr)
|
||||||
|
library(data.table)
|
||||||
|
#' Read a gzip-compressed delimited table into a plain data.frame.
#'
#' The file is decompressed on the fly by piping it through `zcat`; the
#' resulting stream is parsed by `fread()`.
#'
#' @param filename Path to the gzip-compressed file.
#' @param ... Further arguments forwarded to `fread()` (e.g. `sep`, `select`).
#' @return A `data.frame`. The first line is always read as the header and
#'   `fill = TRUE` is always passed to `fread()`.
read.gz <- function(filename, ...) {
  # Expand "~" ourselves and quote the path so that names containing spaces
  # or shell metacharacters reach zcat as a single argument. Passing the
  # command through `cmd =` keeps fread() from guessing whether the string
  # is a file name or a shell command.
  zcat_cmd <- paste("zcat <", shQuote(path.expand(filename)))
  as.data.frame(fread(cmd = zcat_cmd, header = TRUE, fill = TRUE, ...))
}
|
||||||
|
|
||||||
|
|
||||||
|
tag <- "e5ce2d69b035975cb5336cec0da9a32a"
|
||||||
|
# Tab-separated table of per-image scores for the run identified by `tag`.
# (The untagged file "../tables/all_predictions_with_images.tab" was assigned
# here before and immediately overwritten, so that dead assignment is gone.)
fnall <- paste0("../tables/all_predictions_with_images-", tag, ".tab")

# `header` and `fill` are fread() options; they used to sit outside the
# fread() call, where as.data.frame() silently ignored them.
predictions <- as.data.frame(fread(fnall, sep = '\t', header = TRUE, fill = TRUE))
|
||||||
|
|
||||||
|
# Row mask: TRUE where a non-empty label is present. nchar() is already
# vectorised, so no element-wise sapply() is needed (which would return a
# list for a zero-row table). Missing labels count as unlabelled instead of
# putting NA into the mask, which would create all-NA rows when subsetting.
labelled <- !is.na(predictions$label) & nchar(predictions$label) > 0
|
||||||
|
|
||||||
|
print(nrow(predictions[labelled,]))
|
||||||
|
predictions <- predictions[labelled,]
|
||||||
|
|
||||||
|
|
||||||
|
predictions[,'ViewModifier'] <- as.numeric(predictions[,'ViewModifier']!='')
|
||||||
|
|
||||||
|
predictions[, "label"] <- factor(predictions[, "label"], c('normal', 'special'))
|
||||||
|
|
||||||
|
predictions[,"view"] <- factor(predictions[,"view"], c('N','M','T','W','X'))
|
||||||
|
head(predictions)
|
||||||
|
# Rows of the 'val' split. This assignment was commented out although the
# plot below reads `holdout`, which made the script fail with
# "object 'holdout' not found" right after rm(list=ls()).
holdout <- predictions[predictions$set == 'val',]

# Scatter of the combined wire/image/gbmt score against the view code.
ggplot(holdout, aes(view, `score_max_wire_image+gbmt`)) + geom_point()
|
||||||
|
|
||||||
|
validation <- predictions[predictions$set == 'test',]
|
||||||
|
|
||||||
|
clmns <- colnames(predictions)
|
||||||
|
|
||||||
|
othercols <- c('id', 'set', 'view', 'label')
|
||||||
|
modelnames <- c('ViewModifier', 'rpart', 'gbm', 'glmnet','xgb', 'gbmt',
|
||||||
|
'image',
|
||||||
|
'image_max',
|
||||||
|
'wire',
|
||||||
|
'wire_max',
|
||||||
|
'max_image_wire_max',
|
||||||
|
'image+gbmt',
|
||||||
|
'max_wire_max_image+gbmt',
|
||||||
|
'max_image_wire',
|
||||||
|
'max_wire_image+gbmt')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#' Strip the "score_" marker from column names.
#'
#' @param x Character vector of column names.
#' @return `x` with every occurrence of "score_" removed; names without the
#'   marker are returned unchanged.
clean_score_names <- function(x) {
  # Alternative kept from the original author:
  # paste(strsplit(x, '_')[[1]][-1],collapse='_')
  stripped <- gsub(pattern = 'score_', replacement = '', x = x)
  stripped
}
|
||||||
|
|
||||||
|
clmns_clean <- vapply(clmns, clean_score_names, '')
|
||||||
|
|
||||||
|
cols_ <- factor(vapply(colnames(predictions) , clean_score_names, ''),
|
||||||
|
c(othercols,modelnames))
|
||||||
|
|
||||||
|
colnames(validation) <- cols_
|
||||||
|
|
||||||
|
validation <- validation[,!is.na(colnames(validation))]
|
||||||
|
|
||||||
|
cols_ <- cols_[!is.na(cols_)]
|
||||||
|
cols_ <- cols_[order(cols_)]
|
||||||
|
|
||||||
|
validation <- validation[,as.character(cols_)]
|
||||||
|
|
||||||
|
colnames(validation)
|
||||||
|
# clmns <-clmns[vapply(clmns, function(x) strsplit(x, '_')[[1]][1]=='score', TRUE)]
|
||||||
|
|
||||||
|
## Perform McNemar's test for prediction difference ----------------------------------------------------
|
||||||
|
|
||||||
|
mcnemar.test(table(validation$`max_wire_max_image+gbmt`>0.5, validation$max_image_wire_max>0.5))
|
||||||
|
|
||||||
|
mcnemar.test(table(validation$`max_wire_max_image+gbmt`>0.5, validation$gbmt>0.5))
|
||||||
|
|
||||||
|
## Calculate significance of pairwise auROC differences -----------------------------------------------
|
||||||
|
# For every model column: draw the ROC curve of its score against the
# normal/special label on the evaluation rows, keep the ROC object for the
# later pairwise tests, and store the confidence interval of its AUC.
# (The counter `ii` that was initialised here was never read and is removed.)
cis <- list()
rocobjects <- list()
for (clmn in modelnames) {
  print('====================')
  print(clmn)
  rocobj <- plot.roc(validation[, "label"],
                     validation[, clmn],
                     levels = (levels(validation[, "label"])),
                     xlim = c(100, 0),
                     ylim = c(0, 100),
                     percent = TRUE,
                     print.auc = TRUE)
  rocobjects[[clmn]] <- rocobj
  cis[[clmn]] <- ci(rocobj, of = "auc", thresholds = "best")
}
|
||||||
|
|
||||||
|
## Wire model on wire cases
|
||||||
|
# Score the two wire-model columns against a different response: whether the
# image is a wire view (view == 'W') rather than the normal/special label.
for (clmn in c('wire', 'wire_max')) {
  print('====================')
  print(clmn)
  # Both the ROC object and its CI go under the suffixed key. Storing the
  # ROC object under plain `clmn` replaced the label-based curve already
  # kept in `rocobjects` for 'wire' / 'wire_max', so later comparisons for
  # those names silently used the wrong response.
  key <- paste0(clmn, ' (vs other views)')
  rocobj <- plot.roc(validation[, "view"] == 'W',
                     validation[, clmn],
                     # levels = (levels(validation[, "label"])),
                     xlim = c(100, 0),
                     ylim = c(0, 100),
                     percent = TRUE,
                     print.auc = TRUE)
  rocobjects[[key]] <- rocobj
  cis[[key]] <- ci(rocobj, of = "auc", thresholds = "best")
}
|
||||||
|
###
|
||||||
|
modelnames <- c('ViewModifier', 'rpart', 'gbm', 'glmnet','xgb', 'gbmt',
|
||||||
|
'image', "image_max",
|
||||||
|
'wire', 'wire_max',
|
||||||
|
'wire (vs other views)', 'wire_max (vs other views)',
|
||||||
|
'max_image_wire_max',
|
||||||
|
'image+gbmt',
|
||||||
|
'max_wire_max_image+gbmt')
|
||||||
|
|
||||||
|
##
|
||||||
|
|
||||||
|
dfcis <- as.data.frame(t(do.call(cbind.data.frame, lapply(cis, as.vector))))
|
||||||
|
colnames(dfcis) <- c('lower', 'auROC', 'upper')
|
||||||
|
|
||||||
|
dfcis[,"model"] <- factor(rownames(dfcis),
|
||||||
|
modelnames)
|
||||||
|
|
||||||
|
dfcis <- dfcis[!is.na(dfcis[,"model"]),]
|
||||||
|
|
||||||
|
rownames(dfcis) <- dfcis[,"model"]
|
||||||
|
|
||||||
|
dfcis <- dfcis[modelnames,]
|
||||||
|
|
||||||
|
|
||||||
|
# dfcis <-dfcis %>% mutate(model = factor(model, levels=rev(levels(model))))
|
||||||
|
dfcis_nowire <- dfcis[!(rownames(dfcis) %in% c('wire','wire_max')),]
|
||||||
|
dfcis_nowire$model <- factor(dfcis_nowire$model)
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# annotation_df <- data.frame(color=c("E", "H"),
|
||||||
|
# start=c("Good", "Fair"),
|
||||||
|
# end=c("Very Good", "Good"),
|
||||||
|
# y=c(3.6, 4.7),
|
||||||
|
# label=c("Comp. 1", "Comp. 2"))
|
||||||
|
|
||||||
|
roc.test(rocobjects[["ViewModifier"]], rocobjects[["gbmt"]])
|
||||||
|
|
||||||
|
## Format Pairwise comparisons
|
||||||
|
|
||||||
|
# Lower-triangular table of DeLong p-values for every pair of stored ROC
# curves; rows and columns are named by the keys of `rocobjects`.
keys <- names(rocobjects)
dfcompar <- data.frame()
# seq_along()/seq_len() do not iterate when `rocobjects` is empty, whereas
# 1:length(rocobjects) would run over c(1, 0) and index with a missing key.
for (a in seq_along(rocobjects)) {
  for (b in seq_len(a)) {
    row_key <- keys[a]
    col_key <- keys[b]
    # Pairs involving a curve whose AUC equals 100 (percent scale) are
    # recorded as NA instead of being tested.
    if ((as.numeric(rocobjects[[row_key]]$auc) == 100) ||
        (as.numeric(rocobjects[[col_key]]$auc) == 100)) {
      dfcompar[row_key, col_key] <- NA
    } else {
      dfcompar[row_key, col_key] <- roc.test(rocobjects[[row_key]],
                                             rocobjects[[col_key]],
                                             method = 'delong')$p.value
    }
  }
}
|
||||||
|
|
||||||
|
|
||||||
|
fn.comparison <- paste0("../tables/auroc_delong_comparison-", tag,".csv")
|
||||||
|
write.csv(dfcompar, file=fn.comparison)
|
||||||
@@ -0,0 +1,284 @@
|
|||||||
|
# coding: utf-8
|
||||||
|
rm(list=ls())
|
||||||
|
|
||||||
|
library(caret)
|
||||||
|
library(gbm3)
|
||||||
|
library(data.table)
|
||||||
|
library(ggplot2)
|
||||||
|
library(fastmatch)
|
||||||
|
|
||||||
|
#' Read a gzip-compressed delimited table into a plain data.frame.
#'
#' The file is decompressed on the fly by piping it through `zcat`; the
#' resulting stream is parsed by `fread()`.
#'
#' @param filename Path to the gzip-compressed file.
#' @param ... Further arguments forwarded to `fread()` (e.g. `sep`, `select`).
#' @return A `data.frame`. The first line is always read as the header and
#'   `fill = TRUE` is always passed to `fread()`.
read.gz <- function(filename, ...) {
  # Expand "~" ourselves and quote the path so that names containing spaces
  # or shell metacharacters reach zcat as a single argument. Passing the
  # command through `cmd =` keeps fread() from guessing whether the string
  # is a file name or a shell command.
  zcat_cmd <- paste("zcat <", shQuote(path.expand(filename)))
  as.data.frame(fread(cmd = zcat_cmd, header = TRUE, fill = TRUE, ...))
}
|
||||||
|
|
||||||
|
TABLEDIR = "../tables/"
|
||||||
|
fn_ids = paste(TABLEDIR,
|
||||||
|
"2017-06-mammo_tables/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl.csv.gz", sep='/')
|
||||||
|
|
||||||
|
ids = read.gz(fn_ids, select="id")$id
|
||||||
|
|
||||||
|
fn_features = paste(TABLEDIR, "mammo_dicom_headers/df_all_mammos_dicom_headers_selected_expanded.tab.gz", sep='/')
|
||||||
|
dffeatures = read.gz(fn_features, sep='\t')
|
||||||
|
print(nrow(dffeatures))
|
||||||
|
print(length(ids))
|
||||||
|
|
||||||
|
dffeatures <- dffeatures[fmatch(unique(ids), dffeatures$filename),]
|
||||||
|
dffeatures <- dffeatures[!is.na(dffeatures$filename),]
|
||||||
|
rm(ids)
|
||||||
|
|
||||||
|
# Data formatting -----------------------------------------
|
||||||
|
|
||||||
|
# Header fields that are converted to numeric before modelling.
collist = c("BodyPartThickness", "XRayTubeCurrentInuA", "ContentTime",
            "DetectorTemperature", "WindowCenter", "FieldOfViewRotation")
# Convert all listed columns in one assignment instead of a column-by-column
# loop; as.numeric() is applied to each column exactly as before.
dffeatures[collist] <- lapply(dffeatures[collist], as.numeric)
|
||||||
|
|
||||||
|
|
||||||
|
dtypes = sapply(dffeatures, class)
|
||||||
|
names(dtypes[dtypes == 'character'])
|
||||||
|
|
||||||
|
|
||||||
|
row.names(dffeatures) = dffeatures$filename
|
||||||
|
excludeCols <- c("filename",
|
||||||
|
"CollimatorLeftVerticalEdge",
|
||||||
|
"CollimatorLowerHorizontalEdge",
|
||||||
|
"DistanceSourceToEntrance",
|
||||||
|
"ExposuresOnDetectorSinceLastCalibration",
|
||||||
|
"ExposuresOnDetectorSinceManufactured",
|
||||||
|
"ShutterLowerHorizontalEdge",
|
||||||
|
"ShutterRightVerticalEdge",
|
||||||
|
"XRayTubeCurrentInuA"
|
||||||
|
# "ManufacturerModelName"
|
||||||
|
)
|
||||||
|
dffeatures <- (dffeatures[, !(colnames(dffeatures) %in% excludeCols)])
|
||||||
|
|
||||||
|
|
||||||
|
catcols <- c('ViewModifierCodeMeaning',
|
||||||
|
'ViewCodeValue',
|
||||||
|
'DetectorActiveDimensionsMissing',
|
||||||
|
'FieldOfViewOriginMissing',
|
||||||
|
'Grid',
|
||||||
|
'Manufacturer',
|
||||||
|
'ManufacturerModelName')
|
||||||
|
|
||||||
|
for (cc in catcols){
|
||||||
|
dffeatures[,cc] = as.factor(dffeatures[,cc])
|
||||||
|
}
|
||||||
|
#cell#
|
||||||
|
|
||||||
|
colSums(sapply(dffeatures, is.na))
|
||||||
|
|
||||||
|
# Read labels --------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
fn.labelledset = paste(TABLEDIR, "spotmag_predictions/train_test_split-2018-02-15-within7e5.csv", sep='/')
|
||||||
|
# filelist.labelled = read.table(fn.labelledset, )
|
||||||
|
df.labelled = as.data.frame(fread(fn.labelledset))
|
||||||
|
rownames(df.labelled) <- df.labelled$id
|
||||||
|
vec.labelled = df.labelled$id
|
||||||
|
df.labelled$label <- as.factor(df.labelled$label)
|
||||||
|
|
||||||
|
#cell#
|
||||||
|
|
||||||
|
vec.labelled.valset = rownames(df.labelled[df.labelled$set == 'val',])
|
||||||
|
vec.labelled.tr_set = rownames(df.labelled[df.labelled$set == 'train',])
|
||||||
|
vec.labelled.ts_set = rownames(df.labelled[df.labelled$set == 'test',])
|
||||||
|
|
||||||
|
############################################################
|
||||||
|
|
||||||
|
dffeatures.labelled <- dffeatures[vec.labelled,]
|
||||||
|
dffeatures.labelled$label <- df.labelled$label
|
||||||
|
|
||||||
|
#cell#
|
||||||
|
|
||||||
|
dffeatures.labelled.devset <- dffeatures.labelled[!(rownames(dffeatures.labelled) %in% vec.labelled.valset),]
|
||||||
|
dffeatures.labelled.tr_set <- dffeatures.labelled[vec.labelled.tr_set,]
|
||||||
|
dffeatures.labelled.ts_set <- dffeatures.labelled[vec.labelled.ts_set,]
|
||||||
|
|
||||||
|
colnames(dffeatures.labelled.tr_set)
|
||||||
|
|
||||||
|
|
||||||
|
for (cc in colnames(dffeatures.labelled.tr_set)){
|
||||||
|
if (is.factor(dffeatures.labelled.tr_set[,cc]) ){
|
||||||
|
setdiff_ = setdiff(dffeatures.labelled.ts_set[,cc], dffeatures.labelled.tr_set[,cc])
|
||||||
|
if (length(setdiff_)>0){
|
||||||
|
print(cc)
|
||||||
|
print(setdiff_)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# GBM3 ----------------------------------------
|
||||||
|
|
||||||
|
par_detail <- gbmParallel(num_threads = 4) # Pass to par_details in gbmt
|
||||||
|
gbmt_fit <- gbmt(label ~ .,
|
||||||
|
data = dffeatures.labelled.tr_set,
|
||||||
|
cv_folds = 10,
|
||||||
|
# training_params = training_params(num_trees = 100,
|
||||||
|
# interaction_depth = 1,
|
||||||
|
# min_num_obs_in_node = 10,
|
||||||
|
# shrinkage = 0.005,
|
||||||
|
# bag_fraction = 0.5,
|
||||||
|
# num_features = 2),
|
||||||
|
keep_gbm_data = TRUE,
|
||||||
|
par_detail=par_detail)
|
||||||
|
|
||||||
|
best_iter_cv <- gbmt_performance(gbmt_fit, method='cv')
|
||||||
|
plot(best_iter_cv)
|
||||||
|
|
||||||
|
best.iter.oob <- gbmt_performance(gbmt_fit,method="OOB") # returns out-of-bag estimated best number of trees
|
||||||
|
plot(best.iter.oob)
|
||||||
|
|
||||||
|
saveRDS(gbmt_fit, sprintf("gbm3_ntrees_%d_%s.rds", best_iter_cv, Sys.Date()))
|
||||||
|
|
||||||
|
## Feature Importance Plotting ----------------
|
||||||
|
|
||||||
|
infl_gbmt <- (as.data.frame(relative_influence(gbmt_fit, best_iter_cv, rescale=T)))
|
||||||
|
colnames(infl_gbmt) <- "relative influence"
|
||||||
|
infl_gbmt[,"variable"] <- rownames(infl_gbmt)
|
||||||
|
|
||||||
|
infl_gbmt = infl_gbmt[infl_gbmt$`relative influence` >0,]
|
||||||
|
|
||||||
|
plimp <- ggplot(data=infl_gbmt) +
|
||||||
|
geom_segment(size=5, colour='blue') +
|
||||||
|
aes(x=reorder(variable,`relative influence`),
|
||||||
|
xend = variable,
|
||||||
|
y = 2e-6,
|
||||||
|
yend=`relative influence`,
|
||||||
|
label=`relative influence`) +
|
||||||
|
scale_y_log10() +
|
||||||
|
# coord_cartesian(ylim= c(0.8e-6, 1.05)) +
|
||||||
|
ylab("relative influence") + xlab("") +
|
||||||
|
coord_flip() +
|
||||||
|
theme(axis.text.y = element_text(colour="black",size=16,angle=0,face="plain"),
|
||||||
|
axis.text.x = element_text(colour="black",size=16,angle=0,face="plain"),
|
||||||
|
axis.title.x = element_text(colour="black",size=16,angle=0,face="plain"),
|
||||||
|
# panel.background = element_rect(fill = "transparent"), # bg of the panel
|
||||||
|
#plot.background = element_rect(fill = "transparent"), # bg of the plot
|
||||||
|
# panel.grid.major = element_blank(), # get rid of major grid
|
||||||
|
# , panel.grid.minor = element_blank() # get rid of minor grid
|
||||||
|
, legend.background = element_rect(fill = "transparent") # get rid of legend bg
|
||||||
|
, legend.box.background = element_rect(fill = "transparent") # get rid of legend panel bg
|
||||||
|
)
|
||||||
|
|
||||||
|
plimp + coord_trans(limy= c(0.5e-6, 1.05)) + coord_flip()
|
||||||
|
|
||||||
|
# Write the importance plot to disk as EPS and PNG. The plot is passed
# explicitly via `plot =`: the former `plimp + ggsave(...)` form saved
# whatever last_plot() happened to be and then tried to add ggsave()'s
# return value to the plot, which fails on ggplot2 versions where ggsave()
# returns the file name.
ggsave("img/xgbt_importances.eps", plot = plimp, device = 'eps', bg = "transparent",
       width = 8, height = 6, dpi = 300, units = "in")
ggsave("img/xgbt_importances.png", plot = plimp, device = 'png', bg = "transparent",
       width = 8, height = 6, dpi = 300, units = "in")
|
||||||
|
|
||||||
|
|
||||||
|
dffeatures[,"predictions_gbmt"] = predict(gbmt_fit, newdata = dffeatures,
|
||||||
|
n.trees = best_iter_cv,
|
||||||
|
type = "response", na.action = na.pass)
|
||||||
|
|
||||||
|
# GBM-CARET ---------------------------------------------------
|
||||||
|
|
||||||
|
control <- trainControl(method = "cv",
|
||||||
|
number = 10,
|
||||||
|
p =.8,
|
||||||
|
savePredictions = TRUE,
|
||||||
|
classProbs = TRUE,
|
||||||
|
summaryFunction = twoClassSummary)
|
||||||
|
|
||||||
|
tuneGrid <- expand.grid(n.trees = c(80,100,120,140,160),
|
||||||
|
shrinkage=c(0.025, 0.05, 0.1, 0.2),
|
||||||
|
interaction.depth = c(1,2),
|
||||||
|
n.minobsinnode = c(10, 15))
|
||||||
|
|
||||||
|
gbmFit1 <- train(label ~ .,
|
||||||
|
data = dffeatures.labelled.tr_set,
|
||||||
|
method = "gbm",
|
||||||
|
na.action = na.pass,
|
||||||
|
tuneGrid=tuneGrid,
|
||||||
|
## This last option is actually one
|
||||||
|
## for gbm() that passes through
|
||||||
|
metric = "ROC",
|
||||||
|
trControl = control,
|
||||||
|
# importance = TRUE,
|
||||||
|
verbose = FALSE)
|
||||||
|
gbmFit1
|
||||||
|
|
||||||
|
## Feature Importance Plotting ---------------------------------------------
|
||||||
|
|
||||||
|
gbmsmmry <- summary(gbmFit1, normalize=T, plotit=F)
|
||||||
|
|
||||||
|
gbmsmmry <- gbmsmmry[gbmsmmry$rel.inf>0,]
|
||||||
|
|
||||||
|
|
||||||
|
ggplot(data=gbmsmmry) +
|
||||||
|
geom_segment(size=3, colour='red') +
|
||||||
|
aes(x=reorder(var,rel.inf, sum),
|
||||||
|
xend = var,
|
||||||
|
y = 0.002,
|
||||||
|
yend=(rel.inf),
|
||||||
|
label=rel.inf) +
|
||||||
|
scale_y_log10() +
|
||||||
|
ylab("relative influence") + xlab("") +
|
||||||
|
coord_flip()
|
||||||
|
|
||||||
|
saveRDS(gbmFit1, "gbm_ntrees80_interactiondepth2_shrinkage0.2_nminobsinnode15_trainset_2018-02-18.rds")
|
||||||
|
|
||||||
|
dffeatures[,"predictions_gbm"] = predict(gbmFit1, newdata = dffeatures, type = "prob", na.action = na.pass)$special
|
||||||
|
|
||||||
|
# RPART -----------------------------------------------------------------
|
||||||
|
|
||||||
|
tuneGrid <- expand.grid(cp=c(0.0, 0.0125, 0.025, 0.05, 0.1, 0.2))
|
||||||
|
|
||||||
|
rpartFit1 <- train(label ~ ., data = dffeatures.labelled.tr_set,
|
||||||
|
method = "rpart",
|
||||||
|
na.action = na.pass,
|
||||||
|
tuneGrid=tuneGrid,
|
||||||
|
## This last option is actually one
|
||||||
|
## for gbm() that passes through
|
||||||
|
metric = "ROC",
|
||||||
|
trControl = control
|
||||||
|
)
|
||||||
|
varImp(rpartFit1)
|
||||||
|
|
||||||
|
|
||||||
|
predictions.ts_set = predict(rpartFit1,
|
||||||
|
newdata = dffeatures.labelled.ts_set,
|
||||||
|
type='prob', na.action = na.pass)
|
||||||
|
|
||||||
|
dffeatures[,"predictions_rpart"] = predict(rpartFit1, newdata = dffeatures, type = "prob", na.action = na.pass)$special
|
||||||
|
|
||||||
|
# XGB ---------------------------------------------------------------------
|
||||||
|
control <- trainControl(method="cv", number=10)
|
||||||
|
#classProbs = TRUE
|
||||||
|
|
||||||
|
#tuneGrid <- expand.grid(cp=c(0.0, 0.0125, 0.025, 0.05, 0.1, 0.2))
|
||||||
|
xgbFit <- train(label ~ ., data = dffeatures.labelled.tr_set,
|
||||||
|
method = "xgbTree",
|
||||||
|
na.action = na.pass,
|
||||||
|
#tuneGrid=tuneGrid,
|
||||||
|
metric = "Accuracy",
|
||||||
|
trControl = control)
|
||||||
|
|
||||||
|
varImp(xgbFit, scale=T)
|
||||||
|
|
||||||
|
as.data.frame(xgbFit$finalModel$params)
|
||||||
|
|
||||||
|
xgbFit$bestTune
|
||||||
|
|
||||||
|
saveRDS(xgbFit, sprintf("xgbtree_maxdepth1_subsample1_eta0.3_%s.rds", Sys.Date()))
|
||||||
|
|
||||||
|
predictions.ts_set = predict(xgbFit,
|
||||||
|
newdata = dffeatures.labelled.ts_set,
|
||||||
|
type='prob', na.action = na.pass)
|
||||||
|
|
||||||
|
|
||||||
|
## Save all predictions ---------------------------------------------------------
|
||||||
|
|
||||||
|
dffeatures[,"predictions_xgb"] = predict(xgbFit, newdata = dffeatures, type = "prob", na.action = na.pass)$special
|
||||||
|
|
||||||
|
write.table(dffeatures[, c(grep('prediction',colnames(dffeatures), value=T),
|
||||||
|
"ViewModifierCodeMeaning", "ViewCodeValue")],
|
||||||
|
file = "all_predictions_allmodels_trained_on_train.tab", quote=F, sep='\t')
|
||||||
|
|
||||||
@@ -0,0 +1,170 @@
|
|||||||
|
# coding: utf-8
|
||||||
|
############################################################################
|
||||||
|
# stratify by BT column: those are 100% sure digital, others can be either
|
||||||
|
############################################################################
|
||||||
|
rm(list=ls())
|
||||||
|
setwd(dir = "~/repos/mammo/learn_spotmag_from_dicom_headers")
|
||||||
|
#cell#
|
||||||
|
library(caret)
|
||||||
|
library(data.table)
|
||||||
|
|
||||||
|
library(pROC)
|
||||||
|
# install.packages(c("pROC"))
|
||||||
|
library(ggplot2)
|
||||||
|
library(fastmatch)
|
||||||
|
|
||||||
|
#' Read a gzip-compressed delimited table into a plain data.frame.
#'
#' The file is decompressed on the fly by piping it through `zcat`; the
#' resulting stream is parsed by `fread()`.
#'
#' @param filename Path to the gzip-compressed file.
#' @param ... Further arguments forwarded to `fread()` (e.g. `sep`, `select`).
#' @return A `data.frame`. The first line is always read as the header and
#'   `fill = TRUE` is always passed to `fread()`.
read.gz <- function(filename, ...) {
  # Expand "~" ourselves and quote the path so that names containing spaces
  # or shell metacharacters reach zcat as a single argument. Passing the
  # command through `cmd =` keeps fread() from guessing whether the string
  # is a file name or a shell command.
  zcat_cmd <- paste("zcat <", shQuote(path.expand(filename)))
  as.data.frame(fread(cmd = zcat_cmd, header = TRUE, fill = TRUE, ...))
}
|
||||||
|
|
||||||
|
|
||||||
|
fn_ids = "../tables/2017-06-mammo_tables/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl.csv.gz"
|
||||||
|
ids = read.gz(fn_ids, select="id")$id
|
||||||
|
|
||||||
|
fn_features = "../tables/mammo_dicom_headers/df_all_mammos_dicom_headers_selected_nona.tab.gz"
|
||||||
|
dffeatures = read.gz(fn_features, sep='\t')
|
||||||
|
|
||||||
|
# rownames(dffeatures) <- dffeatures$filename
|
||||||
|
print(nrow(dffeatures))
|
||||||
|
print(length(ids))
|
||||||
|
|
||||||
|
dffeatures <- dffeatures[fmatch(unique(ids), dffeatures$filename),]
|
||||||
|
dffeatures <- dffeatures[!is.na(dffeatures$filename),]
|
||||||
|
|
||||||
|
rm(ids)
|
||||||
|
|
||||||
|
# Header fields that are converted to numeric before modelling.
collist = c("BodyPartThickness", "XRayTubeCurrentInuA", "ContentTime",
            "DetectorTemperature", "WindowCenter", "FieldOfViewRotation")
# Convert all listed columns in one assignment instead of a column-by-column
# loop; as.numeric() is applied to each column exactly as before.
dffeatures[collist] <- lapply(dffeatures[collist], as.numeric)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# (head(as.numeric(dffeatures$BodyPartThickness)))
|
||||||
|
dtypes = sapply(dffeatures, class)
|
||||||
|
|
||||||
|
row.names(dffeatures) = dffeatures$filename
|
||||||
|
excludeCols <- c("filename",
|
||||||
|
"CollimatorLeftVerticalEdge",
|
||||||
|
"CollimatorLowerHorizontalEdge",
|
||||||
|
"DistanceSourceToEntrance",
|
||||||
|
"ExposuresOnDetectorSinceLastCalibration",
|
||||||
|
"ExposuresOnDetectorSinceManufactured",
|
||||||
|
"ShutterLowerHorizontalEdge",
|
||||||
|
"ShutterRightVerticalEdge",
|
||||||
|
"XRayTubeCurrentInuA"
|
||||||
|
# "ManufacturerModelName"
|
||||||
|
)
|
||||||
|
dffeatures <- (dffeatures[, !(colnames(dffeatures) %in% excludeCols)])
|
||||||
|
|
||||||
|
|
||||||
|
catcols <- c('ViewModifierCodeMeaning',
|
||||||
|
'ViewCodeValue',
|
||||||
|
'DetectorActiveDimensionsMissing',
|
||||||
|
'FieldOfViewOriginMissing',
|
||||||
|
'Grid',
|
||||||
|
'Manufacturer',
|
||||||
|
'ManufacturerModelName')
|
||||||
|
|
||||||
|
for (cc in catcols){
|
||||||
|
dffeatures[,cc] = paste0("=", dffeatures[,cc])
|
||||||
|
dffeatures[,cc] = as.factor(dffeatures[,cc])
|
||||||
|
}
|
||||||
|
|
||||||
|
dffeatures[,"HighBit"] <- as.numeric(dffeatures[,"HighBit"])
|
||||||
|
|
||||||
|
colSums(sapply(dffeatures, is.na))
|
||||||
|
|
||||||
|
# Read labels ---------------------------------
|
||||||
|
|
||||||
|
fn.labelledset = "../tables/spotmag_predictions/train_test_split-2018-02-15-within7e5.csv"
|
||||||
|
# filelist.labelled = read.table(fn.labelledset, )
|
||||||
|
df.labelled = as.data.frame(fread(fn.labelledset))
|
||||||
|
rownames(df.labelled) <- df.labelled$id
|
||||||
|
vec.labelled = df.labelled$id
|
||||||
|
df.labelled$label <- as.factor(df.labelled$label)
|
||||||
|
|
||||||
|
#cell#
|
||||||
|
|
||||||
|
vec.labelled.valset = rownames(df.labelled[df.labelled$set == 'val',])
|
||||||
|
vec.labelled.tr_set = rownames(df.labelled[df.labelled$set == 'train',])
|
||||||
|
vec.labelled.ts_set = rownames(df.labelled[df.labelled$set == 'test',])
|
||||||
|
############################################################
|
||||||
|
dffeatures.labelled <- dffeatures[vec.labelled,]
|
||||||
|
dffeatures.labelled$label <- df.labelled$label
|
||||||
|
|
||||||
|
dffeatures.labelled.devset <- dffeatures.labelled[!(rownames(dffeatures.labelled) %in% vec.labelled.valset),]
|
||||||
|
dffeatures.labelled.tr_set <- dffeatures.labelled[vec.labelled.tr_set,]
|
||||||
|
dffeatures.labelled.ts_set <- dffeatures.labelled[vec.labelled.ts_set,]
|
||||||
|
|
||||||
|
table(dffeatures.labelled.tr_set$label)
|
||||||
|
|
||||||
|
|
||||||
|
goodrows <- 1 - colSums(sapply(dffeatures.labelled.tr_set, is.na)) / nrow(dffeatures.labelled.tr_set)
|
||||||
|
|
||||||
|
names(goodrows[goodrows<0.1])
|
||||||
|
|
||||||
|
|
||||||
|
for (cc in colnames(dffeatures.labelled.tr_set)){
|
||||||
|
if (is.factor(dffeatures.labelled.tr_set[,cc]) ){
|
||||||
|
setdiff_ = setdiff(dffeatures.labelled.ts_set[,cc], dffeatures.labelled.tr_set[,cc])
|
||||||
|
if (length(setdiff_)>0){
|
||||||
|
print(cc)
|
||||||
|
print(setdiff_)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# GLMNET ---------------------------------------------------------------------
|
||||||
|
|
||||||
|
library(glmnet)
|
||||||
|
# Using glmnet to directly perform CV
|
||||||
|
set.seed(0)
|
||||||
|
|
||||||
|
x_train <- model.matrix( ~ .-1, dffeatures.labelled.tr_set[,!(colnames(dffeatures.labelled.tr_set) %in% c("label"))])
|
||||||
|
dim(x_train)
|
||||||
|
|
||||||
|
cvob1=cv.glmnet(x=x_train,
|
||||||
|
y=dffeatures.labelled.tr_set[,"label"],
|
||||||
|
family="binomial",alpha=1,
|
||||||
|
type.measure="auc", nfolds = 5, lambda = seq(0.001,0.1,by = 0.001),
|
||||||
|
standardize=FALSE)
|
||||||
|
plot(cvob1)
|
||||||
|
|
||||||
|
control <- trainControl(method="cv", number=5, returnResamp="all",
|
||||||
|
classProbs=TRUE, summaryFunction=twoClassSummary)
|
||||||
|
#classProbs = TRUE
|
||||||
|
|
||||||
|
tuneGrid <- expand.grid(alpha=c(0.00, 0.25, 0.50, 0.75, 0.99, 1.00), lambda = 10^seq(-5,-2,0.5))
|
||||||
|
tune = list()
|
||||||
|
fits = list()
|
||||||
|
rocs = list()
|
||||||
|
for (ii in 1:5){
|
||||||
|
glmnetFit <- train(label ~ ., data = dffeatures.labelled.tr_set,
|
||||||
|
method = "glmnet",
|
||||||
|
na.action = na.pass,
|
||||||
|
tuneGrid=tuneGrid,
|
||||||
|
metric = "ROC",
|
||||||
|
trControl = control)
|
||||||
|
fits[[ii]] <- glmnetFit
|
||||||
|
tune[[ii]] <- glmnetFit$bestTune
|
||||||
|
rocs[[ii]] <- max(glmnetFit$results$ROC)
|
||||||
|
}
|
||||||
|
|
||||||
|
tune
|
||||||
|
|
||||||
|
varImp(glmnetFit, scale=T)
|
||||||
|
as.data.frame(glmnetFit$bestTune)
|
||||||
|
|
||||||
|
# Persist the last fitted model with the run date in the file name. The
# format string previously had no conversion for the Sys.Date() argument,
# so the date was silently dropped and every run overwrote "glmnet.rds".
saveRDS(glmnetFit, sprintf("glmnet_%s.rds", Sys.Date()))
|
||||||
|
|
||||||
|
## Save predictions ---------------------------------------------------------
|
||||||
|
|
||||||
|
dffeatures[,"predictions_glmnet"] = predict(glmnetFit, newdata = dffeatures, type = "prob", na.action = na.pass)$special
|
||||||
|
|
||||||
|
write.table(dffeatures[,c("predictions_glmnet"), drop=F],
|
||||||
|
file="all_predictions_glmnet.tab", quote=F, sep='\t')
|
||||||
|
|
||||||
@@ -0,0 +1,763 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/Applications/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
|
||||||
|
" return f(*args, **kwds)\n",
|
||||||
|
"/Applications/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
|
||||||
|
" return f(*args, **kwds)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import os"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Read labels"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"tabledir = \"../tables/\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"(772423, 1)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"fn = f\"{tabledir}/2017-06-mammo_tables/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl.csv.gz\"\n",
|
||||||
|
"df_bt = pd.read_csv(fn, usecols=[\"id\", \"BT_case\"])\n",
|
||||||
|
"df_bt.set_index(\"id\", inplace=True)\n",
|
||||||
|
"df_bt = ~df_bt.isnull()\n",
|
||||||
|
"df_bt.columns = [\"digital\"]\n",
|
||||||
|
"df_bt.shape"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style>\n",
|
||||||
|
" .dataframe thead tr:only-child th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: left;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>set</th>\n",
|
||||||
|
" <th>label</th>\n",
|
||||||
|
" <th>view</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>id</th>\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1013372709_1.2.840.113654.2.70.1.175625299786291545159233542096043464711_3_1</th>\n",
|
||||||
|
" <td>test</td>\n",
|
||||||
|
" <td>normal</td>\n",
|
||||||
|
" <td>N</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1028995243_1.2.840.113654.2.70.1.56947963181878834591544466761404805157_45576_2</th>\n",
|
||||||
|
" <td>test</td>\n",
|
||||||
|
" <td>normal</td>\n",
|
||||||
|
" <td>N</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1105112884_1.2.840.113654.2.70.1.178729598744204462442695104630823323474_8905_2</th>\n",
|
||||||
|
" <td>test</td>\n",
|
||||||
|
" <td>normal</td>\n",
|
||||||
|
" <td>N</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1185125156_1.2.840.113654.2.70.1.45840593750642722243371816041014016032_2_4</th>\n",
|
||||||
|
" <td>test</td>\n",
|
||||||
|
" <td>normal</td>\n",
|
||||||
|
" <td>N</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1496452586_1.2.840.113654.2.70.1.5582568668770891599992528318631583880_1351_4</th>\n",
|
||||||
|
" <td>test</td>\n",
|
||||||
|
" <td>normal</td>\n",
|
||||||
|
" <td>N</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" set label view\n",
|
||||||
|
"id \n",
|
||||||
|
"1013372709_1.2.840.113654.2.70.1.17562529978629... test normal N\n",
|
||||||
|
"1028995243_1.2.840.113654.2.70.1.56947963181878... test normal N\n",
|
||||||
|
"1105112884_1.2.840.113654.2.70.1.17872959874420... test normal N\n",
|
||||||
|
"1185125156_1.2.840.113654.2.70.1.45840593750642... test normal N\n",
|
||||||
|
"1496452586_1.2.840.113654.2.70.1.55825686687708... test normal N"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"infile = f\"{tabledir}/spotmag_predictions/train_test_split-2018-02-16-within7e5-label.csv\"\n",
|
||||||
|
"dflab = pd.read_csv(infile, index_col='id')\n",
|
||||||
|
"dflab[:5]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Read header-based predictions"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"(772367, 1)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"infile = f\"{tabledir}/spotmag_predictions/all_predictions_glmnet.tab\"\n",
|
||||||
|
"dfpred_glmnet = pd.read_table(infile, index_col=0)\n",
|
||||||
|
"dfpred_glmnet.columns = [cc.replace(\"predictions\", \"score\") for cc in dfpred_glmnet.columns]\n",
|
||||||
|
"dfpred_glmnet.shape"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"(772367, 5)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style>\n",
|
||||||
|
" .dataframe thead tr:only-child th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: left;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>score_gbm</th>\n",
|
||||||
|
" <th>score_xgb</th>\n",
|
||||||
|
" <th>score_rpart</th>\n",
|
||||||
|
" <th>score_xgbt</th>\n",
|
||||||
|
" <th>ViewModifierCodeMeaning</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>id</th>\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149405_2104556</th>\n",
|
||||||
|
" <td>0.009005</td>\n",
|
||||||
|
" <td>0.020207</td>\n",
|
||||||
|
" <td>0.006882</td>\n",
|
||||||
|
" <td>0.059474</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149405_2104557</th>\n",
|
||||||
|
" <td>0.013337</td>\n",
|
||||||
|
" <td>0.016762</td>\n",
|
||||||
|
" <td>0.006882</td>\n",
|
||||||
|
" <td>0.059660</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149484_2141538</th>\n",
|
||||||
|
" <td>0.013337</td>\n",
|
||||||
|
" <td>0.016762</td>\n",
|
||||||
|
" <td>0.006882</td>\n",
|
||||||
|
" <td>0.061051</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149484_2141537</th>\n",
|
||||||
|
" <td>0.013337</td>\n",
|
||||||
|
" <td>0.016762</td>\n",
|
||||||
|
" <td>0.006882</td>\n",
|
||||||
|
" <td>0.061051</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3337971863_1.2.840.113654.2.70.1.337982194343327746313656933304494759333_1_1</th>\n",
|
||||||
|
" <td>0.031560</td>\n",
|
||||||
|
" <td>0.059142</td>\n",
|
||||||
|
" <td>0.006882</td>\n",
|
||||||
|
" <td>0.157488</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" score_gbm score_xgb \\\n",
|
||||||
|
"id \n",
|
||||||
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.009005 0.020207 \n",
|
||||||
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.013337 0.016762 \n",
|
||||||
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.013337 0.016762 \n",
|
||||||
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.013337 0.016762 \n",
|
||||||
|
"3337971863_1.2.840.113654.2.70.1.33798219434332... 0.031560 0.059142 \n",
|
||||||
|
"\n",
|
||||||
|
" score_rpart score_xgbt \\\n",
|
||||||
|
"id \n",
|
||||||
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.006882 0.059474 \n",
|
||||||
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.006882 0.059660 \n",
|
||||||
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.006882 0.061051 \n",
|
||||||
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... 0.006882 0.061051 \n",
|
||||||
|
"3337971863_1.2.840.113654.2.70.1.33798219434332... 0.006882 0.157488 \n",
|
||||||
|
"\n",
|
||||||
|
" ViewModifierCodeMeaning \n",
|
||||||
|
"id \n",
|
||||||
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... NaN \n",
|
||||||
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... NaN \n",
|
||||||
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... NaN \n",
|
||||||
|
"2454166001_1.2.840.113654.2.70.1.26994792635520... NaN \n",
|
||||||
|
"3337971863_1.2.840.113654.2.70.1.33798219434332... NaN "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"infile = f\"{tabledir}/spotmag_predictions/all_predictions_allmodels_trained_on_train.tab\"\n",
|
||||||
|
"dfpred = pd.read_table(infile, index_col=0)\n",
|
||||||
|
"dfpred.columns = [cc.replace(\"predictions\", \"score\") for cc in dfpred.columns]\n",
|
||||||
|
"dfpred.index.name = 'id'\n",
|
||||||
|
"print(dfpred.shape)\n",
|
||||||
|
"dfpred[:5]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"(772367, 8)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"if 'set' not in dfpred.columns:\n",
|
||||||
|
" dfpred = dfpred.merge(dflab, left_index=True, right_index=True, how='left')\n",
|
||||||
|
" print(dfpred.shape)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"normal 3526\n",
|
||||||
|
"magn/spot 572\n",
|
||||||
|
"wire loc 57\n",
|
||||||
|
"stereotactic 25\n",
|
||||||
|
"other 9\n",
|
||||||
|
"Name: view, dtype: int64"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"colmap = {\"N\":\"normal\", \"M\": \"magn/spot\",\n",
|
||||||
|
" \"T\":\"stereotactic\", \"W\":\"wire loc\", \"X\":\"other\"}\n",
|
||||||
|
"view_counts = dfpred[~dfpred.view.isnull()].view.map(lambda x: colmap[x]).value_counts()\n",
|
||||||
|
"view_counts"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style>\n",
|
||||||
|
" .dataframe thead tr:only-child th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: left;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th>set</th>\n",
|
||||||
|
" <th>train</th>\n",
|
||||||
|
" <th>test</th>\n",
|
||||||
|
" <th>val</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>view</th>\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>magn/spot</th>\n",
|
||||||
|
" <td>380</td>\n",
|
||||||
|
" <td>96</td>\n",
|
||||||
|
" <td>96</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>normal</th>\n",
|
||||||
|
" <td>2310</td>\n",
|
||||||
|
" <td>612</td>\n",
|
||||||
|
" <td>604</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>other</th>\n",
|
||||||
|
" <td>4</td>\n",
|
||||||
|
" <td>3</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>stereotactic</th>\n",
|
||||||
|
" <td>17</td>\n",
|
||||||
|
" <td>4</td>\n",
|
||||||
|
" <td>4</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>wire loc</th>\n",
|
||||||
|
" <td>37</td>\n",
|
||||||
|
" <td>11</td>\n",
|
||||||
|
" <td>9</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"set train test val\n",
|
||||||
|
"view \n",
|
||||||
|
"magn/spot 380 96 96\n",
|
||||||
|
"normal 2310 612 604\n",
|
||||||
|
"other 4 3 2\n",
|
||||||
|
"stereotactic 17 4 4\n",
|
||||||
|
"wire loc 37 11 9"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"pd.crosstab(dfpred[~dfpred.view.isnull()].view.map(lambda x: colmap[x]), dfpred.set)[[\"train\", \"test\", \"val\"]]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Read image-based predictions (general)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"../tables//spotmag_predictions/predictions_images_4189-epoch55-e5ce2d69b035975cb5336cec0da9a32a.csv\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"Index(['score_image', 'score_image_max'], dtype='object')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"tag = \"e5ce2d69b035975cb5336cec0da9a32a\"\n",
|
||||||
|
"epoch = 55\n",
|
||||||
|
"infile = f\"{tabledir}/spotmag_predictions/predictions_images_4189-epoch{epoch}-{tag}.csv\"\n",
|
||||||
|
"# infile = f\"{tabledir}/spotmag_predictions/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl-spotmag_img_prediction-{tag}.csv\"\n",
|
||||||
|
"print(infile)\n",
|
||||||
|
"dfpred_img = pd.read_csv(infile, index_col=0)\n",
|
||||||
|
"dfpred_img = dfpred_img[['score_image', 'score_image_max']]\n",
|
||||||
|
"dfpred_img = dfpred_img.groupby(level=0).mean()\n",
|
||||||
|
"dfpred_img.columns"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Read image-based predictions (wire localization)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"infile = f\"{tabledir}/spotmag_predictions/predictions_wire_combined_e8e71fc090141d7c6fb334359152d295.csv\"\n",
|
||||||
|
"\n",
|
||||||
|
"dfpred_imgwire = pd.read_csv(infile, index_col=0)\n",
|
||||||
|
"dfpred_imgwire[\"score_wire_max\"] = 1-dfpred_imgwire[[\"scores_0_or\",\"scores_0_fl\"]].min(1)\n",
|
||||||
|
"dfpred_imgwire = dfpred_imgwire.drop([\"scores_0_or\",\"scores_0_fl\", \"label\"], axis=1)\n",
|
||||||
|
"dfpred_imgwire.columns = [cc.replace(\"scores\", \"score_wire\") for cc in dfpred_imgwire.columns]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"(772367, 13)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"if 'score_image' not in dfpred.columns:\n",
|
||||||
|
" dfpred = pd.concat([dfpred, dfpred_img], axis=1)\n",
|
||||||
|
" dfpred.index.name = 'id'\n",
|
||||||
|
" del dfpred_img\n",
|
||||||
|
" \n",
|
||||||
|
"if 'score_glmnet' not in dfpred.columns:\n",
|
||||||
|
" dfpred = pd.concat([dfpred, dfpred_glmnet], axis=1)\n",
|
||||||
|
" dfpred.index.name = 'id'\n",
|
||||||
|
" del dfpred_glmnet\n",
|
||||||
|
" \n",
|
||||||
|
"if 'score_wire' not in dfpred.columns:\n",
|
||||||
|
" dfpred = pd.concat([dfpred, dfpred_imgwire], axis=1)\n",
|
||||||
|
" dfpred.index.name = 'id'\n",
|
||||||
|
" del dfpred_imgwire\n",
|
||||||
|
"\n",
|
||||||
|
"dfpred.shape"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"if 'label' not in dfpred.columns:\n",
|
||||||
|
" dfpred = pd.concat([dfpred, dflab], axis=1)\n",
|
||||||
|
"if 'digital' not in dfpred.columns:\n",
|
||||||
|
" dfpred = pd.concat([dfpred, df_bt], axis=1)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style>\n",
|
||||||
|
" .dataframe thead tr:only-child th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: left;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th>score_image</th>\n",
|
||||||
|
" <th>False</th>\n",
|
||||||
|
" <th>True</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>score_wire</th>\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>False</th>\n",
|
||||||
|
" <td>3584</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>True</th>\n",
|
||||||
|
" <td>605</td>\n",
|
||||||
|
" <td>768234</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"score_image False True \n",
|
||||||
|
"score_wire \n",
|
||||||
|
"False 3584 0\n",
|
||||||
|
"True 605 768234"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"pd.crosstab(dfpred[\"score_wire\"].isnull(), dfpred[\"score_image\"].isnull())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"dfpred.rename(columns={\"score_xgbt\":\"score_gbmt\"}, inplace=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Add ensembled (max, avg) scores"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 24,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"dfpred['score_wire'] = dfpred['score_wire'].fillna(0)\n",
|
||||||
|
"dfpred['score_wire_max'] = dfpred['score_wire_max'].fillna(0)\n",
|
||||||
|
"dfpred['score_image+glmnet'] = (dfpred['score_image'] + dfpred['score_glmnet'])/2\n",
|
||||||
|
"dfpred['score_image+gbmt'] = (dfpred['score_image'] + dfpred['score_gbmt'])/2\n",
|
||||||
|
"\n",
|
||||||
|
"dfpred['score_max(image;gbmt)'] = dfpred[['score_image','score_gbmt']].max(1)\n",
|
||||||
|
"\n",
|
||||||
|
"dfpred['score_image*glmnet'] = np.sqrt(dfpred['score_image'] * dfpred['score_glmnet'])\n",
|
||||||
|
"dfpred['score_image*gbmt'] = np.sqrt(dfpred['score_image'] * dfpred['score_gbmt'])\n",
|
||||||
|
"dfpred['score_max_image_wire'] = np.nanmax(dfpred[['score_image','score_wire']].values, axis=1)\n",
|
||||||
|
"dfpred['score_max_image_wire_max'] = np.nanmax(dfpred[['score_image','score_wire_max']].values, axis=1)\n",
|
||||||
|
"# dfpred['score_wire'].isnull()\n",
|
||||||
|
"dfpred['score_max_image_wire+gbmt'] =(dfpred['score_max_image_wire'] + dfpred['score_gbmt'])/2\n",
|
||||||
|
"\n",
|
||||||
|
"dfpred['score_max_image_wire_max+gbmt'] =(dfpred['score_max_image_wire_max'] + dfpred['score_gbmt'])/2\n",
|
||||||
|
"\n",
|
||||||
|
"dfpred['score_max(image;wire_max;gbmt)'] = dfpred[['score_wire_max','score_gbmt', 'score_image']].max(1)\n",
|
||||||
|
"\n",
|
||||||
|
"dfpred['score_max_wire_image+gbmt'] = np.nanmax(dfpred[['score_image+gbmt','score_wire']].values, axis=1)\n",
|
||||||
|
"\n",
|
||||||
|
"dfpred['score_max_wire_max_image+gbmt'] = np.nanmax(dfpred[['score_image+gbmt','score_wire_max']].values, axis=1)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 25,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"dfpred.rename(columns={\"ViewModifierCodeMeaning\":\"ViewModifier\"}, inplace=True)\n",
|
||||||
|
"dfpred.index.name = 'id'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Save the combined table"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 23,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"772423"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 23,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"len(dfpred)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 26,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"dfpred.to_csv(f'{tabledir}/all_predictions_with_images-{tag}.tab', sep='\\t')"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python [default]",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.3"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@@ -0,0 +1,98 @@
|
|||||||
|
# coding: utf-8
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import dicom
|
||||||
|
from warnings import warn
|
||||||
|
|
||||||
|
def get_tuples(plan, outlist = None, key = ""):
    """Flatten a DICOM dataset into a list of ``(name, value)`` tuples.

    Parameters
    ----------
    plan : dataset object exposing ``dir()`` and attribute access
        (as returned by ``dicom.read_file``).
    outlist : list or None
        List to append to; a fresh list is used when this is ``None`` or empty.
    key : str
        Name prefix for every emitted field; a non-empty prefix gets a
        trailing underscore appended before use.

    Returns
    -------
    list of (str, object) tuples. ``PixelData`` is always skipped.
    Sequence elements are expanded recursively, one sub-dataset at a time,
    with the item position and the sequence name folded into the prefix.
    """
    prefix = key + "_" if len(key) > 0 else key
    collected = outlist if outlist else []

    for attr in plan.dir():
        if attr == 'PixelData' or not hasattr(plan, attr):
            continue
        value = getattr(plan, attr)
        vtype = type(value)

        if vtype is dicom.sequence.Sequence:
            # Recurse into every item of the sequence; the item index goes
            # in front of the sequence name.
            for position, item in enumerate(list(value)):
                parts = ["%d" % position, attr]
                if len(prefix):
                    parts.insert(0, prefix)
                collected.extend(
                    get_tuples(item, outlist = None, key = "_".join(parts)))
            continue

        # Convert DICOM value wrappers to plain Python types.
        if vtype is dicom.valuerep.DSfloat:
            value = float(value)
        elif vtype is dicom.valuerep.IS:
            value = str(value)
        elif vtype is dicom.valuerep.MultiValue:
            value = tuple(value)
        elif vtype is dicom.UID.UID:
            value = str(value)
        collected.append((prefix + attr, value))

    return collected
|
||||||
|
|
||||||
|
|
||||||
|
def filter_row_common_field(row, common_fields):
    """Drop every key of ``row`` that is not listed in ``common_fields``.

    ``row`` is modified in place and also returned.
    """
    unwanted = [name for name in list(row.keys()) if name not in common_fields]
    for name in unwanted:
        row.pop(name)
    return row
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
fn_allheaders = '/home/dlituiev/data_dlituiev/manuallabeller/filelist/dicom_headers_all_fields_filelist_nonscreening_4000_seed42.csv'
|
||||||
|
|
||||||
|
df_allheaders = pd.read_csv(fn_allheaders, index_col=0)
|
||||||
|
|
||||||
|
|
||||||
|
"at least 5% of rows are there"
|
||||||
|
thr = 0.05
|
||||||
|
valid_fields = (~df_allheaders.isnull()).mean() > thr
|
||||||
|
valid_fields = valid_fields[valid_fields].index.tolist()
|
||||||
|
print(len(valid_fields))
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Names of the header fields to export (read from a headerless one-column table).
valid_fields = pd.read_table(
    "/data/dlituiev/learn_spotmag_from_dicom_headers/LogisticRegression_common_fields_names.tab",
    header=None,
    squeeze=True).values

#filelist_fn = '/home/dlituiev/data_dlituiev/tables/df_newest_mammos.pickle'
filelist_fn = "/home/dlituiev/data_dlituiev/tables/2017-06-mammo_tables/df_original_mammos.pickle"
# Unique DICOM file paths taken from the "Filename" column of the pickled table.
filelist = pd.read_pickle(filelist_fn)["Filename"].unique().tolist()

BUFFER_N_LINES = 100
SEP = '\t'
outpath = filelist_fn.replace('.pickle','') + '_dicom_headers_selected.tab'
final_columns = ['filename'] + list(valid_fields)
print("len(final_columns)", len(final_columns) )
print('saving to %s' % outpath)


def _flush_rows(rows, fh):
    """Append the buffered rows to the open output file and flush it."""
    pd.DataFrame(rows, columns=final_columns).to_csv(
        fh, sep=SEP, header=None, index=None, mode = 'a')
    fh.flush()


with open(outpath, 'w+') as outfh:
    # Header line first; data rows are appended in batches below.
    outfh.write(SEP.join(final_columns) + '\n')
    headerlist = []
    for nn, ff in enumerate(filelist):
        # Every BUFFER_N_LINES-th file: write out what has been buffered so far.
        if (nn + 1) % BUFFER_N_LINES == 0:
            _flush_rows(headerlist, outfh)
            print(nn+1)
            headerlist = []
        try:
            fields = dict(get_tuples(dicom.read_file(ff)))
            # Fields absent from this file's header become NaN.
            row = tuple([ff] + [fields.get(kk, np.nan) for kk in valid_fields])
            print("len(row)", len(row))
            headerlist.append(row)
        except Exception as ex:
            # Best effort: report the failing file and continue with the next.
            warn('header extraction failed on #\t%s\t%s\t%s' % (nn, ff, ex))
    # in the end, print the rest:
    _flush_rows(headerlist, outfh)

print("DONE")
|
||||||
@@ -0,0 +1,798 @@
|
|||||||
|
|
||||||
|
# coding: utf-8
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import os
|
||||||
|
from functools import partial
|
||||||
|
from itertools import chain
|
||||||
|
|
||||||
|
def entropy(x):
    """Return the sum of ``count * log2(count)`` over the distinct values of ``x``.

    Counts come from ``x.value_counts()``. Note that this is computed on raw
    counts, not on normalised frequencies.
    """
    counts = x.value_counts()
    # f.loc["nan"] = x.isnull().sum()
    weighted = counts * counts.map(np.log2)
    return weighted.sum()
|
||||||
|
|
||||||
|
|
||||||
|
def select_text_fields(df_allheaders):
    """Return the names of the text (object-dtype) columns worth keeping.

    A column is kept when all of the following hold:
      * its dtype is ``object``;
      * more than 5% of its rows are non-null;
      * its ``entropy`` score is at least 1000;
      * it has more than one unique value;
      * its number of unique values does not exceed 0.75 times the number
        of columns of ``df_allheaders``.

    Parameters
    ----------
    df_allheaders : pandas.DataFrame

    Returns
    -------
    list of str, in the original column order.
    """
    # `pd.np` no longer exists in current pandas; compare against numpy's
    # object dtype directly.
    object_dtype = np.dtype(object)
    is_text = df_allheaders.dtypes.map(lambda x: x == object_dtype)
    text_fields = is_text[is_text].index.tolist()

    filled = (~df_allheaders[text_fields].isnull()).mean() > 0.05
    text_fields = filled[filled].index.tolist()

    # NOTE(review): the threshold uses shape[1] (number of columns); the
    # number of rows (shape[0]) may have been intended -- confirm.
    max_unique = 0.75 * df_allheaders.shape[1]

    kept = []
    for tt in text_fields:
        numunique = len(df_allheaders[tt].unique())
        entr = entropy(df_allheaders[tt])
        # The three tests are independent boolean conditions. The previous
        # `entr<1000 | (a) | (b)` parsed as `entr < (1000 | a | b)` because
        # `|` binds tighter than `<`, so the cardinality tests were ignored.
        if (entr < 1000) or (numunique == 1) or (numunique > max_unique):
            continue
        kept.append(tt)
    return kept
|
||||||
|
|
||||||
|
|
||||||
|
def get_good_numeric_fields(df_allheaders, thr_stderr = 1e-6):
    """List the columns whose ``std / mean`` ratio exceeds ``thr_stderr``.

    Parameters
    ----------
    df_allheaders : pandas.DataFrame
    thr_stderr : float
        Strict lower bound on the per-column ratio of standard deviation to mean.

    Returns
    -------
    list of column names passing the threshold.
    """
    relative_spread = df_allheaders.std() / df_allheaders.mean()
    passing = relative_spread > thr_stderr
    return relative_spread[passing].index.tolist()
|
||||||
|
|
||||||
|
|
||||||
|
def get_index_from_int_tuple(x, ind):
    """Pick element ``ind`` from a stringified tuple/list and return it as int.

    Parameters
    ----------
    x : object
        When ``x`` is a ``str`` it is parsed as a Python literal
        (e.g. ``"(3, 4)"`` or ``"['2.0', '5']"``). Any other value is
        returned unchanged.
    ind : int
        Position of the element to extract.

    Returns
    -------
    ``int(float(element))`` for string input, otherwise ``x`` itself.

    Raises
    ------
    ValueError, SyntaxError
        If the string is not a valid Python literal.
    """
    import ast

    if type(x) is not str:
        return x
    # The text originates from file headers, so it is parsed with
    # ast.literal_eval (literals only) rather than eval, which would execute
    # arbitrary expressions.
    parsed = ast.literal_eval(x)
    return int(float(parsed[ind]))
|
||||||
|
|
||||||
|
|
||||||
|
def clean_up_field_list(field_list,
                        prefices_remove = ["date", "accession", "number",
                                           "Filename",
                                           "ImageLaterality",
                                           "GantryID",
                                           #"0_ViewCodeSequence_CodeMeaning",
                                           "ViewCodeSequence_CodeMeaning",
                                           "ViewModifierCodeSequence_CodeValue",
                                           "EthnicGroup",
                                           "BodyPartExamined",
                                           "LossyImageCompression",
                                           "DeidentificationMethodCodeSequence",
                                           "UID",
                                           'EntranceDoseInmGy',
                                           'ProcedureCodeSequence_CodeMeaning',
                                           'CommentsOnRadiationDose',
                                           'DetectorID',
                                           'SeriesDescription', # potentially informative but too many values
                                           'SoftwareVersions',
                                           'PatientAge',
                                           ],
                        fields_remove = [ 'PatientID', 'PatientName', "BitsStored",
                                          'AcquisitionTime',
                                          'AdmittingTime',
                                          'ScheduledStudyStartTime',
                                          'InstanceCreationTime',
                                          'PerformedProcedureStepStartTime',
                                          'PregnancyStatus',
                                          'StudyArrivalTime',
                                          'StudyCompletionTime',
                                          'StudyTime',
                                          'TimeOfLastCalibration',
                                          'TimeOfLastDetectorCalibration',
                                          'TimeOfSecondaryCapture',]):
    """Remove unwanted field names from ``field_list``.

    A field is removed when it is listed in ``fields_remove`` or when any
    entry of ``prefices_remove`` occurs (case-insensitively, anywhere in the
    name) inside it.

    Parameters
    ----------
    field_list : list of str
        Modified in place and returned.
    prefices_remove : list of str
        Substrings that mark a field for removal.
    fields_remove : list of str
        Exact field names to remove. This list is not modified.

    Returns
    -------
    The same ``field_list`` object with the unwanted names removed.
    A name scheduled for removal but absent from ``field_list`` is reported
    with ``print`` and otherwise ignored.
    """
    # Work on a private copy: appending to the argument itself would grow the
    # shared default list on every call and would mutate a caller-supplied list.
    fields_remove = list(fields_remove)
    prefices_remove = [x.lower() for x in prefices_remove]

    for ff in field_list:
        if ff in fields_remove:
            continue
        ff_lower = ff.lower()
        if any(pp in ff_lower for pp in prefices_remove):
            fields_remove.append(ff)

    for ff in fields_remove:
        try:
            field_list.remove(ff)
        except ValueError as ve:
            print(ff, ve)
    return field_list
|
||||||
|
|
||||||
|
|
||||||
|
def make_lowercase_text_fields(df_allheaders):
    """Lower-case every object-dtype column except the first one.

    The first column is skipped; the remaining columns whose dtype is
    ``object`` are replaced by their ``.str.lower()`` version. The frame is
    modified in place and returned.
    """
    object_dtype = np.dtype(object)
    for cname in df_allheaders.columns[1:]:
        if df_allheaders[cname].dtype is object_dtype:
            df_allheaders[cname] = df_allheaders[cname].str.lower()
    return df_allheaders
|
||||||
|
|
||||||
|
|
||||||
|
def format_PixelSpacing(x):
    """Reduce a stringified pixel-spacing tuple to a single float.

    A plain ``float`` is returned unchanged. Anything else is treated as a
    string such as ``"('0.07', '0.07')"``: parentheses, quotes and spaces are
    stripped, the comma-separated parts are converted to floats, and the
    smallest distinct value is returned.
    """
    if type(x) is float:
        return x
    cleaned = x.lstrip("(").rstrip(")").replace("'", "").replace(" ","")
    numbers = tuple(float(piece) for piece in cleaned.split(","))
    # np.unique returns the sorted distinct values; take the first one.
    return np.unique(numbers)[0]
|
||||||
|
|
||||||
|
def parse_float(x):
    """Clean up the text form of a value without converting it to a number.

    ``x`` is converted with ``str``; single quotes and every letter ``b`` are
    removed, and ``None`` becomes ``nan``. An empty result is returned as
    ``np.nan``; otherwise the cleaned string is returned.
    """
    text = str(x)
    for old, new in (("'", ""), ("b", ""), ("None", "nan")):
        text = text.replace(old, new)
    return np.nan if text == "" else text
|
||||||
|
|
||||||
|
def parse_float_tuples(x, to_int=False):
    """Extract the numbers embedded in the text form of ``x``.

    Every character that is neither a digit nor ``.`` acts as a separator;
    the remaining non-empty pieces are converted to numbers.

    Parameters
    ----------
    x : object
        Converted with ``str`` before parsing.
    to_int : bool
        When true each piece is converted with ``int(float(piece))``,
        otherwise with ``float(piece)``.

    Returns
    -------
    tuple of numbers (empty when no digits are present).
    """
    masked = "".join(
        ch if (ch.isdigit() or ch == '.') else ';' for ch in str(x))
    pieces = [piece for piece in masked.split(';') if len(piece)]
    if to_int:
        return tuple(int(float(piece)) for piece in pieces)
    return tuple(float(piece) for piece in pieces)
|
||||||
|
|
||||||
|
def parse_float_tuples_prod(x):
    """Return the product of the numbers embedded in the text form of ``x``.

    Parameters
    ----------
    x : object
        Typically a stringified tuple such as ``"(24, 30)"``.

    Returns
    -------
    ``np.nan`` when ``x`` is ``None``, a float NaN, or an empty container or
    string; otherwise ``np.prod`` of the numbers found by
    ``parse_float_tuples(str(x))``.
    """
    if x is None:
        return np.nan
    # Detect NaN by value: a NaN that is not the np.nan singleton previously
    # slipped through the membership test and crashed on len().
    if isinstance(x, float) and np.isnan(x):
        return np.nan
    # Only sized objects can be empty; plain numbers have no len().
    if hasattr(x, "__len__") and len(x) == 0:
        return np.nan
    return np.prod(parse_float_tuples(str(x)))
|
||||||
|
|
||||||
|
def parse_int_tuples_median(x):
    """Return the median of the numbers embedded in the text form of ``x``.

    The numbers are extracted as floats by ``parse_float_tuples``.
    """
    return np.median(parse_float_tuples(x))
|
||||||
|
"""
|
||||||
|
def parse_float_tuples(x):
|
||||||
|
x = eval(x) if type(x) is str else x
|
||||||
|
if type(x) in [tuple, list]:
|
||||||
|
x = tuple([float(y) for y in x])
|
||||||
|
return x
|
||||||
|
"""
|
||||||
|
|
||||||
|
def parse_str_tuples(x):
    """Turn a stringified tuple/list into the corresponding Python object.

    Parameters
    ----------
    x : object
        Non-string values are returned unchanged. A string is parsed as a
        Python literal; when that fails it is split on single spaces.

    Returns
    -------
    The parsed literal, or a tuple of the space-separated pieces of ``x``,
    or ``x`` itself for non-string input.
    """
    import ast

    if type(x) is not str:
        return x
    try:
        # Literal parsing only: unlike eval, a bare word such as "len" is
        # rejected here and handled by the fallback below, and no code from
        # the input is ever executed.
        return ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return tuple(x.split(" "))
|
||||||
|
#############################33
|
||||||
|
def extract_list_text_field(df_allheaders, colprefix = "ViewModifierCodeSequence_CodeMeaning"):
    """Replace a family of text columns by one boolean column per observed value.

    The columns concerned are those whose name contains ``colprefix`` without
    being exactly ``colprefix``. Every distinct non-null value found in them
    (``True``/``False`` excluded) yields a boolean column named
    ``colprefix + "_" + value`` with spaces removed from the value. A row is
    ``True`` when the value occurs as a substring of the string stored in any
    of the source columns for that row.

    The source columns are dropped from ``df_allheaders`` in place; the
    returned frame is the remaining columns followed by the new boolean ones.
    """
    allcols = df_allheaders.columns
    selector = np.asarray(
        allcols.map(lambda name: colprefix in name and name != colprefix),
        dtype=bool)
    cols = allcols[selector]

    # Distinct values seen across all selected columns.
    observed = set()
    for cc in cols:
        observed |= set(df_allheaders[cc].dropna().unique())
    observed.discard(True)
    observed.discard(False)

    # One boolean indicator per value, OR-ed over the selected columns.
    indicators = {}
    for kk in observed:
        hit = pd.Series(False, index=df_allheaders.index, dtype=bool)
        for cc in cols:
            hit = hit | df_allheaders[cc].map(
                lambda x: kk in x if type(x) is str else False)
        indicators[kk] = hit

    indicator_df = pd.DataFrame(indicators)
    indicator_df.columns = indicator_df.columns.map(
        lambda x: colprefix + "_" + x.replace(" ",""))

    for cc in cols:
        df_allheaders.drop(cc, axis=1, inplace=True)
    return pd.concat([df_allheaders, indicator_df], axis=1)
|
||||||
|
|
||||||
|
#############################33
|
||||||
|
def normalize_fields(df_allheaders):
    """Clean up raw DICOM header columns and expand multi-valued ones.

    Every step is guarded by a column-presence check (except the
    ``ViewModifierCodeSequence_CodeMeaning`` expansion, which is always
    attempted), so any subset of the known header fields may be supplied.

    Parameters
    ----------
    df_allheaders : pandas.DataFrame
        One column per header field.  The frame is modified in place
        (columns are overwritten, added and dropped); pass a copy if the
        original must survive.

    Returns
    -------
    pandas.DataFrame
        The normalised frame.  Always use the return value, because the
        list-field expansion rebinds the frame.

    Notes
    -----
    NOTE(review): the string tests below (``"yes"``, ``"mg"``, ``'cc'``,
    ``'mlo'``) are case-sensitive and expect lower-case values -- confirm
    that the input is lower-cased before this function is called.
    """
    # --- scalar clean-ups -------------------------------------------------
    if "PatientAge" in df_allheaders.columns:
        # e.g. "045Y" -> 45
        df_allheaders["PatientAge"] = df_allheaders["PatientAge"].map(
            lambda x: int(x.lower().rstrip('y')))

    if "DetectorActiveDimensions" in df_allheaders.columns:
        df_allheaders["DetectorActiveDimensions"] = \
            df_allheaders["DetectorActiveDimensions"].map(parse_float_tuples_prod)

    for spacing_col in ("PixelSpacing", "ImagerPixelSpacing"):
        if spacing_col in df_allheaders.columns:
            df_allheaders[spacing_col] = df_allheaders[spacing_col].map(format_PixelSpacing)

    if "ModalitiesInStudy" in df_allheaders.columns:
        df_allheaders["ModalitiesInStudy"] = df_allheaders["ModalitiesInStudy"].map(
            lambda x: "mg" in str(x))

    if "HalfValueLayer" in df_allheaders.columns:
        # Some values arrive as the repr of a bytes object, e.g. "b'0.53'".
        df_allheaders["HalfValueLayer"] = df_allheaders["HalfValueLayer"].map(
            lambda x: x if type(x) is float
            else float(str(x).replace('b', '').replace("'", '')))

    # FieldOfViewDimensions is deliberately left untouched: computing the
    # area and filling the gaps with the mode **worsens** the FNR.

    if "ViewPosition" in df_allheaders.columns:
        # True for the two standard screening views only.
        df_allheaders["ViewPosition"] = df_allheaders["ViewPosition"].map(
            lambda x: x in ['cc', 'mlo'])

    df_allheaders = extract_list_text_field(
        df_allheaders, colprefix="ViewModifierCodeSequence_CodeMeaning")

    # --- yes/no flags -----------------------------------------------------
    if "BreastImplantPresent" in df_allheaders.columns:
        # Anything not containing "yes" (including missing values) is False.
        df_allheaders["BreastImplantPresent"] = (
            df_allheaders["BreastImplantPresent"].map(str).map(lambda x: "yes" in x))

    if "PartialView" in df_allheaders:
        df_allheaders["PartialView"] = df_allheaders["PartialView"].map(
            lambda x: "yes" in x if type(x) is str else False)

    # --- tuple-valued fields ----------------------------------------------
    for kk in ["WindowWidth", "WindowCenter"]:
        if kk in df_allheaders.columns:
            df_allheaders[kk] = df_allheaders[kk].map(parse_int_tuples_median)

    if "PatientOrientation" in df_allheaders.columns:
        df_allheaders["PatientOrientation"] = \
            df_allheaders["PatientOrientation"].map(parse_str_tuples)

    if "DetectorElementPhysicalSize" in df_allheaders.columns:
        df_allheaders["DetectorElementPhysicalSize"] = \
            df_allheaders["DetectorElementPhysicalSize"].map(parse_float_tuples)

    if "Grid" in df_allheaders.columns:
        # Flatten the tuple repr and fix the "parrallel" misspelling, e.g.
        # "('reciprocating', 'parrallel')" -> "reciprocating parallel".
        df_allheaders["Grid"] = (df_allheaders["Grid"]
                                 .map(str)
                                 .map(lambda x: x.replace('(', '')
                                                 .replace(')', '')
                                                 .replace("'", "")
                                                 .replace(',', '')
                                                 .replace("parrallel", "parallel")))

    if "FieldOfViewOrigin" in df_allheaders.columns:
        origin = df_allheaders["FieldOfViewOrigin"]
        df_allheaders["FieldOfViewOrigin_x"] = origin.map(lambda x: get_index_from_int_tuple(x, 0))
        df_allheaders["FieldOfViewOrigin_y"] = origin.map(lambda x: get_index_from_int_tuple(x, 1))
        df_allheaders.drop("FieldOfViewOrigin", axis=1, inplace=True)

    # --- imputation -------------------------------------------------------
    if "FocalSpots" in df_allheaders.columns:
        focal_counts = df_allheaders["FocalSpots"].value_counts()
        # idxmax() yields the most frequent *value*; argmax() would yield its
        # integer position on current pandas.  Skip when nothing is observed.
        if len(focal_counts) > 0:
            df_allheaders.loc[df_allheaders["FocalSpots"].isnull(), "FocalSpots"] = \
                focal_counts.idxmax()

    for kk in ["PixelSpacing", "EstimatedRadiographicMagnificationFactor",
               "XRayTubeCurrent", "DistanceSourceToPatient"]:
        if kk in df_allheaders.columns:
            df_allheaders.loc[df_allheaders[kk].isnull(), kk] = df_allheaders[kk].median()

    # --- ImageType -> one boolean column per keyword ----------------------
    if "ImageType" in df_allheaders.columns:
        keywords = set(chain(*(df_allheaders["ImageType"].map(parse_str_tuples).tolist())))
        keywords.discard("")  # tolerate data without an empty keyword
        # sorted() keeps the order of the generated columns reproducible.
        for kk in sorted(keywords):
            # Membership is tested against the raw cell value.
            df_allheaders["ImageType" + "_" + kk] = \
                df_allheaders["ImageType"].map(lambda x: kk in x)
        df_allheaders.drop("ImageType", axis=1, inplace=True)

    return df_allheaders
|
||||||
|
|
||||||
|
|
||||||
|
def move_digits_back(allcolumns):
    """Move a leading numeric token of each column name to its end.

    Names are split on ``"_"``; when a name starts with an ASCII digit its
    first token is rotated to the back, e.g. ``"0_ViewCodeSequence"`` becomes
    ``"ViewCodeSequence_0"``.  All other names (including empty ones) are
    returned unchanged.

    Parameters
    ----------
    allcolumns : iterable of str
        Column names; the input itself is not modified.

    Returns
    -------
    list of str
        The renamed columns, in the original order.
    """
    digits = frozenset('0123456789')
    renamed = []
    for name in allcolumns:
        # name[:1] is '' for an empty name, which is not in `digits`,
        # so empty names no longer raise IndexError.
        if name[:1] in digits:
            tokens = name.split("_")
            name = "_".join(tokens[1:] + tokens[:1])
        renamed.append(name)
    return renamed
|
||||||
|
|
||||||
|
def get_features(df_allheaders, thr_stderr = 1e-6):
    """Turn a table of DICOM headers into a model-ready feature matrix.

    Steps: normalise the fields on a copy of the input, pick the usable
    text and numeric fields, one-hot encode the categorical ones, impute
    missing values with the column median and drop near-constant one-hot
    columns.

    Parameters
    ----------
    df_allheaders : pandas.DataFrame
        Raw header table; it is copied, not modified.
    thr_stderr : float, default 1e-6
        Forwarded to ``get_good_numeric_fields``.  With a value ``<= 0``
        no numeric fields are selected and only text fields are used
        (previously this path failed on an unbound variable).

    Returns
    -------
    pandas.DataFrame
        Feature matrix with the same index as the input, no missing values.
        One-hot columns are named ``"<field>=<value>"``.
    """
    df_allheaders = normalize_fields(df_allheaders.copy())
    text_fields = select_text_fields(df_allheaders)

    numeric_fields = []
    if thr_stderr > 0:
        numeric_fields = get_good_numeric_fields(df_allheaders, thr_stderr=thr_stderr)
    feature_columns = list(set(clean_up_field_list(numeric_fields + text_fields)))

    df_allheaders = make_lowercase_text_fields(df_allheaders)

    # Fields that are always treated as continuous ...
    known_numeric = {'ContentTime', 'FieldOfViewOrigin_x', 'FieldOfViewOrigin_y',
                     'HalfValueLayer', 'WindowWidth', 'CompressionForce',
                     'DetectorActiveDimensions', 'RelativeXRayExposure',
                     'ExposureTime', 'Exposure', 'BodyPartThickness',
                     'CollimatorLowerHorizontalEdge', 'WindowCenter',
                     'FieldOfViewRotation', 'KVP', 'DistanceSourceToDetector',
                     'DistanceSourceToEntrance', 'CollimatorLeftVerticalEdge',
                     'DetectorTemperature', 'HighBit'}
    # ... and fields that are always encoded when present in the table.
    known_categorical = {'Manufacturer', 'ManufacturerModelName', 'Grid_htc',
                         'ViewModifierCodeSequence_CodeMeaning'}

    # sorted() makes the column order of the result reproducible; the
    # original order depended on set iteration.
    noncategorical = sorted(set(feature_columns) & known_numeric)
    potentially_categorical = set(feature_columns) - set(noncategorical)
    potentially_categorical |= known_categorical & set(df_allheaders.columns)
    potentially_categorical = sorted(potentially_categorical)
    print("potentially_categorical", len(potentially_categorical))
    print("non_categorical", len(noncategorical))

    for cc in noncategorical:
        if str(df_allheaders[cc].dtype) == 'object':
            df_allheaders[cc] = df_allheaders[cc].map(parse_float).astype(float)

    if len(potentially_categorical) > 0:
        df_allheaders[potentially_categorical] = \
            df_allheaders[potentially_categorical].fillna('unknown')
        dummies = pd.get_dummies(df_allheaders[potentially_categorical],
                                 drop_first=True, prefix_sep='=')
        # Columns created by get_dummies are the ones that were not passed in.
        # Tracking them by name works whether the dummies are uint8 (old
        # pandas) or bool (pandas >= 2).
        passed_through = set(potentially_categorical)
        onehotcols = [cc for cc in dummies.columns if cc not in passed_through]
        features_onehot = pd.concat([dummies, df_allheaders[noncategorical]], axis=1)
    else:
        print("no features to binarise!")
        onehotcols = []
        features_onehot = df_allheaders[noncategorical].copy()

    # Impute remaining gaps with the column median, then drop any column
    # that still has gaps (e.g. an all-NaN column, whose median is NaN).
    for cc in features_onehot.columns[features_onehot.isnull().any()]:
        print("filling in with median:\t%s" % cc)
        features_onehot.loc[features_onehot[cc].isnull(), cc] = \
            features_onehot[cc].median()
    features_onehot = features_onehot.loc[:, ~features_onehot.isnull().any()]

    # Drop one-hot columns that are (almost) constant: fewer than 5 hits, or
    # set in less than 1% / more than 99% of the rows.
    thr_frac = 0.01
    onehotcols = [cc for cc in onehotcols if cc in features_onehot.columns]
    if onehotcols:
        onehot = features_onehot[onehotcols]
        hits = onehot.sum(0)
        frac = onehot.mean(0)
        uninformative = (hits < 5) | (frac < thr_frac) | (frac > (1 - thr_frac))
        bad_feature_cols = uninformative[uninformative].index.tolist()
        features_onehot = features_onehot.drop(bad_feature_cols, axis=1)

    # No FocalSpots mode-fill is needed here: every column with missing
    # values has already been imputed or removed above.
    return features_onehot
|
||||||
|
|
||||||
|
|
||||||
|
#############################
|
||||||
|
if __name__ == '__main__':
    # Exploratory driver (converted from a notebook): builds the feature
    # matrix, trains a gradient-boosting classifier for "special view"
    # detection and reports ROC / PR curves, feature importances and
    # FNR / FPR at several score thresholds.
    from functools import partial

    PREFIX="allfeatures"

    # ## Read a table of DICOM headers
    filelist_fn = '/home/dlituiev/data_dlituiev/manuallabeller/filelist/filelist_nonscreening_4000_seed42.csv'
    outpath = os.path.join(os.path.dirname(filelist_fn), "dicom_headers_all_fields_" + os.path.basename(filelist_fn))
    print(outpath)
    df_allheaders = pd.read_csv(outpath, index_col=0)
    features_onehot = get_features(df_allheaders)

    # ## Read labels
    fn_man_labels = "/data/dlituiev/tables/cleaned_manual_labels_valset_4000.txt"
    df = pd.read_table(fn_man_labels, index_col=0)
    df.index = df.index.map(lambda x : x.split("/")[-1])
    df["special_view"] = df["regular_view"].map(lambda x: not x)

    # NOTE(review): the left join keeps labelled files that have no header
    # row; their features are NaN -- confirm none exist before fitting.
    dfm = pd.merge(df[["special_view"]], features_onehot, how='left', left_index=True, right_index=True)

    import seaborn as sns
    import matplotlib.pyplot as plt
    from statsmodels.graphics.mosaicplot import mosaic
    plt.matplotlib.rcParams["hatch.color"] = [0.7]*3

    dfm.plot(x='special_view', y='XRayTubeCurrent', kind='scatter', alpha=0.05)
    dfm.plot(x='special_view', y='DistanceSourceToPatient', kind='scatter', alpha=0.05)

    target = dfm["special_view"]
    features = dfm.drop("special_view", axis=1)

    from sklearn.utils import shuffle
    from sklearn.naive_bayes import GaussianNB, BernoulliNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
    from vis_tree import visualize_tree
    from sklearn.model_selection import train_test_split, cross_val_score
    from sklearn.metrics import (accuracy_score, auc, confusion_matrix, f1_score,
                                 precision_score, roc_curve, precision_recall_curve)

    def _confusion(dtree, X_val, y_val, thr=None):
        """2x2 confusion matrix (rows: observed, columns: predicted).

        With ``thr=None`` the estimator's own ``predict`` is used; otherwise
        a sample is positive when its class-1 probability exceeds ``thr``.
        ``labels`` is fixed so the matrix is 2x2 even if a class is absent.
        """
        if thr is None:
            pred_y_val = dtree.predict(X_val)
        else:
            pred_y_val = dtree.predict_proba(X_val)[:,1] > thr
        return confusion_matrix(y_val, pred_y_val, labels=[False, True])

    def fnr(dtree, X_val, y_val, thr = None):
        """False negative rate FN / (FN + TP); 0.0 when there are no positives."""
        cm = _confusion(dtree, X_val, y_val, thr)
        n_pos = cm[1,:].sum()
        return cm[1,0]/n_pos if n_pos != 0 else 0.0

    def fpr(dtree, X_val, y_val, thr = None):
        """False positive rate FP / (FP + TN); 0.0 when there are no negatives."""
        cm = _confusion(dtree, X_val, y_val, thr)
        n_neg = cm[0,:].sum()
        return cm[0,1]/n_neg if n_neg != 0 else 0.0

    # 1/6 held out for validation, then 1/5 of the rest for testing.
    y_dev, y_val, X_dev, X_val = train_test_split(target, features, random_state=0, test_size=1/6)
    y_tr, y_ts, X_tr, X_ts = train_test_split(y_dev, X_dev, random_state=0, test_size=1/5)

    # Alternatives tried earlier:
    # dtree = DecisionTreeClassifier(max_depth=10, min_samples_leaf=5, criterion="entropy")
    # dtree = RandomForestClassifier(min_samples_split=10, min_samples_leaf=5)
    # dtree = AdaBoostClassifier(base_estimator=dtree, n_estimators=60, learning_rate=0.01)
    # dtree = AdaBoostClassifier(base_estimator=GaussianNB(), n_estimators=50, learning_rate=0.01)
    dtree = GradientBoostingClassifier(max_depth=8, n_estimators=40, learning_rate=0.05, min_samples_leaf=12)
    modelname = type(dtree).__name__
    frmt = 'png'

    dtree.fit(X_tr, y_tr)
    pred_y_ts = dtree.predict(X_ts)
    pred_yscore_ts = dtree.predict_proba(X_ts)

    # ## Precision-recall curve (test split)
    pr_, rec_, thresholds = precision_recall_curve(y_ts.tolist(), pred_yscore_ts[:,1], pos_label=1)
    auc_pr = auc(rec_, pr_)  # recall is the monotonic axis required by auc()
    plt.figure()
    plt.plot(pr_, rec_)
    plt.xlabel('Precision')
    plt.ylabel('Recall')
    plt.xlim([0,1])
    plt.ylim([0,1])
    plt.axis('equal')
    plt.axis('square')
    print("%.2f" % (100*auc_pr))
    # Own file name, so the ROC figure below no longer overwrites this one.
    plt.savefig("{}_{}_auprc.{}".format(PREFIX, modelname, frmt), dpi=300, format=frmt)

    # ## ROC curve (test split)
    fpr_, tpr_, thresholds = roc_curve(y_ts.tolist(), pred_yscore_ts[:,1], pos_label=1)
    auc_ = auc(fpr_, tpr_)
    plt.figure()
    plt.plot(fpr_, tpr_)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('AUC = {0:.2f}%'.format(100*auc_))
    plt.axis('equal')
    plt.axis('square')
    print("%.2f" % (100*auc_))
    plt.savefig("{}_{}_auc.{}".format(PREFIX, modelname, frmt), dpi=300, format=frmt)

    features.plot(x="EstimatedRadiographicMagnificationFactor", y="PixelSpacing", kind='scatter')

    # ## Feature importances
    fig,ax = plt.subplots(1, figsize=(6,14))
    feat_imp = pd.Series(dtree.feature_importances_, index=features.columns)
    feat_imp = feat_imp[feat_imp>0.0].sort_values()[::-1]
    feat_imp[::-1].plot(kind='barh', ax=ax)
    print(feat_imp)
    plt.savefig("{}_{}_feature_importances.{}".format(PREFIX, modelname, frmt), dpi=300, format=frmt)

    # ## Confusion tables on the test split
    #              predicted
    #              False | True
    # obs. False    TN   |  FP        FPR = FP / (FP + TN)
    # obs. True     FN   |  TP        FNR = FN / (FN + TP)
    df_confusion = pd.crosstab(pd.Series(y_ts.values, name="observed"), pd.Series(pred_y_ts, name="predicted"))
    print(df_confusion.to_csv(sep='|'))

    THR = 0.15
    pred_y_ts = dtree.predict_proba(X_ts)[:,1] > THR
    df_confusion = pd.crosstab(pd.Series(y_ts.values, name="observed"), pd.Series(pred_y_ts, name="predicted"))
    print(df_confusion.to_csv(sep='|'))

    # ## FNR / FPR at selected thresholds: test split vs. 5-fold CV on dev
    report = "\n".join([
        "model: {}",
        "threshold = {}",
        "+ on the hold-out set:\tFNR = {:.2f}%, FPR = {:.2f}%",
        "+ in 5-fold cross-validation (mean):\tFNR = {:.2f}%, FPR = {:.2f}%"])
    for THR in (0.05, 0.5):
        cv_fnr = cross_val_score(dtree, X_dev, y_dev, groups=None, scoring=partial(fnr, thr=THR), cv=5, n_jobs=1, pre_dispatch='2*n_jobs')
        cv_fpr = cross_val_score(dtree, X_dev, y_dev, groups=None, scoring=partial(fpr, thr=THR), cv=5, n_jobs=1, pre_dispatch='2*n_jobs')
        print(report.format(
            modelname, THR,
            100*fnr(dtree, X_ts, y_ts, thr = THR), 100*fpr(dtree, X_ts, y_ts, thr = THR),
            100*cv_fnr.mean(), 100*cv_fpr.mean()))

    # ## fnr (earlier results)
    # 0.1443 -- AdaBoostClassifier(50, lr=0.1) with:
    #       DecisionTreeClassifier(max_depth=7, min_samples_leaf=5, criterion="entropy")
    #       GaussianNB()
    # 0.1134 -- AdaBoostClassifier(50, lr=0.01) with:
    #       GaussianNB()

    # ## Validation split
    pred_y_val = dtree.predict(X_val)
    pred_yscore_val = dtree.predict_proba(X_val)
    print("accuracy", accuracy_score(y_true=y_val, y_pred=pred_y_val))
    print("f1", f1_score(y_true=y_val, y_pred=pred_y_val))
    print(confusion_matrix(y_true=y_val, y_pred=pred_y_val))

    THR = 0.15
    pred_pos_val = pred_yscore_val[:,1] > THR
    df_confusion = pd.crosstab(pd.Series(y_val.values, name="observed"),
                               pd.Series(pred_pos_val, name="predicted"))
    print(df_confusion)
    print("FNR", fnr(dtree, X_val, y_val, thr=THR))
    print("FPR", fpr(dtree, X_val, y_val, thr=THR))

    # ## Misclassified: examples and comments
    # A false negative is observed-positive AND predicted-negative.  (The
    # earlier `(score < 0.15) == False` selected predicted positives.)
    false_negatives = pd.Series(y_val.values.astype(bool) & ~pred_pos_val,
                                index=y_val.index, name="false_negative")
    print(false_negatives.shape, df.shape)

    xstr = [
        "1805162996_1.2.840.113654.2.70.1.75424722723272471565664976911416714890_2_37.png -- implant?",
        "1433463766_1.2.840.113654.2.70.1.243422935316700791950696878743366703411_6_6.png -- male?",
        "3395322213_1.2.840.113654.2.70.1.161905211577383187509354224390811944382_1161_7.png -- overexposed with scale grid",
        "1383662805_1.2.840.113654.2.70.1.194667288082835549565211946781626641146_1_88.png -- mag? bars in the image",
        "5717508670_1.2.840.113654.2.70.1.135196805563780165444562848954663016070_2_6.png -- spot",
        "1582554801_1.2.840.113654.2.70.1.202883517655342643705007475928329105895_1_1.png -- strange shape; plate",
        "3248534628_1.2.840.113654.2.70.1.153327658320065917717726871735320153117_14_8.png -- RLMID, implant",
        "1050998385_1.2.840.113654.2.70.1.294672228525412928579179278566440354700_168_12.png -- RMLO, underexposed, plate",
        "2431514667_1.2.840.113654.2.70.1.132697486450403983700631264913146412468_1_1.png -- regular CC",
        "2836025574_1.2.840.113654.2.70.1.94728406891527814842052605970255602447_31728_4.png -- regular CC, wire?",
        "2774547752_1.2.840.113654.2.70.1.152335331945150793610356395498084601027_47428_6.png -- poor exposure?",
        "6784971236_1.2.840.113654.2.70.1.276140387730485551768768734852859745761_21705_2.png -- regular CC",
        "6120027884_1.2.840.113654.2.70.1.202389441802705593488291262945242015864_28128_3.png -- spot",
        "2127109953_1.2.840.113654.2.70.1.136443797025605972119376095795980286524_5_26.png -- RML, scar",
        "5015120217_1.2.840.113654.2.70.1.8576402180164318136049174781190805706_19615_3.png -- regular MLO, underexposure",
        "2915273528_1.2.840.113654.2.70.1.50904067248781976561131370015339684052_3_51.png -- RLM",
        "2859796079_1.2.840.113654.2.70.1.248757700026158935826319533755178408586_3_51.png -- LMLO, scar",
    ]
    df_misclassified_comments = pd.DataFrame(
        [[part.strip() for part in x.split(" -- ")] for x in xstr],
        columns=["Filename", "comment"]).set_index("Filename")["comment"]
    print(df_misclassified_comments)

    # Comments for false negatives that had a standard view position and no
    # view modifier.
    # NOTE(review): the column names below are taken from the original
    # exploration; each filter is applied only if the column exists in the
    # one-hot feature matrix -- confirm the intended names.
    selected = false_negatives.copy()
    if 'ViewPosition' in X_val.columns:
        selected &= X_val['ViewPosition'].astype(bool)
    if 'ViewModifierCodeSequence' in X_val.columns:
        selected &= ~X_val['ViewModifierCodeSequence'].astype(bool)
    print(df_misclassified_comments.reindex(selected.index[selected]).dropna())

    print(X_val.columns)
    view_cols = [cc for cc in ('ViewPosition', 'ViewModifierCodeSequence') if cc in X_val.columns]
    print(X_val.loc[false_negatives, view_cols])
|
||||||
|
|
||||||
@@ -0,0 +1,97 @@
|
|||||||
|
|
||||||
|
# coding: utf-8
|
||||||
|
|
||||||
|
#cell#
|
||||||
|
|
||||||
|
import numpy as np
import pandas as pd
import sys
from header_cleaner import get_features, normalize_fields, parse_float_tuples, parse_float

# Normalise the selected DICOM header fields column by column and write the
# result as a gzip-compressed, tab-separated table.

#cell#
fn_features = "../tables/df_all_mammos_dicom_headers_selected.tab.gz"
# NOTE(review): the output is gzip-compressed although the name has no
# ".gz" suffix; the name is kept so downstream readers keep working.
outfn = "../tables/df_all_mammos_dicom_headers_selected_norm.tab"

dffeatures = pd.read_table(fn_features, index_col="filename")

#cell#
# ContentTime is sometimes a string such as "12:34:--"; strip the colons and
# replace the "--" placeholder with "30" before converting to a number.
mask_nonnumeric = ~dffeatures["ContentTime"].map(lambda x: isinstance(x, (float, int)))
dffeatures.loc[mask_nonnumeric, "ContentTime"] = dffeatures["ContentTime"][mask_nonnumeric].map(lambda x: float(x.replace(':','').replace('--',"30")))

#cell#
print("shape", dffeatures.shape)

#cell#
def _identity(x):
    """Leave the value untouched (column only needs a dtype cast)."""
    return x


def _to_float(x):
    """Parse a header value with ``parse_float`` and coerce it to float."""
    return float(parse_float(x))


def _highbit_to_str(x):
    """Render HighBit as text; whole-valued floats lose the trailing ".0".

    ``float.is_integer`` is False for NaN/inf and for fractional values, so
    those fall through to ``str(x)`` instead of being truncated by ``int``.
    """
    if isinstance(x, float) and x.is_integer():
        return str(int(x))
    return str(x)


# Per-column normaliser; every column of the input table must have an entry.
normalize_fun = {"0_ViewCodeSequence__0_ViewModifierCodeSequence_CodeMeaning":
                    lambda x: str(x).lower(),
                 "0_ViewCodeSequence_CodeValue": lambda x: str(x),
                 "Grid": lambda x: str(x).replace("'","")
                                     .replace("(","").replace(")","")
                                     .replace(",","").replace("/"," ")
                                     .replace('PARRALLEL',"PARALLEL")
                                     .lower(),
                 "HighBit": _highbit_to_str,
                 "WindowCenter": lambda x: np.median(parse_float_tuples(x)),
                 "FieldOfViewOrigin": parse_float_tuples,
                 "EstimatedRadiographicMagnificationFactor": _identity,
                 "ContentTime": _identity,
                 "FieldOfViewRotation": _to_float,
                 "KVP": _to_float,
                 "ShutterLowerHorizontalEdge": _to_float,
                 "ShutterRightVerticalEdge": _to_float,
                 "XRayTubeCurrentInuA": _to_float,
                 "RelativeXRayExposure": _to_float,
                 "ManufacturerModelName": lambda x: str(x).lower().replace('"',''),
                 "Manufacturer": lambda x: str(x).lower().replace('"','').replace(',', '').replace(" inc", "").rstrip('.'),
                 "BodyPartThickness": _to_float,
                 "CollimatorLeftVerticalEdge": _to_float,
                 "CollimatorLowerHorizontalEdge": _to_float,
                 "DetectorActiveDimensions" : lambda x: parse_float_tuples(x.replace("\\", ", ") if isinstance(x, str) else x),
                 "ExposureTime": _identity,
                 "ExposuresOnDetectorSinceLastCalibration": _identity,
                 "ExposuresOnDetectorSinceManufactured": _identity,
                 "DistanceSourceToEntrance": _identity,
                 "DetectorTemperature": _to_float,
                 "DistanceSourceToDetector": _identity,
                }

# Target dtype of each column after normalisation ('O' = Python objects).
# NOTE(review): "WindowCenter" is cast to int, which fails if a median is
# NaN -- confirm the column has no missing values.
dtypes = {"0_ViewCodeSequence__0_ViewModifierCodeSequence_CodeMeaning": str,
          "0_ViewCodeSequence_CodeValue": str,
          "Grid": str,
          "HighBit": str, # int
          "WindowCenter": int,
          "FieldOfViewOrigin": 'O',
          "EstimatedRadiographicMagnificationFactor": float,
          "ContentTime": float, #NaN
          "FieldOfViewRotation": float,
          "KVP": float,
          "ShutterLowerHorizontalEdge": float,
          "ShutterRightVerticalEdge": float,
          "XRayTubeCurrentInuA": float,
          "RelativeXRayExposure": float,
          "ManufacturerModelName": str,
          "Manufacturer": str,
          "BodyPartThickness": float,
          "CollimatorLeftVerticalEdge": float,
          "CollimatorLowerHorizontalEdge": float,
          "DetectorActiveDimensions" : 'O',
          "ExposureTime": float,
          "ExposuresOnDetectorSinceLastCalibration": float, # NaNs
          "ExposuresOnDetectorSinceManufactured": float, # NaNs
          "DistanceSourceToEntrance": float,
          "DetectorTemperature": float,
          "DistanceSourceToDetector": float,
         }

#cell#
# Fail before touching any data if a column has no normaliser or dtype,
# instead of dying half-way through the loop below.
unhandled = sorted((set(dffeatures.columns) - set(normalize_fun.keys()))
                   | (set(dffeatures.columns) - set(dtypes.keys())))
if unhandled:
    raise KeyError("no normaliser/dtype defined for columns: %s" % ", ".join(map(str, unhandled)))

#cell#
for kk in list(dffeatures.columns):
    print(kk)
    # Plain column assignment lets the column change dtype.
    dffeatures[kk] = dffeatures[kk].map(normalize_fun[kk]).astype(dtypes[kk])

dffeatures.to_csv(outfn, sep='\t', compression='gzip')
|
||||||
@@ -0,0 +1,48 @@
|
|||||||
|
ReduceLROnPlateau:
|
||||||
|
cooldown: 32
|
||||||
|
epsilon: 0.001
|
||||||
|
factor: 0.5
|
||||||
|
min_lr: 1.0e-08
|
||||||
|
mode: auto
|
||||||
|
monitor: val_loss
|
||||||
|
patience: 32
|
||||||
|
verbose: 0
|
||||||
|
base_trainable: true
|
||||||
|
batch_size: 256
|
||||||
|
class_mode: binary
|
||||||
|
class_weights: null
|
||||||
|
classes:
|
||||||
|
- normal
|
||||||
|
- special
|
||||||
|
contrast: null
|
||||||
|
data_augmentation: true
|
||||||
|
data_train: /data/UCSF_MAMMO/2018-02-png/withx_valset_4000_train
|
||||||
|
data_val: /data/UCSF_MAMMO/2018-02-png/withx_valset_4000_test
|
||||||
|
dropout: 0.5
|
||||||
|
fill_mode: reflect
|
||||||
|
final_activation: sigmoid
|
||||||
|
height_shift_range: 0.125
|
||||||
|
horizontal_flip: true
|
||||||
|
init_epoch: 0
|
||||||
|
loss_weights: null
|
||||||
|
lr: 0.0001
|
||||||
|
n_classes: 1
|
||||||
|
nb_epoch: 500
|
||||||
|
ndense: 0
|
||||||
|
oversampling: false
|
||||||
|
pretrained: true
|
||||||
|
rotation_range: 15
|
||||||
|
samplewise_center: false
|
||||||
|
seed: 2
|
||||||
|
target_side: 99
|
||||||
|
target_size:
|
||||||
|
- 99
|
||||||
|
- 99
|
||||||
|
truncate_quantile: null
|
||||||
|
vertical_flip: false
|
||||||
|
weightfile: null
|
||||||
|
width_shift_range: 0.125
|
||||||
|
zoom_range:
|
||||||
|
- 0.8
|
||||||
|
- 1.2
|
||||||
|
ztransform: false
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
../inception_short.py
|
||||||
@@ -0,0 +1,185 @@
|
|||||||
|
|
||||||
|
# coding: utf-8
|
||||||
|
import sys
|
||||||
|
import pandas as pd
|
||||||
|
sys.path.append('../..')
|
||||||
|
|
||||||
|
from inception_short import get_model, get_num_files, get_class_weights
|
||||||
|
from keras.optimizers import Adam
|
||||||
|
from image import ImageDataGenerator
|
||||||
|
# from keras.preprocessing.image import ImageDataGenerator
|
||||||
|
from keras.models import load_model
|
||||||
|
from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau
|
||||||
|
from checkpoint_utils import CSVWallClockLogger, lr_cyclic_schedule
|
||||||
|
from shutil import copy2
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
class AttrDict(dict):
    """Dictionary whose entries can also be read and written as attributes."""

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        # Alias the attribute namespace to the mapping itself, so that
        # d.key and d["key"] always refer to the same entry.
        self.__dict__ = self
|
||||||
|
|
||||||
|
import os
|
||||||
|
import yaml
|
||||||
|
import numpy as np
|
||||||
|
import keras
|
||||||
|
from hashlib import md5
|
||||||
|
os.environ["PYTHONHASHSEED"]='0'
|
||||||
|
os.environ['KERAS_BACKEND'] = 'tensorflow'
|
||||||
|
os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
|
||||||
|
|
||||||
|
# Fall back to GPU 1 when the caller has not selected a device.  Looking the
# variable up with .get() avoids the KeyError that a plain subscript raises
# when CUDA_VISIBLE_DEVICES is not defined at all.
if os.environ.get("CUDA_VISIBLE_DEVICES", "") == '':
    os.environ["CUDA_VISIBLE_DEVICES"] = '1'
|
||||||
|
|
||||||
|
|
||||||
|
indir = "./"
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
# Read the hyper-parameters stored in checkpoint.info inside `indir`.
# safe_load is used instead of yaml.load: without an explicit Loader,
# yaml.load can construct arbitrary Python objects, and recent PyYAML
# releases refuse the call altogether.
# NOTE(review): assumes checkpoint.info holds only plain scalars, lists and
# mappings (as the dumps shown next to this script do) -- confirm.
with open(os.path.join(indir, "checkpoint.info")) as chkpt_fh:
    prms = AttrDict(yaml.safe_load(chkpt_fh))
# Echo every parameter as "name<TAB>value", one per line.
print("\n".join(["%s\t%s" % (kk, vv) for kk, vv in prms.items()]),)
|
||||||
|
|
||||||
|
# The file to evaluate is named by the WFILE environment variable and is
# resolved relative to `indir`; the stored 'weightfile' entry is overwritten.
#weightfile = "model.175-0.068012.hdf5"
weightfile = os.environ["WFILE"]
prms['weightfile'] = os.path.join(indir, weightfile)
|
||||||
|
|
||||||
|
|
||||||
|
# In[6]:
|
||||||
|
|
||||||
|
|
||||||
|
prms["loss"] = '{}_crossentropy'.format( prms.class_mode )
|
||||||
|
print("loss:", prms["loss"])
|
||||||
|
|
||||||
|
# CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
|
||||||
|
|
||||||
|
SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
|
||||||
|
STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
|
||||||
|
|
||||||
|
print('='*50)
|
||||||
|
print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
|
||||||
|
print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
|
||||||
|
print('='*50)
|
||||||
|
#########################################
|
||||||
|
|
||||||
|
# A saved model is mandatory: everything below predicts with `model`, so stop
# here with a clear message instead of hitting a NameError later on.
if not prms.weightfile:
    raise ValueError("no weight file given; set WFILE to a saved model file")
print("LOADING WEIGHTS FROM:\t%s" % prms.weightfile)
# model.load_weights(prms.weightfile)
model = load_model(prms.weightfile)
|
||||||
|
|
||||||
|
|
||||||
|
# In[22]:
|
||||||
|
|
||||||
|
|
||||||
|
flowfromdir_params = dict(
|
||||||
|
# color_mode = "grayscale",
|
||||||
|
target_size=prms.target_size,
|
||||||
|
batch_size=prms.batch_size,
|
||||||
|
class_mode=prms.class_mode,
|
||||||
|
classes=prms.classes,
|
||||||
|
seed=prms.seed)
|
||||||
|
|
||||||
|
norm_params = dict(
|
||||||
|
#rescale=prms.scaleup,
|
||||||
|
samplewise_center=prms.samplewise_center,
|
||||||
|
samplewise_std_normalization=prms.samplewise_center,
|
||||||
|
featurewise_center=False,
|
||||||
|
featurewise_std_normalization=False,
|
||||||
|
zca_whitening=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# In[23]:
|
||||||
|
|
||||||
|
|
||||||
|
train_datagen = ImageDataGenerator(**norm_params)
|
||||||
|
|
||||||
|
train_datagen.preprocessing_function = lambda x: x[...,::-1,:]#*2**-8
|
||||||
|
datagen_train_output = train_datagen.flow_from_directory(
|
||||||
|
prms.data_train,
|
||||||
|
#stratify = prms.oversampling,
|
||||||
|
#sampling_factor=prms.sampling_factor,
|
||||||
|
#oversampling=prms.oversampling,
|
||||||
|
shuffle=False, **flowfromdir_params)
|
||||||
|
SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
|
||||||
|
STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
|
||||||
|
|
||||||
|
##########################################
|
||||||
|
def get_predictions(data_dir,
                    preprocessing_function=lambda x: x,
                    model=model):
    """Score every image found under ``data_dir`` and return a result table.

    Parameters
    ----------
    data_dir : directory handed to ``flow_from_directory``.
    preprocessing_function : callable installed on the generator, or one of
        the names 'fliplr', 'identity' or 'orig'.
    model : object whose ``predict_generator`` yields the scores; defaults to
        the module-level ``model`` as bound when this function is defined.

    Returns
    -------
    pandas.DataFrame with one ``scores_<k>`` column per column of the
    prediction array, plus ``files`` and ``label`` taken from the generator.

    Raises
    ------
    ValueError if ``preprocessing_function`` is an unrecognised name.
    """
    if isinstance(preprocessing_function, str):
        named_transforms = {
            # reverse the second-to-last array axis
            'fliplr': lambda x: x[..., ::-1, :],
            'identity': lambda x: x,
            'orig': lambda x: x,
        }
        if preprocessing_function not in named_transforms:
            raise ValueError('unknown preprocessing_function:\t%s'
                             % preprocessing_function)
        preprocessing_function = named_transforms[preprocessing_function]

    generator = ImageDataGenerator(**norm_params)
    generator.preprocessing_function = preprocessing_function
    # shuffle=False keeps predictions aligned with .filenames / .classes
    batches = generator.flow_from_directory(
        data_dir,
        shuffle=False, **flowfromdir_params)

    yhat = model.predict_generator(batches,
                                   steps=len(batches),
                                   verbose=1,)

    columns = {"scores_%d" % idx: column for idx, column in enumerate(yhat.T)}
    columns["files"] = batches.filenames
    columns["label"] = batches.classes
    return pd.DataFrame(columns)
|
||||||
|
##########################################
|
||||||
|
# HOLDOUT
|
||||||
|
##########################################
|
||||||
|
data_holdout = '/data/UCSF_MAMMO/2018-02-png/withx_valset_4000_val'

# One row per data set, in processing order (hold-out, test, train):
# (image directory, CSV for the unmodified images, CSV for the flipped images).
prediction_jobs = (
    (data_holdout, "predictions_val.csv", "predictions_val_fliplr.csv"),
    (prms.data_val, "predictions_test.csv", "predictions_test_fliplr.csv"),
    (prms.data_train, "predictions_train.csv", "predictions_train_fliplr.csv"),
)

for data_dir, plain_csv, flipped_csv in prediction_jobs:
    # Images exactly as the generator delivers them.
    dfres = get_predictions(
        data_dir,
        preprocessing_function=lambda x: x,
        model=model)
    dfres.to_csv(plain_csv, index=False)

    # Same images with the second-to-last array axis reversed.
    preprocessing_function = lambda x: x[..., ::-1, :]
    dfres = get_predictions(
        data_dir,
        preprocessing_function=preprocessing_function,
        model=model)
    dfres.to_csv(flipped_csv, index=False)
|
||||||
|
|
||||||
+239
@@ -0,0 +1,239 @@
|
|||||||
|
from inception_short import get_model, get_num_files, get_class_weights
|
||||||
|
from keras.optimizers import Adam
|
||||||
|
from image import ImageDataGenerator
|
||||||
|
#from keras.preprocessing.image import ImageDataGenerator
|
||||||
|
from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau
|
||||||
|
from checkpoint_utils import CSVWallClockLogger, lr_cyclic_schedule
|
||||||
|
from shutil import copy2
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
class AttrDict(dict):
    """A dict that exposes its keys as attributes (``d.key`` is ``d["key"]``)."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The instance's attribute dictionary is the mapping itself, which
        # keeps item access and attribute access in sync.
        self.__dict__ = self
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import yaml
|
||||||
|
import numpy as np
|
||||||
|
import keras
|
||||||
|
from hashlib import md5
|
||||||
|
os.environ["PYTHONHASHSEED"]='0'
|
||||||
|
os.environ['KERAS_BACKEND'] = 'tensorflow'
|
||||||
|
os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
|
||||||
|
os.environ["CUDA_VISIBLE_DEVICES"] = '1'
|
||||||
|
|
||||||
|
prms = AttrDict(
|
||||||
|
dropout=0.5,
|
||||||
|
base_trainable=True,
|
||||||
|
horizontal_flip = True,
|
||||||
|
vertical_flip = False,
|
||||||
|
zoom_range = [0.8, 1.2],
|
||||||
|
rotation_range = 15,
|
||||||
|
fill_mode='reflect',
|
||||||
|
ndense=0,
|
||||||
|
batch_size = 128*2,
|
||||||
|
init_epoch=0,
|
||||||
|
nb_epoch = 500,
|
||||||
|
data_augmentation = True,
|
||||||
|
contrast = None, #0.8,
|
||||||
|
truncate_quantile = None,#0.001,
|
||||||
|
ztransform = False,
|
||||||
|
oversampling = False,
|
||||||
|
#sampling_factor = None, [1, 6, 16, 64, 4],
|
||||||
|
seed=2,
|
||||||
|
width_shift_range = 0.125,
|
||||||
|
height_shift_range = 0.125,
|
||||||
|
class_mode = 'binary', # 'binary', #
|
||||||
|
n_classes = 1,
|
||||||
|
final_activation = 'sigmoid',
|
||||||
|
lr = 1e-4,
|
||||||
|
samplewise_center = False, #True
|
||||||
|
target_side = 99,
|
||||||
|
weightfile = None,
|
||||||
|
pretrained = True,
|
||||||
|
data_train = '/data/UCSF_MAMMO/2018-02-png/withx_valset_4000_train',
|
||||||
|
data_val = '/data/UCSF_MAMMO/2018-02-png/withx_valset_4000_test',
|
||||||
|
classes = ['normal', 'special'],
|
||||||
|
class_weights=None,#[1, 1, 4, 8, 4],
|
||||||
|
loss_weights = None,
|
||||||
|
ReduceLROnPlateau = dict(
|
||||||
|
monitor='val_loss',
|
||||||
|
factor=1/2,
|
||||||
|
patience=32,
|
||||||
|
verbose=0,
|
||||||
|
mode='auto', epsilon=0.001,
|
||||||
|
cooldown=32,
|
||||||
|
min_lr=1e-8,
|
||||||
|
),
|
||||||
|
# lr_cyclic_schedule = dict(
|
||||||
|
# #lr_init = 1.0e-3,
|
||||||
|
# drop = 2/5,
|
||||||
|
# epochs_drop = 20,
|
||||||
|
# cycle_len = 200.0
|
||||||
|
# )
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
paramhash = md5(str(prms).encode()).hexdigest()
|
||||||
|
|
||||||
|
prms["target_size"] = [ prms.target_side ]*2
|
||||||
|
|
||||||
|
CHECKPOINT_DIR = "checkpoints/" + paramhash + "/"
|
||||||
|
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
|
||||||
|
print("SAVING TO:\t%s" % CHECKPOINT_DIR)
|
||||||
|
# copy the script to the checkpoint directory
|
||||||
|
copy2(os.path.abspath(__file__), CHECKPOINT_DIR)
|
||||||
|
with open(os.path.join(CHECKPOINT_DIR, "checkpoint.info"), "w+") as outfh:
|
||||||
|
yaml.dump(dict(prms), outfh, default_flow_style=False)
|
||||||
|
|
||||||
|
prms["loss"] = '{}_crossentropy'.format( prms.class_mode )
|
||||||
|
print("loss:", prms["loss"])
|
||||||
|
|
||||||
|
CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
|
||||||
|
|
||||||
|
SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
|
||||||
|
STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
|
||||||
|
|
||||||
|
print('='*50)
|
||||||
|
print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
|
||||||
|
print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
|
||||||
|
print('='*50)
|
||||||
|
#########################################
|
||||||
|
checkpoint = ModelCheckpoint(CHECKPOINT_PATH, monitor='val_loss', verbose=1,
|
||||||
|
save_best_only=False, save_weights_only=False, mode='auto', period=1)
|
||||||
|
|
||||||
|
csv_path = os.path.join(CHECKPOINT_DIR, "progresslog.csv")
|
||||||
|
csv_callback = CSVWallClockLogger(csv_path, separator=',', append=False)
|
||||||
|
|
||||||
|
|
||||||
|
callback_list = [checkpoint, csv_callback]
|
||||||
|
|
||||||
|
if ("ReduceLROnPlateau" in prms) and prms["ReduceLROnPlateau"]:
|
||||||
|
callback_list.append(ReduceLROnPlateau(**prms["ReduceLROnPlateau"]))
|
||||||
|
|
||||||
|
elif "lr_cyclic_schedule" in prms:
|
||||||
|
callback_list.append(
|
||||||
|
LearningRateScheduler(
|
||||||
|
partial(lr_cyclic_schedule,
|
||||||
|
lr_init = prms.lr,
|
||||||
|
**prms.lr_cyclic_schedule)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
#########################################
|
||||||
|
model = get_model(n_classes=prms.n_classes,
|
||||||
|
final_activation=prms.final_activation,
|
||||||
|
ndense=prms.ndense,
|
||||||
|
dropout=prms.dropout,
|
||||||
|
base_trainable=prms.base_trainable,
|
||||||
|
weights = 'imagenet' if prms.pretrained else None,
|
||||||
|
input_shape = prms.target_size + [3])
|
||||||
|
|
||||||
|
|
||||||
|
#from keras.utils import plot_model
|
||||||
|
#plot_model(model, to_file='model.png')
|
||||||
|
|
||||||
|
model.compile(optimizer=Adam(lr=prms.lr), loss=prms.loss,
|
||||||
|
metrics=['accuracy'],
|
||||||
|
)
|
||||||
|
#########################################
|
||||||
|
if prms.weightfile:
|
||||||
|
print("loading weights from:\t%s" % prms.weightfile)
|
||||||
|
model.load_weights(prms.weightfile)
|
||||||
|
|
||||||
|
#########################################
|
||||||
|
print('Using real-time data augmentation.')
|
||||||
|
|
||||||
|
flowfromdir_params = dict(
|
||||||
|
#color_mode = "grayscale",
|
||||||
|
target_size=prms.target_size,
|
||||||
|
batch_size=prms.batch_size,
|
||||||
|
class_mode=prms.class_mode,
|
||||||
|
classes=prms.classes,
|
||||||
|
seed=prms.seed)
|
||||||
|
norm_params = dict(
|
||||||
|
#rescale=prms.scaleup,
|
||||||
|
samplewise_center=prms.samplewise_center,
|
||||||
|
samplewise_std_normalization=prms.samplewise_center,
|
||||||
|
featurewise_center=False,
|
||||||
|
featurewise_std_normalization=False,
|
||||||
|
zca_whitening=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _ztransform(x):
|
||||||
|
return (x-np.mean(x)) / np.std(x)
|
||||||
|
|
||||||
|
# Turn the optional 'preprocessing_function' setting into a callable; without
# the setting the images pass through unchanged.
# NOTE(review): the generator construction visible in this script does not
# appear to pass this callable on -- confirm whether it is still needed.
if 'preprocessing_function' not in prms:
    preprocessing_function = lambda x: x
elif prms.preprocessing_function == 'ztransform':
    preprocessing_function = _ztransform
elif prms.preprocessing_function == 'm1p1':
    preprocessing_function = lambda x: x/128.0 - 1
else:
    raise ValueError("unknown preprocessing_function")
|
||||||
|
|
||||||
|
|
||||||
|
if prms.data_augmentation:
|
||||||
|
|
||||||
|
print('Using real-time data augmentation.')
|
||||||
|
train_datagen = ImageDataGenerator(
|
||||||
|
zoom_range=prms.zoom_range,
|
||||||
|
fill_mode=prms.fill_mode,
|
||||||
|
rotation_range = prms.rotation_range,
|
||||||
|
width_shift_range = prms.width_shift_range,
|
||||||
|
height_shift_range = prms.height_shift_range,
|
||||||
|
horizontal_flip=prms.horizontal_flip,
|
||||||
|
vertical_flip=prms.vertical_flip,
|
||||||
|
contrast = prms.contrast,
|
||||||
|
z_transform = prms.ztransform,
|
||||||
|
truncate_quantile = prms.truncate_quantile,
|
||||||
|
#histeq_alpha=prms.histeq_alpha,
|
||||||
|
**norm_params)
|
||||||
|
else:
|
||||||
|
train_datagen = ImageDataGenerator(**norm_params)
|
||||||
|
|
||||||
|
val_datagen = ImageDataGenerator(**norm_params)
|
||||||
|
|
||||||
|
datagen_train_output = train_datagen.flow_from_directory(
|
||||||
|
prms.data_train,
|
||||||
|
stratify = prms.oversampling,
|
||||||
|
sampling_factor=prms.sampling_factor if (prms.oversampling) else None,
|
||||||
|
oversampling=prms.oversampling,
|
||||||
|
shuffle=True, **flowfromdir_params)
|
||||||
|
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
prms.data_val, shuffle=False, **flowfromdir_params)
|
||||||
|
|
||||||
|
#VALIDATION_STEPS = get_num_files(prms.data_val) // prms.batch_size
# The step count must be a whole number of batches.  Plain "/" produced a
# float here; rounding up keeps the final, partially filled batch.
VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames) / prms['batch_size']))
print("validation steps", VALIDATION_STEPS)
|
||||||
|
#########################################
|
||||||
|
# 'auto' asks get_class_weights for the weights; any other value is used as is.
# NOTE(review): the 'auto' weights are computed from the validation generator
# although they are applied during training -- confirm this is intended.
class_weights = (get_class_weights(datagen_val_output)
                 if prms.class_weights == 'auto'
                 else prms.class_weights)
|
||||||
|
|
||||||
|
model.fit_generator(datagen_train_output,
|
||||||
|
steps_per_epoch=STEPS_PER_EPOCH,
|
||||||
|
epochs=prms.nb_epoch, verbose=1,
|
||||||
|
validation_data=datagen_val_output,
|
||||||
|
validation_steps=VALIDATION_STEPS,
|
||||||
|
#class_weight='auto',
|
||||||
|
class_weight=class_weights,
|
||||||
|
callbacks=callback_list,
|
||||||
|
initial_epoch=prms.init_epoch)
|
||||||
|
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
prms.data_val, shuffle=False, **flowfromdir_params)
|
||||||
|
|
||||||
|
print("""loss\t%.4f
|
||||||
|
accuracy\t%.4f\n""" %
|
||||||
|
tuple(model.evaluate_generator(datagen_val_output,
|
||||||
|
steps=VALIDATION_STEPS,
|
||||||
|
workers=1,
|
||||||
|
pickle_safe=True)))
|
||||||
|
|
||||||
|
|
||||||
|
#model.predict()
|
||||||
@@ -0,0 +1,48 @@
|
|||||||
|
ReduceLROnPlateau:
|
||||||
|
cooldown: 8
|
||||||
|
epsilon: 0.001
|
||||||
|
factor: 0.5
|
||||||
|
min_lr: 1.0e-12
|
||||||
|
mode: auto
|
||||||
|
monitor: val_loss
|
||||||
|
patience: 64
|
||||||
|
verbose: 0
|
||||||
|
base_trainable: false
|
||||||
|
batch_size: 16
|
||||||
|
class_mode: categorical
|
||||||
|
class_weights:
|
||||||
|
- 1
|
||||||
|
- 1
|
||||||
|
classes:
|
||||||
|
- normal
|
||||||
|
- wire
|
||||||
|
data_augmentation: true
|
||||||
|
data_train: /data/UCSF_MAMMO/2018-02-png/each_class_4189_train/
|
||||||
|
data_val: /data/UCSF_MAMMO/2018-02-png/each_class_4189_test/
|
||||||
|
dropout: 0.5
|
||||||
|
fill_mode: reflect
|
||||||
|
final_activation: softmax
|
||||||
|
height_shift_range: 0.125
|
||||||
|
horizontal_flip: true
|
||||||
|
init_epoch: 0
|
||||||
|
lr: 0.001
|
||||||
|
n_classes: 2
|
||||||
|
nb_epoch: 500
|
||||||
|
ndense: 0
|
||||||
|
oversampling: false
|
||||||
|
rescale: 1
|
||||||
|
rotation_range: 30
|
||||||
|
samplewise_center: false
|
||||||
|
seed: 1
|
||||||
|
target_side: 299
|
||||||
|
target_size:
|
||||||
|
- 299
|
||||||
|
- 299
|
||||||
|
truncate_quantile: null
|
||||||
|
vertical_flip: false
|
||||||
|
weightfile: null
|
||||||
|
width_shift_range: 0.125
|
||||||
|
zoom_range:
|
||||||
|
- 0.8
|
||||||
|
- 1.2
|
||||||
|
ztransform: true
|
||||||
+49
@@ -0,0 +1,49 @@
|
|||||||
|
ReduceLROnPlateau:
|
||||||
|
cooldown: 8
|
||||||
|
epsilon: 0.001
|
||||||
|
factor: 0.5
|
||||||
|
min_lr: 1.0e-12
|
||||||
|
mode: auto
|
||||||
|
monitor: val_loss
|
||||||
|
patience: 64
|
||||||
|
verbose: 0
|
||||||
|
base_trainable: false
|
||||||
|
batch_size: 16
|
||||||
|
class_mode: categorical
|
||||||
|
class_weights:
|
||||||
|
- 1
|
||||||
|
- 1
|
||||||
|
classes:
|
||||||
|
- normal
|
||||||
|
- wire
|
||||||
|
data_augmentation: true
|
||||||
|
data_holdout: /data/UCSF_MAMMO/2018-02-png/each_class_augm_xw_4189_val/
|
||||||
|
data_train: /data/UCSF_MAMMO/2018-02-png/each_class_4189_train/
|
||||||
|
data_val: /data/UCSF_MAMMO/2018-02-png/each_class_4189_test/
|
||||||
|
dropout: 0.5
|
||||||
|
fill_mode: reflect
|
||||||
|
final_activation: softmax
|
||||||
|
height_shift_range: 0.125
|
||||||
|
horizontal_flip: true
|
||||||
|
init_epoch: 0
|
||||||
|
lr: 0.001
|
||||||
|
n_classes: 2
|
||||||
|
nb_epoch: 500
|
||||||
|
ndense: 0
|
||||||
|
oversampling: false
|
||||||
|
rescale: 1
|
||||||
|
rotation_range: 30
|
||||||
|
samplewise_center: false
|
||||||
|
seed: 2
|
||||||
|
target_side: 299
|
||||||
|
target_size:
|
||||||
|
- 299
|
||||||
|
- 299
|
||||||
|
truncate_quantile: null
|
||||||
|
vertical_flip: false
|
||||||
|
weightfile: model.147-0.000774.hdf5
|
||||||
|
width_shift_range: 0.125
|
||||||
|
zoom_range:
|
||||||
|
- 0.8
|
||||||
|
- 1.2
|
||||||
|
ztransform: true
|
||||||
+315
@@ -0,0 +1,315 @@
|
|||||||
|
import sys
|
||||||
|
import pandas as pd
|
||||||
|
sys.path.append('../..')
|
||||||
|
|
||||||
|
from inception_short import get_model, get_num_files, get_class_weights
|
||||||
|
from keras.optimizers import Adam
|
||||||
|
from image import ImageDataGenerator
|
||||||
|
#from keras.preprocessing.image import ImageDataGenerator
|
||||||
|
from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
|
||||||
|
from checkpoint_utils import CSVWallClockLogger
|
||||||
|
from shutil import copy2
|
||||||
|
from losses import acc_0, acc_1, acc_2, acc_3, acc_4
|
||||||
|
|
||||||
|
class AttrDict(dict):
    """Mapping with attribute-style access to its items."""

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        # Share one namespace between items and attributes.
        self.__dict__ = self
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import yaml
|
||||||
|
import numpy as np
|
||||||
|
import keras
|
||||||
|
from hashlib import md5
|
||||||
|
os.environ["PYTHONHASHSEED"]='0'
|
||||||
|
os.environ['KERAS_BACKEND'] = 'tensorflow'
|
||||||
|
os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
|
||||||
|
os.environ["CUDA_VISIBLE_DEVICES"]="0"
|
||||||
|
|
||||||
|
prms = AttrDict(
|
||||||
|
dropout=0.5,
|
||||||
|
base_trainable=False,
|
||||||
|
horizontal_flip = True,
|
||||||
|
vertical_flip = False,
|
||||||
|
zoom_range = [0.8, 1.2],
|
||||||
|
rotation_range = 30,
|
||||||
|
fill_mode='reflect',
|
||||||
|
ndense=0,
|
||||||
|
batch_size = 16,
|
||||||
|
init_epoch=0,
|
||||||
|
nb_epoch = 500,
|
||||||
|
data_augmentation = True,
|
||||||
|
rescale = 1, #2**-8,
|
||||||
|
#contrast = 0.9,
|
||||||
|
truncate_quantile = None,#0.001,
|
||||||
|
ztransform = True,
|
||||||
|
oversampling = False,
|
||||||
|
#sampling_factor = [1, 4],
|
||||||
|
seed=2,
|
||||||
|
width_shift_range = 0.125,
|
||||||
|
height_shift_range = 0.125,
|
||||||
|
class_mode = 'categorical', # 'binary', #
|
||||||
|
n_classes = 2,
|
||||||
|
final_activation = "softmax", # 'sigmoid',
|
||||||
|
lr = 1e-3,
|
||||||
|
samplewise_center = False, #True
|
||||||
|
target_side = 299,
|
||||||
|
#weights = None,
|
||||||
|
weightfile = "model.147-0.000774.hdf5",
|
||||||
|
data_train = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_train/',
|
||||||
|
data_val = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_test/',
|
||||||
|
data_holdout = "/data/UCSF_MAMMO/2018-02-png/each_class_augm_xw_4189_val/",
|
||||||
|
classes = ["normal", "wire"],
|
||||||
|
class_weights=[1, 1],
|
||||||
|
ReduceLROnPlateau = dict(
|
||||||
|
monitor='val_loss',
|
||||||
|
factor=1/2,
|
||||||
|
patience=32*2,
|
||||||
|
verbose=0,
|
||||||
|
mode='auto', epsilon=0.001,
|
||||||
|
cooldown=8,
|
||||||
|
min_lr=1e-12,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
paramhash = md5(str(prms).encode()).hexdigest()
|
||||||
|
|
||||||
|
prms["target_size"] = [ prms.target_side ]*2
|
||||||
|
|
||||||
|
CHECKPOINT_DIR = "checkpoints/" + paramhash + "/"
|
||||||
|
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
|
||||||
|
print("SAVING TO:\t%s" % CHECKPOINT_DIR)
|
||||||
|
# copy the script to the checkpoint directory
|
||||||
|
copy2(os.path.abspath(__file__), CHECKPOINT_DIR)
|
||||||
|
with open(os.path.join(CHECKPOINT_DIR, "checkpoint.info"), "w+") as outfh:
|
||||||
|
yaml.dump(dict(prms), outfh, default_flow_style=False)
|
||||||
|
# w_categorical_crossentropy
|
||||||
|
CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
|
||||||
|
|
||||||
|
SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
|
||||||
|
STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
|
||||||
|
|
||||||
|
print('='*50)
|
||||||
|
print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
|
||||||
|
print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
|
||||||
|
print('='*50)
|
||||||
|
#########################################
|
||||||
|
checkpoint = ModelCheckpoint(CHECKPOINT_PATH, monitor='val_loss', verbose=1,
|
||||||
|
save_best_only=True, save_weights_only=False, mode='auto', period=1)
|
||||||
|
|
||||||
|
csv_path = os.path.join(CHECKPOINT_DIR, "progresslog.csv")
|
||||||
|
csv_callback = CSVWallClockLogger(csv_path, separator=',', append=False)
|
||||||
|
|
||||||
|
prms["loss"] = '{}_crossentropy'.format( prms.class_mode )
|
||||||
|
|
||||||
|
callback_list = [checkpoint, csv_callback]
|
||||||
|
|
||||||
|
|
||||||
|
if ("ReduceLROnPlateau" in prms) and prms["ReduceLROnPlateau"]:
|
||||||
|
callback_list.append(ReduceLROnPlateau(**prms["ReduceLROnPlateau"]))
|
||||||
|
|
||||||
|
#########################################
|
||||||
|
model = get_model(n_classes=prms.n_classes,
|
||||||
|
final_activation=prms.final_activation,
|
||||||
|
ndense=prms.ndense,
|
||||||
|
#weights = prms.weights,
|
||||||
|
dropout=prms.dropout,
|
||||||
|
base_trainable=prms.base_trainable)
|
||||||
|
|
||||||
|
|
||||||
|
#from keras.utils import plot_model
|
||||||
|
#plot_model(model, to_file='model.png')
|
||||||
|
if __name__ == '__main__':
|
||||||
|
model.compile(optimizer=Adam(lr=prms.lr), loss=prms.loss,
|
||||||
|
metrics=['accuracy', acc_0, acc_1,# acc_2, acc_3, acc_4
|
||||||
|
],
|
||||||
|
)
|
||||||
|
#########################################
|
||||||
|
if prms.weightfile:
|
||||||
|
print("loading weights from:\t%s" % prms.weightfile)
|
||||||
|
model.load_weights(prms.weightfile)
|
||||||
|
|
||||||
|
#########################################
|
||||||
|
print('Using real-time data augmentation.')
|
||||||
|
|
||||||
|
flowfromdir_params = dict(
|
||||||
|
#color_mode = "grayscale",
|
||||||
|
target_size=prms.target_size,
|
||||||
|
batch_size=prms.batch_size,
|
||||||
|
class_mode=prms.class_mode,
|
||||||
|
classes=prms.classes,
|
||||||
|
seed=prms.seed)
|
||||||
|
norm_params = dict(
|
||||||
|
rescale=prms.rescale,
|
||||||
|
samplewise_center=prms.samplewise_center,
|
||||||
|
samplewise_std_normalization=prms.samplewise_center,
|
||||||
|
featurewise_center=False,
|
||||||
|
featurewise_std_normalization=False,
|
||||||
|
zca_whitening=False,
|
||||||
|
z_transform = prms.ztransform,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _ztransform(x):
|
||||||
|
return (x-np.mean(x)) / np.std(x)
|
||||||
|
|
||||||
|
# Map the optional 'preprocessing_function' setting to a callable.  Absent
# setting: identity.  Known names: 'ztransform' and 'm1p1'.  Anything else is
# rejected.
if 'preprocessing_function' not in prms:
    preprocessing_function = lambda x: x
elif prms.preprocessing_function == 'ztransform':
    preprocessing_function = _ztransform
elif prms.preprocessing_function == 'm1p1':
    preprocessing_function = lambda x: x/128.0 - 1
else:
    raise ValueError("unknown preprocessing_function")
|
||||||
|
|
||||||
|
if prms.data_augmentation:
|
||||||
|
|
||||||
|
print('Using real-time data augmentation.')
|
||||||
|
train_datagen = ImageDataGenerator(
|
||||||
|
zoom_range=prms.zoom_range,
|
||||||
|
fill_mode=prms.fill_mode,
|
||||||
|
rotation_range = prms.rotation_range,
|
||||||
|
width_shift_range = prms.width_shift_range,
|
||||||
|
height_shift_range = prms.height_shift_range,
|
||||||
|
horizontal_flip=prms.horizontal_flip,
|
||||||
|
vertical_flip=prms.vertical_flip,
|
||||||
|
contrast = prms.contrast if "contrast" in prms else None,
|
||||||
|
truncate_quantile = prms.truncate_quantile,
|
||||||
|
#histeq_alpha=prms.histeq_alpha,
|
||||||
|
**norm_params)
|
||||||
|
else:
|
||||||
|
train_datagen = ImageDataGenerator(**norm_params)
|
||||||
|
|
||||||
|
|
||||||
|
datagen_train_output = train_datagen.flow_from_directory(
|
||||||
|
prms.data_train,
|
||||||
|
shuffle=False, **flowfromdir_params)
|
||||||
|
|
||||||
|
#VALIDATION_STEPS = get_num_files(prms.data_val) // prms.batch_size
|
||||||
|
|
||||||
|
##########################################
|
||||||
|
# HOLDOUT
|
||||||
|
##########################################
|
||||||
|
val_datagen = ImageDataGenerator(**norm_params)
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
prms.data_holdout, shuffle=False, **flowfromdir_params)
|
||||||
|
VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
|
||||||
|
print("validation steps", VALIDATION_STEPS)
|
||||||
|
|
||||||
|
yhat = model.predict_generator(datagen_val_output,
|
||||||
|
steps=VALIDATION_STEPS,
|
||||||
|
verbose=1,)
|
||||||
|
|
||||||
|
|
||||||
|
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
|
||||||
|
dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
|
||||||
|
dfres = pd.DataFrame(dfdict)
|
||||||
|
dfres.to_csv("predictions_holdout.csv", index=False)
|
||||||
|
##########################################
|
||||||
|
# HOLDOUT FLIPPED
|
||||||
|
##########################################
|
||||||
|
val_datagen = ImageDataGenerator(**norm_params, )
|
||||||
|
val_datagen.preprocessing_function = lambda x: x[...,::-1,:]
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
prms.data_holdout, shuffle=False, **flowfromdir_params)
|
||||||
|
VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
|
||||||
|
print("validation steps", VALIDATION_STEPS)
|
||||||
|
|
||||||
|
yhat = model.predict_generator(datagen_val_output,
|
||||||
|
steps=VALIDATION_STEPS,
|
||||||
|
verbose=1,)
|
||||||
|
|
||||||
|
|
||||||
|
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
|
||||||
|
dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
|
||||||
|
dfres = pd.DataFrame(dfdict)
|
||||||
|
dfres.to_csv("predictions_holdout_fliplr.csv", index=False)
|
||||||
|
#########################################
|
||||||
|
# VAL
|
||||||
|
##########################################
|
||||||
|
val_datagen = ImageDataGenerator(**norm_params, )
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
prms.data_val, shuffle=False, **flowfromdir_params)
|
||||||
|
VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
|
||||||
|
print("validation steps", VALIDATION_STEPS)
|
||||||
|
|
||||||
|
yhat = model.predict_generator(datagen_val_output,
|
||||||
|
steps=VALIDATION_STEPS,
|
||||||
|
verbose=1,)
|
||||||
|
|
||||||
|
|
||||||
|
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
|
||||||
|
dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
|
||||||
|
dfres = pd.DataFrame(dfdict)
|
||||||
|
dfres.to_csv("predictions_test.csv", index=False)
|
||||||
|
#########################################
|
||||||
|
# VAL FLIPPED
|
||||||
|
##########################################
|
||||||
|
val_datagen = ImageDataGenerator(**norm_params, )
|
||||||
|
val_datagen.preprocessing_function = lambda x: x[...,::-1,:]
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
prms.data_val, shuffle=False, **flowfromdir_params)
|
||||||
|
VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
|
||||||
|
print("validation steps", VALIDATION_STEPS)
|
||||||
|
|
||||||
|
yhat = model.predict_generator(datagen_val_output,
|
||||||
|
steps=VALIDATION_STEPS,
|
||||||
|
verbose=1,)
|
||||||
|
|
||||||
|
|
||||||
|
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
|
||||||
|
dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
|
||||||
|
dfres = pd.DataFrame(dfdict)
|
||||||
|
dfres.to_csv("predictions_test_fliplr.csv", index=False)
|
||||||
|
#########################################
|
||||||
|
SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
|
||||||
|
STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
|
||||||
|
|
||||||
|
print('='*50)
|
||||||
|
print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
|
||||||
|
print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
|
||||||
|
print('='*50)
|
||||||
|
if prms.class_weights == 'auto':
|
||||||
|
class_weights = get_class_weights(datagen_val_output)
|
||||||
|
else:
|
||||||
|
class_weights = prms.class_weights
|
||||||
|
|
||||||
|
yhat = model.predict_generator(datagen_train_output,
|
||||||
|
steps=STEPS_PER_EPOCH,
|
||||||
|
verbose=1,)
|
||||||
|
|
||||||
|
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
|
||||||
|
dfdict.update({"files":datagen_train_output.filenames, "label": datagen_train_output.classes})
|
||||||
|
##ipdb.set_trace()
|
||||||
|
dfres = pd.DataFrame(dfdict)
|
||||||
|
dfres.to_csv("predictions_train.csv", index=False)
|
||||||
|
|
||||||
|
#########################################
|
||||||
|
SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
|
||||||
|
STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
|
||||||
|
|
||||||
|
print('='*50)
|
||||||
|
print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
|
||||||
|
print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
|
||||||
|
print('='*50)
|
||||||
|
if prms.class_weights == 'auto':
|
||||||
|
class_weights = get_class_weights(datagen_val_output)
|
||||||
|
else:
|
||||||
|
class_weights = prms.class_weights
|
||||||
|
|
||||||
|
# Score the training images a second time, with the second-to-last array axis
# reversed by the generator's preprocessing hook.
train_datagen.preprocessing_function = lambda x: x[..., ::-1, :]
# A fresh, unshuffled iterator keeps predictions aligned with .filenames.
datagen_train_output = train_datagen.flow_from_directory(
    prms.data_train,
    shuffle=False, **flowfromdir_params)

yhat = model.predict_generator(datagen_train_output,
                               steps=STEPS_PER_EPOCH,
                               verbose=1,)

# One "scores_<k>" column per column of the prediction array.
dfdict = {"scores_%d" % nn: yy for nn, yy in enumerate(yhat.T)}
dfdict.update({"files": datagen_train_output.filenames,
               "label": datagen_train_output.classes})
##ipdb.set_trace()
dfres = pd.DataFrame(dfdict)
# File name corrected from "..._filplr.csv" so that it follows the
# "*_fliplr.csv" naming of the other flipped outputs of this script.
dfres.to_csv("predictions_train_fliplr.csv", index=False)
|
||||||
+50
@@ -0,0 +1,50 @@
|
|||||||
|
ReduceLROnPlateau:
|
||||||
|
cooldown: 8
|
||||||
|
epsilon: 0.001
|
||||||
|
factor: 0.5
|
||||||
|
min_lr: 1.0e-12
|
||||||
|
mode: auto
|
||||||
|
monitor: val_loss
|
||||||
|
patience: 64
|
||||||
|
verbose: 0
|
||||||
|
base_trainable: false
|
||||||
|
batch_size: 16
|
||||||
|
class_mode: categorical
|
||||||
|
class_weights:
|
||||||
|
- 1
|
||||||
|
- 1
|
||||||
|
classes:
|
||||||
|
- normal
|
||||||
|
- wire
|
||||||
|
data_augmentation: true
|
||||||
|
data_everything: /media/exx/tron/2017-07-png-jae/
|
||||||
|
data_holdout: /data/UCSF_MAMMO/2018-02-png/each_class_augm_xw_4189_val/
|
||||||
|
data_train: /data/UCSF_MAMMO/2018-02-png/each_class_4189_train/
|
||||||
|
data_val: /data/UCSF_MAMMO/2018-02-png/each_class_4189_test/
|
||||||
|
dropout: 0.5
|
||||||
|
fill_mode: reflect
|
||||||
|
final_activation: softmax
|
||||||
|
height_shift_range: 0.125
|
||||||
|
horizontal_flip: true
|
||||||
|
init_epoch: 0
|
||||||
|
lr: 0.001
|
||||||
|
n_classes: 2
|
||||||
|
nb_epoch: 500
|
||||||
|
ndense: 0
|
||||||
|
oversampling: false
|
||||||
|
rescale: 1
|
||||||
|
rotation_range: 30
|
||||||
|
samplewise_center: false
|
||||||
|
seed: 2
|
||||||
|
target_side: 299
|
||||||
|
target_size:
|
||||||
|
- 299
|
||||||
|
- 299
|
||||||
|
truncate_quantile: null
|
||||||
|
vertical_flip: false
|
||||||
|
weightfile: model.147-0.000774.hdf5
|
||||||
|
width_shift_range: 0.125
|
||||||
|
zoom_range:
|
||||||
|
- 0.8
|
||||||
|
- 1.2
|
||||||
|
ztransform: true
|
||||||
+398
@@ -0,0 +1,398 @@
|
|||||||
|
import sys
|
||||||
|
import pandas as pd
|
||||||
|
sys.path.append('../..')
|
||||||
|
sys.path.append("/data/dlituiev/kerastrainutils/")
|
||||||
|
|
||||||
|
from inception_short import get_model, get_num_files, get_class_weights
|
||||||
|
from keras.optimizers import Adam
|
||||||
|
from _image import ImageDataGenerator
|
||||||
|
#from keras.preprocessing.image import ImageDataGenerator
|
||||||
|
from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
|
||||||
|
from checkpoint_utils import CSVWallClockLogger
|
||||||
|
from shutil import copy2
|
||||||
|
from losses import acc_0, acc_1, acc_2, acc_3, acc_4
|
||||||
|
|
||||||
|
class AttrDict(dict):
    """A dict whose entries can also be read and written as attributes.

    The instance's attribute namespace is the dict itself, so ``d.key`` and
    ``d["key"]`` always refer to the same value.
    """

    def __init__(self, *args, **kwargs):
        # Populate the mapping exactly as a plain dict would.
        dict.__init__(self, *args, **kwargs)
        # Alias the attribute storage to the mapping.
        self.__dict__ = self
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import yaml
|
||||||
|
import numpy as np
|
||||||
|
import keras
|
||||||
|
from hashlib import md5
|
||||||
|
os.environ["PYTHONHASHSEED"]='0'
|
||||||
|
os.environ['KERAS_BACKEND'] = 'tensorflow'
|
||||||
|
os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
|
||||||
|
os.environ["CUDA_VISIBLE_DEVICES"]="3"
|
||||||
|
|
||||||
|
prms = AttrDict(
|
||||||
|
dropout=0.5,
|
||||||
|
base_trainable=False,
|
||||||
|
horizontal_flip = True,
|
||||||
|
vertical_flip = False,
|
||||||
|
zoom_range = [0.8, 1.2],
|
||||||
|
rotation_range = 30,
|
||||||
|
fill_mode='reflect',
|
||||||
|
ndense=0,
|
||||||
|
batch_size = 16,
|
||||||
|
init_epoch=0,
|
||||||
|
nb_epoch = 500,
|
||||||
|
data_augmentation = True,
|
||||||
|
rescale = 1, #2**-8,
|
||||||
|
#contrast = 0.9,
|
||||||
|
truncate_quantile = None,#0.001,
|
||||||
|
ztransform = True,
|
||||||
|
oversampling = False,
|
||||||
|
#sampling_factor = [1, 4],
|
||||||
|
seed=2,
|
||||||
|
width_shift_range = 0.125,
|
||||||
|
height_shift_range = 0.125,
|
||||||
|
class_mode = 'categorical', # 'binary', #
|
||||||
|
n_classes = 2,
|
||||||
|
final_activation = "softmax", # 'sigmoid',
|
||||||
|
lr = 1e-3,
|
||||||
|
samplewise_center = False, #True
|
||||||
|
target_side = 299,
|
||||||
|
#weights = None,
|
||||||
|
weightfile = "model.147-0.000774.hdf5",
|
||||||
|
data_train = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_train/',
|
||||||
|
data_val = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_test/',
|
||||||
|
data_holdout = "/data/UCSF_MAMMO/2018-02-png/each_class_augm_xw_4189_val/",
|
||||||
|
data_everything = "/media/exx/tron/2017-07-png-jae/",
|
||||||
|
classes = ["normal", "wire"],
|
||||||
|
class_weights=[1, 1],
|
||||||
|
ReduceLROnPlateau = dict(
|
||||||
|
monitor='val_loss',
|
||||||
|
factor=1/2,
|
||||||
|
patience=32*2,
|
||||||
|
verbose=0,
|
||||||
|
mode='auto', epsilon=0.001,
|
||||||
|
cooldown=8,
|
||||||
|
min_lr=1e-12,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
paramhash = md5(str(prms).encode()).hexdigest()
|
||||||
|
|
||||||
|
prms["target_size"] = [ prms.target_side ]*2
|
||||||
|
|
||||||
|
CHECKPOINT_DIR = "checkpoints/" + paramhash + "/"
|
||||||
|
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
|
||||||
|
print("SAVING TO:\t%s" % CHECKPOINT_DIR)
|
||||||
|
# copy the script to the checkpoint directory
|
||||||
|
copy2(os.path.abspath(__file__), CHECKPOINT_DIR)
|
||||||
|
with open(os.path.join(CHECKPOINT_DIR, "checkpoint.info"), "w+") as outfh:
|
||||||
|
yaml.dump(dict(prms), outfh, default_flow_style=False)
|
||||||
|
# w_categorical_crossentropy
|
||||||
|
CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
|
||||||
|
|
||||||
|
SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
|
||||||
|
STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
|
||||||
|
|
||||||
|
print('='*50)
|
||||||
|
print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
|
||||||
|
print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
|
||||||
|
print('='*50)
|
||||||
|
#########################################
|
||||||
|
checkpoint = ModelCheckpoint(CHECKPOINT_PATH, monitor='val_loss', verbose=1,
|
||||||
|
save_best_only=True, save_weights_only=False, mode='auto', period=1)
|
||||||
|
|
||||||
|
csv_path = os.path.join(CHECKPOINT_DIR, "progresslog.csv")
|
||||||
|
csv_callback = CSVWallClockLogger(csv_path, separator=',', append=False)
|
||||||
|
|
||||||
|
prms["loss"] = '{}_crossentropy'.format( prms.class_mode )
|
||||||
|
|
||||||
|
callback_list = [checkpoint, csv_callback]
|
||||||
|
|
||||||
|
|
||||||
|
if ("ReduceLROnPlateau" in prms) and prms["ReduceLROnPlateau"]:
|
||||||
|
callback_list.append(ReduceLROnPlateau(**prms["ReduceLROnPlateau"]))
|
||||||
|
|
||||||
|
#########################################
|
||||||
|
model = get_model(n_classes=prms.n_classes,
|
||||||
|
final_activation=prms.final_activation,
|
||||||
|
ndense=prms.ndense,
|
||||||
|
#weights = prms.weights,
|
||||||
|
dropout=prms.dropout,
|
||||||
|
base_trainable=prms.base_trainable)
|
||||||
|
|
||||||
|
|
||||||
|
#from keras.utils import plot_model
|
||||||
|
#plot_model(model, to_file='model.png')
|
||||||
|
if __name__ == '__main__':
|
||||||
|
model.compile(optimizer=Adam(lr=prms.lr), loss=prms.loss,
|
||||||
|
metrics=['accuracy', acc_0, acc_1,# acc_2, acc_3, acc_4
|
||||||
|
],
|
||||||
|
)
|
||||||
|
#########################################
|
||||||
|
if prms.weightfile:
|
||||||
|
print("loading weights from:\t%s" % prms.weightfile)
|
||||||
|
model.load_weights(prms.weightfile)
|
||||||
|
|
||||||
|
#########################################
|
||||||
|
print('Using real-time data augmentation.')
|
||||||
|
|
||||||
|
flowfromdir_params = dict(
|
||||||
|
#color_mode = "grayscale",
|
||||||
|
target_size=prms.target_size,
|
||||||
|
batch_size=prms.batch_size,
|
||||||
|
class_mode=prms.class_mode,
|
||||||
|
classes=prms.classes,
|
||||||
|
seed=prms.seed)
|
||||||
|
norm_params = dict(
|
||||||
|
rescale=prms.rescale,
|
||||||
|
samplewise_center=prms.samplewise_center,
|
||||||
|
samplewise_std_normalization=prms.samplewise_center,
|
||||||
|
featurewise_center=False,
|
||||||
|
featurewise_std_normalization=False,
|
||||||
|
zca_whitening=False,
|
||||||
|
z_transform = prms.ztransform,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _ztransform(x):
|
||||||
|
return (x-np.mean(x)) / np.std(x)
|
||||||
|
|
||||||
|
if 'preprocessing_function' in prms:
|
||||||
|
if prms.preprocessing_function=='ztransform':
|
||||||
|
preprocessing_function = _ztransform
|
||||||
|
elif prms.preprocessing_function=='m1p1':
|
||||||
|
preprocessing_function = lambda x: x/128.0 - 1
|
||||||
|
else:
|
||||||
|
raise ValueError("unknown preprocessing_function")
|
||||||
|
else:
|
||||||
|
preprocessing_function = lambda x: x
|
||||||
|
|
||||||
|
if prms.data_augmentation:
|
||||||
|
|
||||||
|
print('Using real-time data augmentation.')
|
||||||
|
train_datagen = ImageDataGenerator(
|
||||||
|
zoom_range=prms.zoom_range,
|
||||||
|
fill_mode=prms.fill_mode,
|
||||||
|
rotation_range = prms.rotation_range,
|
||||||
|
width_shift_range = prms.width_shift_range,
|
||||||
|
height_shift_range = prms.height_shift_range,
|
||||||
|
horizontal_flip=prms.horizontal_flip,
|
||||||
|
vertical_flip=prms.vertical_flip,
|
||||||
|
#contrast = prms.contrast if "contrast" in prms else None,
|
||||||
|
#truncate_quantile = prms.truncate_quantile,
|
||||||
|
#histeq_alpha=prms.histeq_alpha,
|
||||||
|
**norm_params)
|
||||||
|
else:
|
||||||
|
train_datagen = ImageDataGenerator(**norm_params)
|
||||||
|
##########################################
|
||||||
|
# Everything
|
||||||
|
##########################################
|
||||||
|
|
||||||
|
val_datagen = ImageDataGenerator(**norm_params)
|
||||||
|
flowfromdir_params['classes'] = [os.path.basename(prms.data_everything.rstrip('/'))]
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
os.path.dirname(prms.data_everything.rstrip('/')),
|
||||||
|
shuffle=False, **flowfromdir_params)
|
||||||
|
|
||||||
|
VALIDATION_STEPS = len(datagen_val_output)
|
||||||
|
pred_fn = "predictions_everything.csv"
|
||||||
|
# Stream per-image scores to a CSV file, one row per image, for a single pass
# over the generator.
with open(pred_fn, 'w+') as fh:
    # header row: file name followed by the score columns
    print("files", *["scores_%d"%ii for ii in range(2)], sep=',', file=fh)
    for ii, batch in enumerate(datagen_val_output):
        # Batches are numbered from 0, so index VALIDATION_STEPS is already
        # past the last batch of the pass. Stopping with ">=" avoids running
        # the model on one extra batch (the previous ">" test let it through).
        # Assumes len(datagen_val_output) is the number of batches per pass --
        # TODO confirm for the project's _image.ImageDataGenerator.
        if ii >= VALIDATION_STEPS:
            break
        yhat = model.predict_on_batch(batch[0])
        # The iterator was created with shuffle=False; batch ii is taken to
        # hold the files at positions [ii*batch_size, (ii+1)*batch_size).
        filenames = datagen_val_output.filenames[ii*prms.batch_size:(ii+1)*prms.batch_size]
        for fnimg, yhat_ in zip(filenames, yhat):
            print(fnimg, *yhat_, sep=',', file = fh)
|
||||||
|
|
||||||
|
##########################################
|
||||||
|
##########################################
|
||||||
|
val_datagen.preprocessing_function = lambda x: x[...,::-1,:]
|
||||||
|
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
os.path.dirname(prms.data_everything.rstrip('/')),
|
||||||
|
shuffle=False, **flowfromdir_params)
|
||||||
|
|
||||||
|
VALIDATION_STEPS = len(datagen_val_output)
|
||||||
|
pred_fn = "predictions_everything_fliplr.csv"
|
||||||
|
# Stream per-image scores for the mirrored images to a CSV file, one row per
# image, for a single pass over the generator.
with open(pred_fn, 'w+') as fh:
    # header row: file name followed by the score columns
    print("files", *["scores_%d"%ii for ii in range(2)], sep=',', file=fh)
    for ii, batch in enumerate(datagen_val_output):
        # Batches are numbered from 0, so index VALIDATION_STEPS is already
        # past the last batch of the pass. Stopping with ">=" avoids running
        # the model on one extra batch (the previous ">" test let it through).
        # Assumes len(datagen_val_output) is the number of batches per pass --
        # TODO confirm for the project's _image.ImageDataGenerator.
        if ii >= VALIDATION_STEPS:
            break
        yhat = model.predict_on_batch(batch[0])
        # The iterator was created with shuffle=False; batch ii is taken to
        # hold the files at positions [ii*batch_size, (ii+1)*batch_size).
        filenames = datagen_val_output.filenames[ii*prms.batch_size:(ii+1)*prms.batch_size]
        for fnimg, yhat_ in zip(filenames, yhat):
            print(fnimg, *yhat_, sep=',', file = fh)
|
||||||
|
|
||||||
|
##########################################
|
||||||
|
val_datagen.preprocessing_function = lambda x: x[...,::-1,:,:]
|
||||||
|
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
os.path.dirname(prms.data_everything.rstrip('/')),
|
||||||
|
shuffle=False, **flowfromdir_params)
|
||||||
|
|
||||||
|
VALIDATION_STEPS = len(datagen_val_output)
|
||||||
|
pred_fn = "predictions_everything_flipud.csv"
|
||||||
|
# Stream per-image scores for the flipped images to a CSV file, one row per
# image, for a single pass over the generator.
with open(pred_fn, 'w+') as fh:
    # header row: file name followed by the score columns
    print("files", *["scores_%d"%ii for ii in range(2)], sep=',', file=fh)
    for ii, batch in enumerate(datagen_val_output):
        # Batches are numbered from 0, so index VALIDATION_STEPS is already
        # past the last batch of the pass. Stopping with ">=" avoids running
        # the model on one extra batch (the previous ">" test let it through).
        # Assumes len(datagen_val_output) is the number of batches per pass --
        # TODO confirm for the project's _image.ImageDataGenerator.
        if ii >= VALIDATION_STEPS:
            break
        yhat = model.predict_on_batch(batch[0])
        # The iterator was created with shuffle=False; batch ii is taken to
        # hold the files at positions [ii*batch_size, (ii+1)*batch_size).
        filenames = datagen_val_output.filenames[ii*prms.batch_size:(ii+1)*prms.batch_size]
        for fnimg, yhat_ in zip(filenames, yhat):
            print(fnimg, *yhat_, sep=',', file = fh)
|
||||||
|
|
||||||
|
##########################################
|
||||||
|
val_datagen.preprocessing_function = lambda x: x[...,::-1,::-1,:]
|
||||||
|
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
os.path.dirname(prms.data_everything.rstrip('/')),
|
||||||
|
shuffle=False, **flowfromdir_params)
|
||||||
|
|
||||||
|
VALIDATION_STEPS = len(datagen_val_output)
|
||||||
|
pred_fn = "predictions_everything_fliplrud.csv"
|
||||||
|
# Stream per-image scores for the doubly flipped images to a CSV file, one
# row per image, for a single pass over the generator.
with open(pred_fn, 'w+') as fh:
    # header row: file name followed by the score columns
    print("files", *["scores_%d"%ii for ii in range(2)], sep=',', file=fh)
    for ii, batch in enumerate(datagen_val_output):
        # Batches are numbered from 0, so index VALIDATION_STEPS is already
        # past the last batch of the pass. Stopping with ">=" avoids running
        # the model on one extra batch (the previous ">" test let it through).
        # Assumes len(datagen_val_output) is the number of batches per pass --
        # TODO confirm for the project's _image.ImageDataGenerator.
        if ii >= VALIDATION_STEPS:
            break
        yhat = model.predict_on_batch(batch[0])
        # The iterator was created with shuffle=False; batch ii is taken to
        # hold the files at positions [ii*batch_size, (ii+1)*batch_size).
        filenames = datagen_val_output.filenames[ii*prms.batch_size:(ii+1)*prms.batch_size]
        for fnimg, yhat_ in zip(filenames, yhat):
            print(fnimg, *yhat_, sep=',', file = fh)
|
||||||
|
##########################################
|
||||||
|
# DONE
|
||||||
|
##########################################
|
||||||
|
# The "everything" predictions above finished normally, so report success
# (status 0) to the caller instead of the failure status 1 used before.
# Nothing after this line executes once it is reached.
sys.exit(0)
|
||||||
|
datagen_train_output = train_datagen.flow_from_directory(
|
||||||
|
prms.data_train,
|
||||||
|
shuffle=False, **flowfromdir_params)
|
||||||
|
|
||||||
|
#VALIDATION_STEPS = get_num_files(prms.data_val) // prms.batch_size
|
||||||
|
|
||||||
|
##########################################
|
||||||
|
# HOLDOUT
|
||||||
|
##########################################
|
||||||
|
val_datagen = ImageDataGenerator(**norm_params)
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
prms.data_holdout, shuffle=False, **flowfromdir_params)
|
||||||
|
VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
|
||||||
|
print("validation steps", VALIDATION_STEPS)
|
||||||
|
|
||||||
|
yhat = model.predict_generator(datagen_val_output,
|
||||||
|
steps=VALIDATION_STEPS,
|
||||||
|
verbose=1,)
|
||||||
|
|
||||||
|
|
||||||
|
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
|
||||||
|
dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
|
||||||
|
dfres = pd.DataFrame(dfdict)
|
||||||
|
dfres.to_csv("predictions_holdout.csv", index=False)
|
||||||
|
##########################################
|
||||||
|
# HOLDOUT FLIPPED
|
||||||
|
##########################################
|
||||||
|
val_datagen = ImageDataGenerator(**norm_params, )
|
||||||
|
val_datagen.preprocessing_function = lambda x: x[...,::-1,:]
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
prms.data_holdout, shuffle=False, **flowfromdir_params)
|
||||||
|
VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
|
||||||
|
print("validation steps", VALIDATION_STEPS)
|
||||||
|
|
||||||
|
yhat = model.predict_generator(datagen_val_output,
|
||||||
|
steps=VALIDATION_STEPS,
|
||||||
|
verbose=1,)
|
||||||
|
|
||||||
|
|
||||||
|
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
|
||||||
|
dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
|
||||||
|
dfres = pd.DataFrame(dfdict)
|
||||||
|
dfres.to_csv("predictions_holdout_fliplr.csv", index=False)
|
||||||
|
#########################################
|
||||||
|
# VAL
|
||||||
|
##########################################
|
||||||
|
val_datagen = ImageDataGenerator(**norm_params, )
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
prms.data_val, shuffle=False, **flowfromdir_params)
|
||||||
|
VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
|
||||||
|
print("validation steps", VALIDATION_STEPS)
|
||||||
|
|
||||||
|
yhat = model.predict_generator(datagen_val_output,
|
||||||
|
steps=VALIDATION_STEPS,
|
||||||
|
verbose=1,)
|
||||||
|
|
||||||
|
|
||||||
|
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
|
||||||
|
dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
|
||||||
|
dfres = pd.DataFrame(dfdict)
|
||||||
|
dfres.to_csv("predictions_test.csv", index=False)
|
||||||
|
#########################################
|
||||||
|
# VAL FLIPPED
|
||||||
|
##########################################
|
||||||
|
val_datagen = ImageDataGenerator(**norm_params, )
|
||||||
|
val_datagen.preprocessing_function = lambda x: x[...,::-1,:]
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
prms.data_val, shuffle=False, **flowfromdir_params)
|
||||||
|
VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
|
||||||
|
print("validation steps", VALIDATION_STEPS)
|
||||||
|
|
||||||
|
yhat = model.predict_generator(datagen_val_output,
|
||||||
|
steps=VALIDATION_STEPS,
|
||||||
|
verbose=1,)
|
||||||
|
|
||||||
|
|
||||||
|
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
|
||||||
|
dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
|
||||||
|
dfres = pd.DataFrame(dfdict)
|
||||||
|
dfres.to_csv("predictions_test_fliplr.csv", index=False)
|
||||||
|
#########################################
|
||||||
|
SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
|
||||||
|
STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
|
||||||
|
|
||||||
|
print('='*50)
|
||||||
|
print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
|
||||||
|
print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
|
||||||
|
print('='*50)
|
||||||
|
if prms.class_weights == 'auto':
|
||||||
|
class_weights = get_class_weights(datagen_val_output)
|
||||||
|
else:
|
||||||
|
class_weights = prms.class_weights
|
||||||
|
|
||||||
|
yhat = model.predict_generator(datagen_train_output,
|
||||||
|
steps=STEPS_PER_EPOCH,
|
||||||
|
verbose=1,)
|
||||||
|
|
||||||
|
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
|
||||||
|
dfdict.update({"files":datagen_train_output.filenames, "label": datagen_train_output.classes})
|
||||||
|
##ipdb.set_trace()
|
||||||
|
dfres = pd.DataFrame(dfdict)
|
||||||
|
dfres.to_csv("predictions_train.csv", index=False)
|
||||||
|
|
||||||
|
#########################################
|
||||||
|
SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
|
||||||
|
STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
|
||||||
|
|
||||||
|
print('='*50)
|
||||||
|
print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
|
||||||
|
print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
|
||||||
|
print('='*50)
|
||||||
|
if prms.class_weights == 'auto':
|
||||||
|
class_weights = get_class_weights(datagen_val_output)
|
||||||
|
else:
|
||||||
|
class_weights = prms.class_weights
|
||||||
|
|
||||||
|
train_datagen.preprocessing_function = lambda x: x[...,::-1,:]
|
||||||
|
datagen_train_output = train_datagen.flow_from_directory(
|
||||||
|
prms.data_train,
|
||||||
|
shuffle=False, **flowfromdir_params)
|
||||||
|
|
||||||
|
yhat = model.predict_generator(datagen_train_output,
|
||||||
|
steps=STEPS_PER_EPOCH,
|
||||||
|
verbose=1,)
|
||||||
|
|
||||||
|
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
|
||||||
|
dfdict.update({"files":datagen_train_output.filenames, "label": datagen_train_output.classes})
|
||||||
|
##ipdb.set_trace()
|
||||||
|
dfres = pd.DataFrame(dfdict)
|
||||||
|
dfres.to_csv("predictions_train_fliplr.csv", index=False)
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
../inception_short.py
|
||||||
+398
@@ -0,0 +1,398 @@
|
|||||||
|
import sys
|
||||||
|
import pandas as pd
|
||||||
|
sys.path.append('../..')
|
||||||
|
sys.path.append("/data/dlituiev/kerastrainutils/")
|
||||||
|
|
||||||
|
from inception_short import get_model, get_num_files, get_class_weights
|
||||||
|
from keras.optimizers import Adam
|
||||||
|
from _image import ImageDataGenerator
|
||||||
|
#from keras.preprocessing.image import ImageDataGenerator
|
||||||
|
from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
|
||||||
|
from checkpoint_utils import CSVWallClockLogger
|
||||||
|
from shutil import copy2
|
||||||
|
from losses import acc_0, acc_1, acc_2, acc_3, acc_4
|
||||||
|
|
||||||
|
class AttrDict(dict):
    """A dict whose entries can also be read and written as attributes.

    The instance's attribute namespace is the dict itself, so ``d.key`` and
    ``d["key"]`` always refer to the same value.
    """

    def __init__(self, *args, **kwargs):
        # Populate the mapping exactly as a plain dict would.
        dict.__init__(self, *args, **kwargs)
        # Alias the attribute storage to the mapping.
        self.__dict__ = self
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import yaml
|
||||||
|
import numpy as np
|
||||||
|
import keras
|
||||||
|
from hashlib import md5
|
||||||
|
os.environ["PYTHONHASHSEED"]='0'
|
||||||
|
os.environ['KERAS_BACKEND'] = 'tensorflow'
|
||||||
|
os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
|
||||||
|
os.environ["CUDA_VISIBLE_DEVICES"]="3"
|
||||||
|
|
||||||
|
prms = AttrDict(
|
||||||
|
dropout=0.5,
|
||||||
|
base_trainable=False,
|
||||||
|
horizontal_flip = True,
|
||||||
|
vertical_flip = False,
|
||||||
|
zoom_range = [0.8, 1.2],
|
||||||
|
rotation_range = 30,
|
||||||
|
fill_mode='reflect',
|
||||||
|
ndense=0,
|
||||||
|
batch_size = 16,
|
||||||
|
init_epoch=0,
|
||||||
|
nb_epoch = 500,
|
||||||
|
data_augmentation = True,
|
||||||
|
rescale = 1, #2**-8,
|
||||||
|
#contrast = 0.9,
|
||||||
|
truncate_quantile = None,#0.001,
|
||||||
|
ztransform = True,
|
||||||
|
oversampling = False,
|
||||||
|
#sampling_factor = [1, 4],
|
||||||
|
seed=2,
|
||||||
|
width_shift_range = 0.125,
|
||||||
|
height_shift_range = 0.125,
|
||||||
|
class_mode = 'categorical', # 'binary', #
|
||||||
|
n_classes = 2,
|
||||||
|
final_activation = "softmax", # 'sigmoid',
|
||||||
|
lr = 1e-3,
|
||||||
|
samplewise_center = False, #True
|
||||||
|
target_side = 299,
|
||||||
|
#weights = None,
|
||||||
|
weightfile = "model.147-0.000774.hdf5",
|
||||||
|
data_train = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_train/',
|
||||||
|
data_val = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_test/',
|
||||||
|
data_holdout = "/data/UCSF_MAMMO/2018-02-png/each_class_augm_xw_4189_val/",
|
||||||
|
data_everything = "/media/exx/tron/2017-07-png-jae/",
|
||||||
|
classes = ["normal", "wire"],
|
||||||
|
class_weights=[1, 1],
|
||||||
|
ReduceLROnPlateau = dict(
|
||||||
|
monitor='val_loss',
|
||||||
|
factor=1/2,
|
||||||
|
patience=32*2,
|
||||||
|
verbose=0,
|
||||||
|
mode='auto', epsilon=0.001,
|
||||||
|
cooldown=8,
|
||||||
|
min_lr=1e-12,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
paramhash = md5(str(prms).encode()).hexdigest()
|
||||||
|
|
||||||
|
prms["target_size"] = [ prms.target_side ]*2
|
||||||
|
|
||||||
|
CHECKPOINT_DIR = "checkpoints/" + paramhash + "/"
|
||||||
|
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
|
||||||
|
print("SAVING TO:\t%s" % CHECKPOINT_DIR)
|
||||||
|
# copy the script to the checkpoint directory
|
||||||
|
copy2(os.path.abspath(__file__), CHECKPOINT_DIR)
|
||||||
|
with open(os.path.join(CHECKPOINT_DIR, "checkpoint.info"), "w+") as outfh:
|
||||||
|
yaml.dump(dict(prms), outfh, default_flow_style=False)
|
||||||
|
# w_categorical_crossentropy
|
||||||
|
CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
|
||||||
|
|
||||||
|
SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
|
||||||
|
STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
|
||||||
|
|
||||||
|
print('='*50)
|
||||||
|
print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
|
||||||
|
print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
|
||||||
|
print('='*50)
|
||||||
|
#########################################
|
||||||
|
checkpoint = ModelCheckpoint(CHECKPOINT_PATH, monitor='val_loss', verbose=1,
|
||||||
|
save_best_only=True, save_weights_only=False, mode='auto', period=1)
|
||||||
|
|
||||||
|
csv_path = os.path.join(CHECKPOINT_DIR, "progresslog.csv")
|
||||||
|
csv_callback = CSVWallClockLogger(csv_path, separator=',', append=False)
|
||||||
|
|
||||||
|
prms["loss"] = '{}_crossentropy'.format( prms.class_mode )
|
||||||
|
|
||||||
|
callback_list = [checkpoint, csv_callback]
|
||||||
|
|
||||||
|
|
||||||
|
if ("ReduceLROnPlateau" in prms) and prms["ReduceLROnPlateau"]:
|
||||||
|
callback_list.append(ReduceLROnPlateau(**prms["ReduceLROnPlateau"]))
|
||||||
|
|
||||||
|
#########################################
|
||||||
|
model = get_model(n_classes=prms.n_classes,
|
||||||
|
final_activation=prms.final_activation,
|
||||||
|
ndense=prms.ndense,
|
||||||
|
#weights = prms.weights,
|
||||||
|
dropout=prms.dropout,
|
||||||
|
base_trainable=prms.base_trainable)
|
||||||
|
|
||||||
|
|
||||||
|
#from keras.utils import plot_model
|
||||||
|
#plot_model(model, to_file='model.png')
|
||||||
|
if __name__ == '__main__':
|
||||||
|
model.compile(optimizer=Adam(lr=prms.lr), loss=prms.loss,
|
||||||
|
metrics=['accuracy', acc_0, acc_1,# acc_2, acc_3, acc_4
|
||||||
|
],
|
||||||
|
)
|
||||||
|
#########################################
|
||||||
|
if prms.weightfile:
|
||||||
|
print("loading weights from:\t%s" % prms.weightfile)
|
||||||
|
model.load_weights(prms.weightfile)
|
||||||
|
|
||||||
|
#########################################
|
||||||
|
print('Using real-time data augmentation.')
|
||||||
|
|
||||||
|
flowfromdir_params = dict(
|
||||||
|
#color_mode = "grayscale",
|
||||||
|
target_size=prms.target_size,
|
||||||
|
batch_size=prms.batch_size,
|
||||||
|
class_mode=prms.class_mode,
|
||||||
|
classes=prms.classes,
|
||||||
|
seed=prms.seed)
|
||||||
|
norm_params = dict(
|
||||||
|
rescale=prms.rescale,
|
||||||
|
samplewise_center=prms.samplewise_center,
|
||||||
|
samplewise_std_normalization=prms.samplewise_center,
|
||||||
|
featurewise_center=False,
|
||||||
|
featurewise_std_normalization=False,
|
||||||
|
zca_whitening=False,
|
||||||
|
z_transform = prms.ztransform,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _ztransform(x):
|
||||||
|
return (x-np.mean(x)) / np.std(x)
|
||||||
|
|
||||||
|
if 'preprocessing_function' in prms:
|
||||||
|
if prms.preprocessing_function=='ztransform':
|
||||||
|
preprocessing_function = _ztransform
|
||||||
|
elif prms.preprocessing_function=='m1p1':
|
||||||
|
preprocessing_function = lambda x: x/128.0 - 1
|
||||||
|
else:
|
||||||
|
raise ValueError("unknown preprocessing_function")
|
||||||
|
else:
|
||||||
|
preprocessing_function = lambda x: x
|
||||||
|
|
||||||
|
if prms.data_augmentation:
|
||||||
|
|
||||||
|
print('Using real-time data augmentation.')
|
||||||
|
train_datagen = ImageDataGenerator(
|
||||||
|
zoom_range=prms.zoom_range,
|
||||||
|
fill_mode=prms.fill_mode,
|
||||||
|
rotation_range = prms.rotation_range,
|
||||||
|
width_shift_range = prms.width_shift_range,
|
||||||
|
height_shift_range = prms.height_shift_range,
|
||||||
|
horizontal_flip=prms.horizontal_flip,
|
||||||
|
vertical_flip=prms.vertical_flip,
|
||||||
|
#contrast = prms.contrast if "contrast" in prms else None,
|
||||||
|
#truncate_quantile = prms.truncate_quantile,
|
||||||
|
#histeq_alpha=prms.histeq_alpha,
|
||||||
|
**norm_params)
|
||||||
|
else:
|
||||||
|
train_datagen = ImageDataGenerator(**norm_params)
|
||||||
|
##########################################
|
||||||
|
# Everything
|
||||||
|
##########################################
|
||||||
|
|
||||||
|
val_datagen = ImageDataGenerator(**norm_params)
|
||||||
|
flowfromdir_params['classes'] = [os.path.basename(prms.data_everything.rstrip('/'))]
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
os.path.dirname(prms.data_everything.rstrip('/')),
|
||||||
|
shuffle=False, **flowfromdir_params)
|
||||||
|
|
||||||
|
VALIDATION_STEPS = len(datagen_val_output)
|
||||||
|
pred_fn = "predictions_everything.csv"
|
||||||
|
# Stream per-image scores to a CSV file, one row per image, for a single pass
# over the generator.
with open(pred_fn, 'w+') as fh:
    # header row: file name followed by the score columns
    print("files", *["scores_%d"%ii for ii in range(2)], sep=',', file=fh)
    for ii, batch in enumerate(datagen_val_output):
        # Batches are numbered from 0, so index VALIDATION_STEPS is already
        # past the last batch of the pass. Stopping with ">=" avoids running
        # the model on one extra batch (the previous ">" test let it through).
        # Assumes len(datagen_val_output) is the number of batches per pass --
        # TODO confirm for the project's _image.ImageDataGenerator.
        if ii >= VALIDATION_STEPS:
            break
        yhat = model.predict_on_batch(batch[0])
        # The iterator was created with shuffle=False; batch ii is taken to
        # hold the files at positions [ii*batch_size, (ii+1)*batch_size).
        filenames = datagen_val_output.filenames[ii*prms.batch_size:(ii+1)*prms.batch_size]
        for fnimg, yhat_ in zip(filenames, yhat):
            print(fnimg, *yhat_, sep=',', file = fh)
|
||||||
|
|
||||||
|
##########################################
|
||||||
|
##########################################
|
||||||
|
val_datagen.preprocessing_function = lambda x: x[...,::-1,:]
|
||||||
|
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
os.path.dirname(prms.data_everything.rstrip('/')),
|
||||||
|
shuffle=False, **flowfromdir_params)
|
||||||
|
|
||||||
|
VALIDATION_STEPS = len(datagen_val_output)
|
||||||
|
pred_fn = "predictions_everything_fliplr.csv"
|
||||||
|
# Stream per-image scores for the mirrored images to a CSV file, one row per
# image, for a single pass over the generator.
with open(pred_fn, 'w+') as fh:
    # header row: file name followed by the score columns
    print("files", *["scores_%d"%ii for ii in range(2)], sep=',', file=fh)
    for ii, batch in enumerate(datagen_val_output):
        # Batches are numbered from 0, so index VALIDATION_STEPS is already
        # past the last batch of the pass. Stopping with ">=" avoids running
        # the model on one extra batch (the previous ">" test let it through).
        # Assumes len(datagen_val_output) is the number of batches per pass --
        # TODO confirm for the project's _image.ImageDataGenerator.
        if ii >= VALIDATION_STEPS:
            break
        yhat = model.predict_on_batch(batch[0])
        # The iterator was created with shuffle=False; batch ii is taken to
        # hold the files at positions [ii*batch_size, (ii+1)*batch_size).
        filenames = datagen_val_output.filenames[ii*prms.batch_size:(ii+1)*prms.batch_size]
        for fnimg, yhat_ in zip(filenames, yhat):
            print(fnimg, *yhat_, sep=',', file = fh)
|
||||||
|
|
||||||
|
##########################################
|
||||||
|
val_datagen.preprocessing_function = lambda x: x[...,::-1,:,:]
|
||||||
|
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
os.path.dirname(prms.data_everything.rstrip('/')),
|
||||||
|
shuffle=False, **flowfromdir_params)
|
||||||
|
|
||||||
|
VALIDATION_STEPS = len(datagen_val_output)
|
||||||
|
pred_fn = "predictions_everything_flipud.csv"
|
||||||
|
# Stream per-image scores for the flipped images to a CSV file, one row per
# image, for a single pass over the generator.
with open(pred_fn, 'w+') as fh:
    # header row: file name followed by the score columns
    print("files", *["scores_%d"%ii for ii in range(2)], sep=',', file=fh)
    for ii, batch in enumerate(datagen_val_output):
        # Batches are numbered from 0, so index VALIDATION_STEPS is already
        # past the last batch of the pass. Stopping with ">=" avoids running
        # the model on one extra batch (the previous ">" test let it through).
        # Assumes len(datagen_val_output) is the number of batches per pass --
        # TODO confirm for the project's _image.ImageDataGenerator.
        if ii >= VALIDATION_STEPS:
            break
        yhat = model.predict_on_batch(batch[0])
        # The iterator was created with shuffle=False; batch ii is taken to
        # hold the files at positions [ii*batch_size, (ii+1)*batch_size).
        filenames = datagen_val_output.filenames[ii*prms.batch_size:(ii+1)*prms.batch_size]
        for fnimg, yhat_ in zip(filenames, yhat):
            print(fnimg, *yhat_, sep=',', file = fh)
|
||||||
|
|
||||||
|
##########################################
|
||||||
|
val_datagen.preprocessing_function = lambda x: x[...,::-1,::-1,:]
|
||||||
|
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
os.path.dirname(prms.data_everything.rstrip('/')),
|
||||||
|
shuffle=False, **flowfromdir_params)
|
||||||
|
|
||||||
|
VALIDATION_STEPS = len(datagen_val_output)
|
||||||
|
pred_fn = "predictions_everything_fliplrud.csv"
|
||||||
|
# Stream per-image scores for the doubly flipped images to a CSV file, one
# row per image, for a single pass over the generator.
with open(pred_fn, 'w+') as fh:
    # header row: file name followed by the score columns
    print("files", *["scores_%d"%ii for ii in range(2)], sep=',', file=fh)
    for ii, batch in enumerate(datagen_val_output):
        # Batches are numbered from 0, so index VALIDATION_STEPS is already
        # past the last batch of the pass. Stopping with ">=" avoids running
        # the model on one extra batch (the previous ">" test let it through).
        # Assumes len(datagen_val_output) is the number of batches per pass --
        # TODO confirm for the project's _image.ImageDataGenerator.
        if ii >= VALIDATION_STEPS:
            break
        yhat = model.predict_on_batch(batch[0])
        # The iterator was created with shuffle=False; batch ii is taken to
        # hold the files at positions [ii*batch_size, (ii+1)*batch_size).
        filenames = datagen_val_output.filenames[ii*prms.batch_size:(ii+1)*prms.batch_size]
        for fnimg, yhat_ in zip(filenames, yhat):
            print(fnimg, *yhat_, sep=',', file = fh)
|
||||||
|
##########################################
|
||||||
|
# DONE
|
||||||
|
##########################################
|
||||||
|
# The "everything" predictions above finished normally, so report success
# (status 0) to the caller instead of the failure status 1 used before.
# Nothing after this line executes once it is reached.
sys.exit(0)
|
||||||
|
datagen_train_output = train_datagen.flow_from_directory(
|
||||||
|
prms.data_train,
|
||||||
|
shuffle=False, **flowfromdir_params)
|
||||||
|
|
||||||
|
#VALIDATION_STEPS = get_num_files(prms.data_val) // prms.batch_size
|
||||||
|
|
||||||
|
##########################################
|
||||||
|
# HOLDOUT
|
||||||
|
##########################################
|
||||||
|
val_datagen = ImageDataGenerator(**norm_params)
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
prms.data_holdout, shuffle=False, **flowfromdir_params)
|
||||||
|
VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
|
||||||
|
print("validation steps", VALIDATION_STEPS)
|
||||||
|
|
||||||
|
yhat = model.predict_generator(datagen_val_output,
|
||||||
|
steps=VALIDATION_STEPS,
|
||||||
|
verbose=1,)
|
||||||
|
|
||||||
|
|
||||||
|
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
|
||||||
|
dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
|
||||||
|
dfres = pd.DataFrame(dfdict)
|
||||||
|
dfres.to_csv("predictions_holdout.csv", index=False)
|
||||||
|
##########################################
|
||||||
|
# HOLDOUT FLIPPED
|
||||||
|
##########################################
|
||||||
|
val_datagen = ImageDataGenerator(**norm_params, )
|
||||||
|
val_datagen.preprocessing_function = lambda x: x[...,::-1,:]
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
prms.data_holdout, shuffle=False, **flowfromdir_params)
|
||||||
|
VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
|
||||||
|
print("validation steps", VALIDATION_STEPS)
|
||||||
|
|
||||||
|
yhat = model.predict_generator(datagen_val_output,
|
||||||
|
steps=VALIDATION_STEPS,
|
||||||
|
verbose=1,)
|
||||||
|
|
||||||
|
|
||||||
|
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
|
||||||
|
dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
|
||||||
|
dfres = pd.DataFrame(dfdict)
|
||||||
|
dfres.to_csv("predictions_holdout_fliplr.csv", index=False)
|
||||||
|
#########################################
|
||||||
|
# VAL
|
||||||
|
##########################################
|
||||||
|
val_datagen = ImageDataGenerator(**norm_params, )
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
prms.data_val, shuffle=False, **flowfromdir_params)
|
||||||
|
VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
|
||||||
|
print("validation steps", VALIDATION_STEPS)
|
||||||
|
|
||||||
|
yhat = model.predict_generator(datagen_val_output,
|
||||||
|
steps=VALIDATION_STEPS,
|
||||||
|
verbose=1,)
|
||||||
|
|
||||||
|
|
||||||
|
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
|
||||||
|
dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
|
||||||
|
dfres = pd.DataFrame(dfdict)
|
||||||
|
dfres.to_csv("predictions_test.csv", index=False)
|
||||||
|
#########################################
|
||||||
|
# VAL FLIPPED
|
||||||
|
##########################################
|
||||||
|
val_datagen = ImageDataGenerator(**norm_params, )
|
||||||
|
val_datagen.preprocessing_function = lambda x: x[...,::-1,:]
|
||||||
|
datagen_val_output = val_datagen.flow_from_directory(
|
||||||
|
prms.data_val, shuffle=False, **flowfromdir_params)
|
||||||
|
VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
|
||||||
|
print("validation steps", VALIDATION_STEPS)
|
||||||
|
|
||||||
|
yhat = model.predict_generator(datagen_val_output,
|
||||||
|
steps=VALIDATION_STEPS,
|
||||||
|
verbose=1,)
|
||||||
|
|
||||||
|
|
||||||
|
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
|
||||||
|
dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
|
||||||
|
dfres = pd.DataFrame(dfdict)
|
||||||
|
dfres.to_csv("predictions_test_fliplr.csv", index=False)
|
||||||
|
#########################################
|
||||||
|
SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
|
||||||
|
STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
|
||||||
|
|
||||||
|
print('='*50)
|
||||||
|
print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
|
||||||
|
print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
|
||||||
|
print('='*50)
|
||||||
|
if prms.class_weights == 'auto':
|
||||||
|
class_weights = get_class_weights(datagen_val_output)
|
||||||
|
else:
|
||||||
|
class_weights = prms.class_weights
|
||||||
|
|
||||||
|
yhat = model.predict_generator(datagen_train_output,
|
||||||
|
steps=STEPS_PER_EPOCH,
|
||||||
|
verbose=1,)
|
||||||
|
|
||||||
|
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
|
||||||
|
dfdict.update({"files":datagen_train_output.filenames, "label": datagen_train_output.classes})
|
||||||
|
##ipdb.set_trace()
|
||||||
|
dfres = pd.DataFrame(dfdict)
|
||||||
|
dfres.to_csv("predictions_train.csv", index=False)
|
||||||
|
|
||||||
|
#########################################
|
||||||
|
SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
|
||||||
|
STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
|
||||||
|
|
||||||
|
print('='*50)
|
||||||
|
print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
|
||||||
|
print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
|
||||||
|
print('='*50)
|
||||||
|
if prms.class_weights == 'auto':
|
||||||
|
class_weights = get_class_weights(datagen_val_output)
|
||||||
|
else:
|
||||||
|
class_weights = prms.class_weights
|
||||||
|
|
||||||
|
train_datagen.preprocessing_function = lambda x: x[...,::-1,:]
|
||||||
|
datagen_train_output = train_datagen.flow_from_directory(
|
||||||
|
prms.data_train,
|
||||||
|
shuffle=False, **flowfromdir_params)
|
||||||
|
|
||||||
|
yhat = model.predict_generator(datagen_train_output,
|
||||||
|
steps=STEPS_PER_EPOCH,
|
||||||
|
verbose=1,)
|
||||||
|
|
||||||
|
dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
|
||||||
|
dfdict.update({"files":datagen_train_output.filenames, "label": datagen_train_output.classes})
|
||||||
|
##ipdb.set_trace()
|
||||||
|
dfres = pd.DataFrame(dfdict)
|
||||||
|
dfres.to_csv("predictions_train_fliplr.csv", index=False)
|
||||||
+224
@@ -0,0 +1,224 @@
|
|||||||
|
from inception_short import get_model, get_num_files, get_class_weights
|
||||||
|
from keras.optimizers import Adam
|
||||||
|
from image import ImageDataGenerator
|
||||||
|
#from keras.preprocessing.image import ImageDataGenerator
|
||||||
|
from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
|
||||||
|
from checkpoint_utils import CSVWallClockLogger
|
||||||
|
from shutil import copy2
|
||||||
|
|
||||||
|
class AttrDict(dict):
    """Dictionary whose items can also be read and written as attributes.

    The instance ``__dict__`` is aliased to the mapping itself, so
    ``d.key`` and ``d["key"]`` always refer to the same entry.
    """

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        # Point attribute storage at the mapping so both views stay in sync.
        self.__dict__ = self
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import yaml
|
||||||
|
import numpy as np
|
||||||
|
import keras
|
||||||
|
from hashlib import md5
|
||||||
|
os.environ["PYTHONHASHSEED"]='0'
|
||||||
|
os.environ['KERAS_BACKEND'] = 'tensorflow'
|
||||||
|
os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
|
||||||
|
os.environ["CUDA_VISIBLE_DEVICES"]="1"
|
||||||
|
|
||||||
|
# Hyper-parameters of the run.  The key order and values below feed the md5
# hash that names the checkpoint directory, so they are kept exactly as is.
prms = AttrDict(
    dropout=0.5,
    base_trainable=False,
    horizontal_flip=True,
    vertical_flip=False,
    zoom_range=[0.8, 1.2],
    rotation_range=30,
    fill_mode='reflect',
    ndense=0,
    batch_size=16,
    init_epoch=0,
    nb_epoch=500,
    data_augmentation=True,
    rescale=1,  # 2**-8,
    # contrast = 0.9,
    truncate_quantile=None,  # 0.001,
    ztransform=True,
    oversampling=False,
    # sampling_factor = [1, 4],
    seed=1,
    width_shift_range=0.125,
    height_shift_range=0.125,
    class_mode='categorical',  # 'binary', #
    n_classes=2,
    final_activation="softmax",  # 'sigmoid',
    lr=1e-3,
    samplewise_center=False,  # True
    target_side=299,
    # weights = None,
    weightfile=None,  # "checkpoints/6a1a17e4bcaabe458c145fd64dec0322/model.31-1.290145.hdf5",
    # "checkpoints/6a1a17e4bcaabe458c145fd64dec0322/model.59-1.676424.hdf5",
    data_train='/data/UCSF_MAMMO/2018-02-png/each_class_4189_train/',
    data_val='/data/UCSF_MAMMO/2018-02-png/each_class_4189_test/',
    classes=["normal", "wire"],
    class_weights=[1, 1],
    ReduceLROnPlateau=dict(
        monitor='val_loss',
        factor=0.5,
        patience=64,
        verbose=0,
        mode='auto',
        epsilon=0.001,
        cooldown=8,
        min_lr=1e-12,
    ),
)

# The hash is taken before the derived entries (target_size, loss) are added,
# so it identifies the hand-written configuration only.
paramhash = md5(str(prms).encode()).hexdigest()

prms["target_size"] = [prms.target_side, prms.target_side]

# Each configuration gets its own checkpoint directory, holding a copy of
# this script and a YAML dump of the parameters.
CHECKPOINT_DIR = "checkpoints/" + paramhash + "/"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
print("SAVING TO:\t%s" % CHECKPOINT_DIR)
copy2(os.path.abspath(__file__), CHECKPOINT_DIR)
info_path = os.path.join(CHECKPOINT_DIR, "checkpoint.info")
with open(info_path, "w+") as outfh:
    yaml.dump(dict(prms), outfh, default_flow_style=False)

# Added after the dump: the loss name is derived from class_mode.
prms["loss"] = '{}_crossentropy'.format(prms.class_mode)

CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR,
                               'model.{epoch:02d}-{val_loss:2f}.hdf5')

SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size

print('=' * 50)
print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
print('=' * 50)

#########################################
# Callbacks: best-model checkpointing plus a CSV progress log.
checkpoint = ModelCheckpoint(
    CHECKPOINT_PATH,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
    period=1,
)

csv_path = os.path.join(CHECKPOINT_DIR, "progresslog.csv")
csv_callback = CSVWallClockLogger(csv_path, separator=',', append=False)

callback_list = [checkpoint, csv_callback]

# Learning-rate reduction is optional: enabled only when the entry exists
# and is non-empty.
if ("ReduceLROnPlateau" in prms) and prms["ReduceLROnPlateau"]:
    plateau_callback = ReduceLROnPlateau(**prms["ReduceLROnPlateau"])
    callback_list.append(plateau_callback)

#########################################
model = get_model(
    n_classes=prms.n_classes,
    final_activation=prms.final_activation,
    ndense=prms.ndense,
    # weights = prms.weights,
    dropout=prms.dropout,
    base_trainable=prms.base_trainable,
)
|
||||||
|
|
||||||
|
|
||||||
|
#from keras.utils import plot_model
|
||||||
|
#plot_model(model, to_file='model.png')
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Training entry point: compile the model, build the train/validation
    # generators, fit, then report loss and accuracy on the validation set.
    model.compile(optimizer=Adam(lr=prms.lr), loss=prms.loss,
                  metrics=['accuracy',  # acc_0, acc_1,# acc_2, acc_3, acc_4
                           ],
                  )
    #########################################
    if prms.weightfile:
        print("loading weights from:\t%s" % prms.weightfile)
        model.load_weights(prms.weightfile)

    #########################################
    # Arguments shared by every flow_from_directory call below.
    flowfromdir_params = dict(
        #color_mode = "grayscale",
        target_size=prms.target_size,
        batch_size=prms.batch_size,
        class_mode=prms.class_mode,
        classes=prms.classes,
        seed=prms.seed)
    # Normalisation settings shared by the train and validation generators.
    # samplewise_std_normalization deliberately follows the samplewise_center
    # switch, as in the original script.
    norm_params = dict(
        rescale=prms.rescale,
        samplewise_center=prms.samplewise_center,
        samplewise_std_normalization=prms.samplewise_center,
        featurewise_center=False,
        featurewise_std_normalization=False,
        zca_whitening=False,
        z_transform=prms.ztransform,
        )

    def _ztransform(x):
        # Standardise one array to zero mean and unit standard deviation.
        return (x - np.mean(x)) / np.std(x)

    # NOTE(review): preprocessing_function is resolved here but is not passed
    # to any generator in this block; it only validates the configured name.
    if 'preprocessing_function' in prms:
        if prms.preprocessing_function == 'ztransform':
            preprocessing_function = _ztransform
        elif prms.preprocessing_function == 'm1p1':
            preprocessing_function = lambda x: x/128.0 - 1
        else:
            raise ValueError("unknown preprocessing_function")
    else:
        preprocessing_function = lambda x: x

    if prms.data_augmentation:
        # Announce augmentation only when it is actually enabled (the message
        # used to be printed unconditionally as well).
        print('Using real-time data augmentation.')
        train_datagen = ImageDataGenerator(
            zoom_range=prms.zoom_range,
            fill_mode=prms.fill_mode,
            rotation_range=prms.rotation_range,
            width_shift_range=prms.width_shift_range,
            height_shift_range=prms.height_shift_range,
            horizontal_flip=prms.horizontal_flip,
            vertical_flip=prms.vertical_flip,
            contrast=prms.contrast if "contrast" in prms else None,
            truncate_quantile=prms.truncate_quantile,
            #histeq_alpha=prms.histeq_alpha,
            **norm_params)
    else:
        train_datagen = ImageDataGenerator(**norm_params)

    val_datagen = ImageDataGenerator(**norm_params)

    datagen_train_output = train_datagen.flow_from_directory(
        prms.data_train,
        stratify=prms.oversampling,
        # sampling_factor is only read from prms when oversampling is on.
        sampling_factor=prms.sampling_factor if prms.oversampling else None,
        oversampling=prms.oversampling,
        shuffle=True, **flowfromdir_params)

    datagen_val_output = val_datagen.flow_from_directory(
        prms.data_val, shuffle=False, **flowfromdir_params)

    #VALIDATION_STEPS = get_num_files(prms.data_val) // prms.batch_size
    # Round up so the final partial batch is included, and convert to a plain
    # int: np.ceil returns a float, while a step count is an integer.
    VALIDATION_STEPS = int(np.ceil(
        len(datagen_val_output.filenames) / prms['batch_size']))
    print("validation steps", VALIDATION_STEPS)
    #########################################
    # NOTE(review): 'auto' derives the weights from the validation generator,
    # not the training one -- confirm that this is intended.
    if prms.class_weights == 'auto':
        class_weights = get_class_weights(datagen_val_output)
    else:
        class_weights = prms.class_weights

    model.fit_generator(datagen_train_output,
                        steps_per_epoch=STEPS_PER_EPOCH,
                        epochs=prms.nb_epoch, verbose=1,
                        validation_data=datagen_val_output,
                        validation_steps=VALIDATION_STEPS,
                        #class_weight='auto',
                        class_weight=class_weights,
                        callbacks=callback_list,
                        initial_epoch=prms.init_epoch)

    # A new validation iterator is created for the final evaluation
    # (presumably so it starts again from the first batch).
    datagen_val_output = val_datagen.flow_from_directory(
        prms.data_val, shuffle=False, **flowfromdir_params)

    print("loss\t%.4f\naccuracy\t%.4f\n" %
          tuple(model.evaluate_generator(datagen_val_output,
                                         steps=VALIDATION_STEPS,
                                         workers=1,
                                         pickle_safe=True)))
|
||||||
|
|
||||||
|
|
||||||
|
#model.predict()
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,245 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Fri Jun 9 11:00:55 2017
|
||||||
|
|
||||||
|
@author: dlituiev
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from collections import Counter
|
||||||
|
from functools import partial
|
||||||
|
from itertools import product
|
||||||
|
|
||||||
|
import keras
|
||||||
|
from keras.applications.inception_v3 import InceptionV3
|
||||||
|
from keras.preprocessing import image
|
||||||
|
from keras.models import Model
|
||||||
|
from keras.layers import Dense, GlobalAveragePooling2D, GaussianNoise, Input
|
||||||
|
from keras import backend as K
|
||||||
|
from keras.preprocessing.image import ImageDataGenerator
|
||||||
|
from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping
|
||||||
|
from keras.layers import Dense, Dropout, Activation, Flatten, Lambda, BatchNormalization, Input
|
||||||
|
from keras.optimizers import Adam
|
||||||
|
|
||||||
|
#########################################
|
||||||
|
def get_num_files(parentdir):
    """Count the entries one level below ``parentdir``.

    Every immediate subdirectory of ``parentdir`` (e.g. one per class) is
    listed and the number of entries it contains is added up.  Entries that
    sit directly in ``parentdir`` are not counted, and nested directories
    are not descended into.

    Parameters
    ----------
    parentdir : str
        Directory whose subdirectories are inspected; may be absolute or
        relative.

    Returns
    -------
    int
        Total number of entries found in the immediate subdirectories.
    """
    numfiles = 0
    with os.scandir(parentdir) as subdirs:
        for entry in subdirs:
            # entry.path already includes parentdir.  Joining it onto
            # parentdir again duplicates the prefix for relative paths, so
            # every subdirectory looked missing and the count came out as 0.
            if entry.is_dir():
                with os.scandir(entry.path) as children:
                    numfiles += sum(1 for _ in children)
    return numfiles
|
||||||
|
#########################################
|
||||||
|
#########################################
|
||||||
|
# SET UP THE NETWORK
|
||||||
|
#########################################
|
||||||
|
def get_model(n_classes, final_activation,
              ndense=512, dropout=0.5,
              weights='imagenet',
              input_shape = [None, None, 3],
              gaussian_noise_sigma = None,
              input_tensor = None,
              base_trainable=False):
    """Build a classifier on top of a truncated InceptionV3.

    InceptionV3 (without its top) is cut after its fourth ``Concatenate``
    layer; global average pooling, dropout, an optional dense layer and the
    output layer are attached to that point.

    Parameters
    ----------
    n_classes : int
        Number of units in the output layer.
    final_activation : str
        Activation of the output layer (e.g. ``"softmax"`` or ``"sigmoid"``).
    ndense : int
        Units in the intermediate ReLU dense layer; ``0`` (or less) omits it.
    dropout : float
        Dropout rate applied after pooling.
    weights
        Forwarded to ``InceptionV3(weights=...)``.
    input_shape : list or None
        Shape for a newly created ``Input``; used only when ``input_tensor``
        is not supplied.
    gaussian_noise_sigma : float or None
        If given, a ``GaussianNoise`` layer with this sigma is applied to the
        input tensor.
    input_tensor
        Existing tensor to use as the network input.  It now takes precedence
        over ``input_shape``; previously it was silently replaced because the
        default ``input_shape`` is truthy.
    base_trainable : bool
        If false, the retained base layers are frozen, after which every
        layer from the second-to-last ``Concatenate`` onwards is made
        trainable again.

    Returns
    -------
    keras.models.Model

    Raises
    ------
    ValueError
        If noise is requested without any input tensor, or the base model
        has fewer than four ``Concatenate`` layers.
    """
    # Create a fresh Input only when the caller did not provide a tensor.
    if input_tensor is None and input_shape:
        input_tensor = Input(shape = input_shape)
    if gaussian_noise_sigma is not None:
        if input_tensor is None:
            raise ValueError(
                "gaussian_noise_sigma requires input_shape or input_tensor")
        input_tensor = GaussianNoise(gaussian_noise_sigma)(input_tensor)
    # create the base pre-trained model
    base_model = InceptionV3(weights=weights, include_top=False,
                             input_tensor = input_tensor,
                             )
    # Find the fourth Concatenate layer (zero-based index 3) and crop the
    # network right after it.
    x = None
    cc = 0
    for nn, la in enumerate(base_model.layers):
        if type(la) is keras.layers.Concatenate:
            if cc == 3:
                x = la.output
                break
            cc += 1
    if x is None:
        # Previously this surfaced later as a NameError on ``x``.
        raise ValueError(
            "base model has fewer than four Concatenate layers; cannot crop")
    base_model.layers = base_model.layers[:nn+1]

    #x = [la.output for la in base_model.layers if type(la) is keras.layers.Concatenate][3]
    x = GlobalAveragePooling2D()(x)
    x = Dropout(dropout)(x)

    # optional fully-connected layer
    if ndense > 0:
        x = Dense(ndense, activation='relu')(x)
    # output layer with one unit per class
    predictions = Dense(n_classes, activation=final_activation)(x)

    # this is the model we will train
    model = Model(inputs=base_model.input, outputs=predictions)

    if not base_trainable:
        # Freeze the retained InceptionV3 layers ...
        for layer in base_model.layers:
            layer.trainable = False

        # ... then re-enable training from the second-to-last Concatenate
        # layer of the assembled model onwards (this includes the new head).
        last_module_index = [nn for nn, la in enumerate(model.layers)
                             if type(la) is keras.layers.Concatenate][-2]

        for layer in model.layers[last_module_index:]:
            layer.trainable = True
    return model
|
||||||
|
|
||||||
|
|
||||||
|
def get_class_weights(datagen_val_output):
    """Compute per-class weights from the label counts of a generator.

    The raw label distribution is printed first.  Each count is then
    incremented by one, and a class's weight is the largest incremented
    count divided by that class's incremented count, so the most frequent
    class gets weight 1.0.

    Parameters
    ----------
    datagen_val_output
        Object exposing ``classes`` (an iterable of labels) and
        ``directory`` (used only in the printed message).

    Returns
    -------
    dict
        Mapping from class label to float weight.
    """
    label_counts = Counter(datagen_val_output.classes)
    print("distribution of labels in {}:\n{}".format(
        datagen_val_output.directory, str(label_counts)))

    # Add one to every count before taking ratios.
    smoothed = {label: count + 1 for label, count in label_counts.items()}
    largest = float(max(smoothed.values()))

    return {label: largest / count for label, count in smoothed.items()}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def w_categorical_crossentropy(weights):
    """Create a categorical cross-entropy loss scaled by a cost matrix.

    For every sample, the cross-entropy is multiplied by
    ``weights[c_t, c_p]``, where ``c_t`` is the class marked in ``y_true``
    and ``c_p`` is a class whose predicted score equals the row maximum.

    Parameters
    ----------
    weights
        Square cost matrix indexed as ``weights[true_class, predicted_class]``.
        It must support ``len()`` and two-index subscripting
        (presumably a 2-D numpy array -- TODO confirm against callers).

    Returns
    -------
    callable
        A ``(y_true, y_pred)`` loss function named
        ``'w_categorical_crossentropy'``.
    """
    def _w_categorical_crossentropy(y_true, y_pred, weights):
        nb_cl = len(weights)
        final_mask = K.zeros_like(y_pred[:, 0])
        # One-hot-like mask marking, per row, the entries equal to the
        # row-wise maximum prediction.
        y_pred_max = K.max(y_pred, axis=1)
        y_pred_max = K.expand_dims(y_pred_max, 1)
        y_pred_max_mat = K.equal(y_pred, y_pred_max)
        for c_p, c_t in product(range(nb_cl), range(nb_cl)):
            final_mask += (K.cast(weights[c_t, c_p], K.floatx()) *
                           K.cast(y_pred_max_mat[:, c_p], K.floatx()) *
                           K.cast(y_true[:, c_t], K.floatx())
                           )
        # Keras 2.0.7 swapped the positional order of this backend function
        # to (target, output).  Passing the arguments by keyword keeps the
        # call correct on both sides of that change; the old positional
        # (y_pred, y_true) call swaps the roles on the pinned Keras 2.0.8.
        return K.categorical_crossentropy(target=y_true,
                                          output=y_pred) * final_mask

    ncce = partial(_w_categorical_crossentropy, weights=weights)
    # Give the partial object a name, since partial objects have none.
    ncce.__name__ = 'w_categorical_crossentropy'
    return ncce
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Stand-alone training run for the binary "normal" vs "special" model:
    # configure, build and compile the network, then fit it from directory
    # generators with per-epoch checkpoints.
    import numpy as np
    import keras
    #csv_path = CHECKPOINTS_BASE + ".log.csv"
    #csv_callback = keras.callbacks.CSVLogger(csv_path, separator=',', append=False)
    # NOTE(review): keras is already imported at module level, so setting
    # KERAS_BACKEND here may be too late to affect backend selection -- confirm.
    os.environ['KERAS_BACKEND'] = 'tensorflow'
    os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
    os.environ["CUDA_VISIBLE_DEVICES"] = '2'

    NDENSE = 256  # 512
    BATCH_SIZE = 128
    NB_EPOCH = 20
    DATA_AUGMENTATION = True
    SEED = 0
    CLASS_MODE = 'binary'  # 'categorical'
    LOSS = '{}_crossentropy'.format(CLASS_MODE)
    N_CLASSES = 1
    FINAL_ACTIVATION = 'sigmoid'
    LR = 0.0001
    SAMPLEWISE_CENTER = False  # True

    TARGET_SIDE = 99
    TARGET_SIZE = [TARGET_SIDE]*2

    BASE_TRAINABLE = False
    # The directory name encodes -log10(LR), the dense width and image side.
    CHECKPOINT_DIR = "./modelstate_withx_negloglr{:d}_ndense{:d}_imsize{:d}{}/" .format(
        int(-np.log10(LR)),
        NDENSE,
        TARGET_SIDE,
        "" if not BASE_TRAINABLE else "_base_trainable"
        )
    CHECKPOINT_PATH = CHECKPOINT_DIR + 'model.{epoch:02d}-{val_loss:2f}.hdf5'

    WEIGHTFILE = None  # "./modelstate_withx_negloglr4_ndense256/model.39-0.060567.hdf5" # None # "./modelstate_withx/model.03-0.067136.hdf5"
    # "modelstate_laplace_inv_weights_2/model.10-0.014968.hdf5" #CHECKPOINT_DIR + "model.10-0.019602.hdf5"
    INIT_EPOCH = 0
    # indir = "/data/dlituiev/learn_spotmag_from_images/modelstate/"
    # find_min_loss_checkpoint(indir)

    DATA_TRAIN = '/data/UCSF_MAMMO/2017-07-png/withx_valset_4000_train/'
    DATA_VAL = '/data/UCSF_MAMMO/2017-07-png/withx_valset_4000_test/'
    SAMPLES_PER_EPOCH = get_num_files(DATA_TRAIN)
    STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // BATCH_SIZE

    CLASSES = ["normal", "special"]

    # Floor division: a trailing partial validation batch is not evaluated.
    VALIDATION_STEPS = get_num_files(DATA_VAL) // BATCH_SIZE
    print('='*50)
    print("validation steps", VALIDATION_STEPS)
    print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
    print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
    print('='*50)
    #########################################
    os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)
    # save_best_only=False: a checkpoint is written after every epoch.
    checkpoint = ModelCheckpoint(CHECKPOINT_PATH, monitor='val_loss', verbose=1,
                                 save_best_only=False, save_weights_only=False,
                                 mode='auto', period=1)
    callbacks_list = [checkpoint]

    #########################################
    model = get_model(n_classes=N_CLASSES,
                      final_activation=FINAL_ACTIVATION,
                      ndense=NDENSE,
                      dropout=0.5,
                      base_trainable=BASE_TRAINABLE)

    #from keras.utils import plot_model
    #plot_model(model, to_file='model.png')

    # Callbacks are passed to fit_generator below, not to compile().  The
    # former ``callbacks=[csv_callback]`` argument referenced a name whose
    # definition is commented out above, raising NameError before training.
    model.compile(optimizer=Adam(lr=LR), loss=LOSS, metrics=['accuracy'])
    #########################################
    if WEIGHTFILE:
        print("loading weights from:\t%s" % WEIGHTFILE)
        model.load_weights(WEIGHTFILE)

    print('Using real-time data augmentation.')

    # Arguments shared by both flow_from_directory calls.
    flowfromdir_params = dict(
        #color_mode = "grayscale",
        target_size=TARGET_SIZE,
        batch_size=BATCH_SIZE,
        class_mode=CLASS_MODE,
        classes=CLASSES,
        seed=SEED)

    # samplewise_std_normalization follows the SAMPLEWISE_CENTER switch.
    train_datagen = ImageDataGenerator(
        samplewise_center=SAMPLEWISE_CENTER,
        samplewise_std_normalization=SAMPLEWISE_CENTER,
        featurewise_center=False,
        featurewise_std_normalization=False,
        zca_whitening=False,
        rotation_range=10,
        width_shift_range=0.125,
        height_shift_range=0.125,
        horizontal_flip=True,
        vertical_flip=False)

    val_datagen = ImageDataGenerator()

    datagen_train_output = train_datagen.flow_from_directory(
        DATA_TRAIN, shuffle=True, **flowfromdir_params)

    datagen_val_output = val_datagen.flow_from_directory(
        DATA_VAL, shuffle=False, **flowfromdir_params)

    # NOTE(review): the weights come from the validation label distribution,
    # not the training one -- confirm that this is intended.
    class_weights = get_class_weights(datagen_val_output)

    model.fit_generator(datagen_train_output,
                        steps_per_epoch=STEPS_PER_EPOCH,
                        epochs=NB_EPOCH, verbose=1,
                        validation_data=datagen_val_output,
                        validation_steps=VALIDATION_STEPS,
                        #class_weight='auto',
                        class_weight=class_weights,
                        callbacks=callbacks_list,
                        initial_epoch=INIT_EPOCH)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#model.predict()
|
||||||
File diff suppressed because one or more lines are too long
@@ -0,0 +1,23 @@
|
|||||||
|
Cython==0.27.3
|
||||||
|
h5py==2.7.0
|
||||||
|
imgaug==0.2.5
|
||||||
|
Keras==2.0.8
|
||||||
|
-e git+https://github.com/raghakot/keras-vis@40b27dfa3ecb84cdde5ec6b44251923c3266cc40#egg=keras_vis
|
||||||
|
lime==0.1.1.29
|
||||||
|
matplotlib==2.0.2
|
||||||
|
mudicom==0.1.2
|
||||||
|
numpy==1.14.0
|
||||||
|
opencv-python==3.3.0.10
|
||||||
|
pandas==0.20.2
|
||||||
|
Pillow==4.1.1
|
||||||
|
pyaml==17.7.2
|
||||||
|
-e git+https://github.com/cocodataset/cocoapi/@727b546dd9fa4e4bb113213c98a3925829fac0bf#egg=pycocotools&subdirectory=PythonAPI
|
||||||
|
pydicom==0.9.9
|
||||||
|
PyYAML==3.12
|
||||||
|
scikit-image==0.13.0
|
||||||
|
scikit-learn==0.18.1
|
||||||
|
scipy==0.19.1
|
||||||
|
seaborn==0.7.1
|
||||||
|
sklearn==0.0
|
||||||
|
tensorflow-gpu==1.4.1
|
||||||
|
tensorflow-tensorboard==0.4.0rc3
|
||||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user