initial

2026-06-27 16:10:25 +08:00 · 2018-10-12 17:38:36 -07:00
commit 0fede818d7
26 changed files with 12311 additions and 0 deletions
@@ -0,0 +1,3 @@
+
+**/*.hdf5
+**/*.csv
@@ -0,0 +1,42 @@
+# Code for automatic labeling of special diagnostic mammography views from images and DICOM headers
+
+## DICOM
+### Extract selected fields from DICOM headers
+
+    dicom_header_extraction/extract_dicom_headers_w_generator_150K.py
+
+### Normalize / expand data
+
+    dicom_header_extraction/normalize_selected_dcm_headers.py
+
+### Machine learning on DICOM headers
+
+    caret_on_headers.R       # most methods 
+    caret_on_headers_nona.R  # GLMNET
+
+## Image pipeline
+
+### General image model
+- scripts and config files: `image_classifiers/e5ce2d69b035975cb5336cec0da9a32a`
+
+- weight files:
+
+### Wire localization model
+
+- scripts and config files: `image_classifiers/e8e71fc090141d7c6fb334359152d295`
+
+- weight files:
+
+
+## Visualization of performance metrics 
+Scripts used to generate Fig. 1
+
+    combine_predictions_hdr_and_img.ipynb
+    visualize_predictions_hdr_and_img.ipynb
+
+
+## Significance tests
+Scripts used to generate Supplementary Figures S1 & S2
+
+    calc_auroc_confidence_intervals.R
+    plot_auroc_difference_pvalue.ipynb
@@ -0,0 +1,169 @@
+rm(list=ls())
+library(pROC)
+library(ggplot2)
+library(ggsignif)
+library(dplyr)
+library(data.table)
+#' Read a gzip-compressed delimited table into a plain data.frame.
+#'
+#' The file is decompressed on the fly through a `zcat` shell command whose
+#' output is parsed by `fread`.
+#'
+#' @param filename Path to the gzip-compressed file.
+#' @param ... Further arguments forwarded to `fread` (e.g. `sep`, `select`).
+#' @return The parsed table as a `data.frame`.
+read.gz <- function(filename, ...){
+  # Quote the path so spaces / shell metacharacters cannot break the command;
+  # path.expand() keeps a leading "~" working once the path is quoted.
+  as.data.frame(fread(paste("zcat <", shQuote(path.expand(filename))),
+                      header=TRUE,  fill = TRUE, ...))
+}
+
+
+tag <- "e5ce2d69b035975cb5336cec0da9a32a"
+# Tag-specific predictions table (the untagged path used to be assigned first
+# and was immediately overwritten, so only this assignment is kept).
+fnall <- paste0("../tables/all_predictions_with_images-", tag,".tab")
+
+# header/fill are fread() options; they were previously handed to
+# as.data.frame(), where they had no effect on parsing.
+predictions <- as.data.frame(fread(fnall, sep='\t', header=TRUE, fill=TRUE))
+
+# Keep only rows with a non-empty label (vectorised; NA labels are dropped too).
+labelled <- !is.na(predictions$label) & nchar(predictions$label) > 0
+predictions <- predictions[labelled,]
+print(nrow(predictions))
+
+
+# ViewModifier becomes a 0/1 score: 1 when the field is non-empty.
+predictions[,'ViewModifier'] <- as.numeric(predictions[,'ViewModifier']!='')
+
+predictions[, "label"] <- factor(predictions[, "label"], c('normal', 'special'))
+
+predictions[,"view"] <- factor(predictions[,"view"], c('N','M','T','W','X'))
+head(predictions)
+
+# Rows of the 'val' split. This definition was commented out although the plot
+# below uses it, which stopped the script with "object 'holdout' not found".
+holdout <- predictions[predictions$set == 'val',]
+
+ggplot(holdout, aes(view, `score_max_wire_image+gbmt`)) + geom_point()
+
+# NOTE: despite its name, `validation` holds the rows of the 'test' split.
+validation <- predictions[predictions$set == 'test',]
+
+# Raw column names of the predictions table.
+clmns <- names(predictions)
+
+# Bookkeeping columns, followed by the score columns (after removal of the
+# "score_" prefix) that are evaluated below.
+othercols <- c("id", "set", "view", "label")
+modelnames <- c(
+  "ViewModifier", "rpart", "gbm", "glmnet", "xgb", "gbmt",
+  "image", "image_max",
+  "wire", "wire_max",
+  "max_image_wire_max",
+  "image+gbmt",
+  "max_wire_max_image+gbmt",
+  "max_image_wire",
+  "max_wire_image+gbmt"
+)
+
+
+
+# Remove every literal "score_" occurrence from the given name(s).
+# Accepts a character vector and returns a character vector of equal length.
+clean_score_names <- function(x){
+  gsub("score_", "", x, fixed = TRUE)
+}
+
+# Cleaned versions of all column names (not used further down).
+clmns_clean <- vapply(clmns, clean_score_names, character(1))
+
+# Encode the cleaned names as a factor whose level order is the desired column
+# order; names that are neither bookkeeping nor model columns become NA.
+cols_ <- factor(
+  vapply(names(predictions), clean_score_names, character(1)),
+  levels = c(othercols, modelnames)
+)
+
+# Rename the test-split columns and discard the ones that mapped to NA.
+colnames(validation) <- cols_
+validation <- validation[, !is.na(names(validation))]
+
+# Put the remaining columns into factor-level order.
+cols_ <- cols_[!is.na(cols_)]
+cols_ <- cols_[order(cols_)]
+validation <- validation[, as.character(cols_)]
+
+colnames(validation)
+# clmns <-clmns[vapply(clmns, function(x) strsplit(x, '_')[[1]][1]=='score', TRUE)]
+
+## Perform McNemar's test for prediction difference ----------------------------------------------------
+# Cross-tabulate the >0.5 calls of `max_wire_max_image+gbmt` against those of `max_image_wire_max`.
+mcnemar.test(table(validation$`max_wire_max_image+gbmt`>0.5, validation$max_image_wire_max>0.5))
+# Same cross-tabulation of `max_wire_max_image+gbmt` against `gbmt`.
+mcnemar.test(table(validation$`max_wire_max_image+gbmt`>0.5, validation$gbmt>0.5))
+
+## Calculate significance of pairwise auROC differences -----------------------------------------------
+# Per model: the plotted ROC object and the confidence interval of its AUC,
+# both keyed by the model name.
+cis <- list()
+rocobjects <- list()
+ii <- 0
+for (clmn in modelnames){
+  print('====================')
+  print(clmn)
+  # Score column `clmn` against the normal/special label, in percent units.
+  rocobj <- plot.roc(validation$label,
+                     validation[[clmn]],
+                     levels = levels(validation$label),
+                     xlim = c(100, 0),
+                     ylim = c(0, 100),
+                     percent = TRUE,
+                     print.auc = TRUE)
+  rocobjects[[clmn]] <- rocobj
+  cis[[clmn]] <- ci(rocobj, of = "auc", thresholds = "best")
+}
+
+## Wire model on wire cases
+# ROC of the wire scores for separating view 'W' from all other views.
+# Results are stored under a '<model> (vs other views)' key in BOTH lists:
+# previously the ROC object was written to rocobjects[[clmn]], overwriting the
+# label-based 'wire' / 'wire_max' curves while `cis` used the distinct key.
+for (clmn in c('wire', 'wire_max')){
+  print('====================')
+  print(clmn)
+  rocobj   <- plot.roc(  validation[, "view"]=='W',
+                         validation[,clmn],
+                         # levels = (levels(validation[, "label"])),
+                         xlim = c(100,0),
+                         ylim = c(0,100),
+                         percent=TRUE,
+                         print.auc=TRUE)
+  key <- paste0(clmn, ' (vs other views)')
+  rocobjects[[key]] <- rocobj
+  cis[[key]] <- ci(rocobj, of="auc", thresholds="best")
+}
+### Redefine modelnames: the set and order of rows kept in the CI table below.
+modelnames <- c('ViewModifier', 'rpart', 'gbm', 'glmnet','xgb', 'gbmt',
+                'image', "image_max",
+                'wire', 'wire_max',
+                'wire (vs other views)', 'wire_max (vs other views)',
+                'max_image_wire_max',
+                'image+gbmt',
+                'max_wire_max_image+gbmt')
+
+## Collect the per-model AUC confidence intervals into one data frame.
+# Bind the intervals column-wise, then transpose so each entry of `cis` is one row.
+dfcis <- as.data.frame(t(do.call(cbind.data.frame, lapply(cis, as.vector))))
+colnames(dfcis) <- c('lower', 'auROC', 'upper')
+# Entries of `cis` whose name is not in modelnames become NA here and are dropped just below.
+dfcis[,"model"] <- factor(rownames(dfcis),
+                           modelnames)
+
+dfcis <-  dfcis[!is.na(dfcis[,"model"]),]
+
+rownames(dfcis) <- dfcis[,"model"] 
+# Index by row name to put the rows into modelnames order.
+dfcis <- dfcis[modelnames,]
+
+# Variant of the table without the plain 'wire' / 'wire_max' rows.
+# dfcis <-dfcis %>% mutate(model = factor(model, levels=rev(levels(model))))
+dfcis_nowire <- dfcis[!(rownames(dfcis) %in% c('wire','wire_max')),]
+dfcis_nowire$model <-  factor(dfcis_nowire$model)  # re-factor to drop the unused levels
+# 
+# 
+# annotation_df <- data.frame(color=c("E", "H"), 
+#                             start=c("Good", "Fair"), 
+#                             end=c("Very Good", "Good"),
+#                             y=c(3.6, 4.7),
+#                             label=c("Comp. 1", "Comp. 2"))
+
+# Single pairwise example: ViewModifier vs. gbmt, with roc.test() defaults.
+roc.test(rocobjects[["ViewModifier"]], rocobjects[["gbmt"]])
+
+## Format Pairwise comparisons
+
+# Fill the lower triangle (diagonal included) of a table of DeLong p-values,
+# with rows and columns named after the entries of `rocobjects`.
+keys <- names(rocobjects)
+dfcompar <- data.frame()
+for (a in seq_along(rocobjects)){
+  for (b in seq_len(a)){
+    name_a <- keys[a]
+    name_b <- keys[b]
+    # A pair involving a curve whose AUC is exactly 100 (percent scale) gets NA
+    # instead of a test.
+    if ((as.numeric(rocobjects[[name_a]]$auc)==100)||(as.numeric(rocobjects[[name_b]]$auc)==100)){
+      dfcompar[name_a, name_b] <- NA
+    } else {
+      dfcompar[name_a, name_b] <- roc.test(rocobjects[[name_a]], rocobjects[[name_b]], method='delong')$p.value
+    }
+  }
+}
+
+
+fn.comparison <- paste0("../tables/auroc_delong_comparison-", tag,".csv")
+write.csv(dfcompar, file=fn.comparison)
@@ -0,0 +1,284 @@
+# coding: utf-8
+rm(list=ls())
+
+library(caret)
+library(gbm3)
+library(data.table)
+library(ggplot2)
+library(fastmatch)
+
+#' Read a gzip-compressed delimited table into a plain data.frame.
+#'
+#' The file is decompressed on the fly through a `zcat` shell command whose
+#' output is parsed by `fread`.
+#'
+#' @param filename Path to the gzip-compressed file.
+#' @param ... Further arguments forwarded to `fread` (e.g. `sep`, `select`).
+#' @return The parsed table as a `data.frame`.
+read.gz <- function(filename, ...){
+  # Quote the path so spaces / shell metacharacters cannot break the command;
+  # path.expand() keeps a leading "~" working once the path is quoted.
+  as.data.frame(fread(paste("zcat <", shQuote(path.expand(filename))),
+                            header=TRUE,  fill = TRUE, ...))
+}
+
+TABLEDIR <- "../tables/"
+
+# ids used to select rows of the header table.
+fn_ids <- paste(
+  TABLEDIR,
+  "2017-06-mammo_tables/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl.csv.gz",
+  sep = "/"
+)
+ids <- read.gz(fn_ids, select = "id")[["id"]]
+
+# Table of selected / expanded DICOM header fields.
+fn_features <- paste(
+  TABLEDIR,
+  "mammo_dicom_headers/df_all_mammos_dicom_headers_selected_expanded.tab.gz",
+  sep = "/"
+)
+dffeatures <- read.gz(fn_features, sep = "\t")
+print(nrow(dffeatures))
+print(length(ids))
+
+# Pick the rows whose filename matches a unique id, in id order; ids without
+# a match produce all-NA rows, which are removed right afterwards.
+matched_rows <- fmatch(unique(ids), dffeatures$filename)
+dffeatures <- dffeatures[matched_rows, ]
+dffeatures <- dffeatures[!is.na(dffeatures$filename), ]
+rm(ids, matched_rows)
+
+# Data formatting -----------------------------------------
+
+# Columns coerced to numeric.
+collist <- c(
+  "BodyPartThickness", "XRayTubeCurrentInuA", "ContentTime",
+  "DetectorTemperature", "WindowCenter", "FieldOfViewRotation"
+)
+for (cc in collist) {
+  dffeatures[, cc] <- as.numeric(dffeatures[, cc])
+}
+
+# Show which columns are still stored as character.
+dtypes <- sapply(dffeatures, class)
+names(dtypes[dtypes == "character"])
+
+# Use the filename as row name, then drop it together with the other
+# excluded columns.
+rownames(dffeatures) <- dffeatures$filename
+excludeCols <- c(
+  "filename",
+  "CollimatorLeftVerticalEdge",
+  "CollimatorLowerHorizontalEdge",
+  "DistanceSourceToEntrance",
+  "ExposuresOnDetectorSinceLastCalibration",
+  "ExposuresOnDetectorSinceManufactured",
+  "ShutterLowerHorizontalEdge",
+  "ShutterRightVerticalEdge",
+  "XRayTubeCurrentInuA"
+  # "ManufacturerModelName"
+)
+dffeatures <- dffeatures[, !(names(dffeatures) %in% excludeCols)]
+
+# Columns converted to factors.
+catcols <- c(
+  "ViewModifierCodeMeaning",
+  "ViewCodeValue",
+  "DetectorActiveDimensionsMissing",
+  "FieldOfViewOriginMissing",
+  "Grid",
+  "Manufacturer",
+  "ManufacturerModelName"
+)
+for (cc in catcols) {
+  dffeatures[, cc] <- as.factor(dffeatures[, cc])
+}
+#cell#
+
+# Number of missing values per column.
+colSums(sapply(dffeatures, is.na))
+
+# Read labels  --------------------------------
+
+fn.labelledset <- paste(
+  TABLEDIR,
+  "spotmag_predictions/train_test_split-2018-02-15-within7e5.csv",
+  sep = "/"
+)
+df.labelled <- as.data.frame(fread(fn.labelledset))
+rownames(df.labelled) <- df.labelled$id
+vec.labelled <- df.labelled$id
+df.labelled$label <- as.factor(df.labelled$label)
+
+#cell#
+
+# ids (row names) of each split.
+vec.labelled.valset <- rownames(df.labelled[df.labelled$set == "val", ])
+vec.labelled.tr_set <- rownames(df.labelled[df.labelled$set == "train", ])
+vec.labelled.ts_set <- rownames(df.labelled[df.labelled$set == "test", ])
+
+############################################################
+
+# Feature rows of the labelled ids (same order as df.labelled) plus the label.
+dffeatures.labelled <- dffeatures[vec.labelled, ]
+dffeatures.labelled$label <- df.labelled$label
+
+#cell#
+
+# devset = every labelled row that is not in the 'val' split.
+dffeatures.labelled.devset <-
+  dffeatures.labelled[!(rownames(dffeatures.labelled) %in% vec.labelled.valset), ]
+dffeatures.labelled.tr_set <- dffeatures.labelled[vec.labelled.tr_set, ]
+dffeatures.labelled.ts_set <- dffeatures.labelled[vec.labelled.ts_set, ]
+
+colnames(dffeatures.labelled.tr_set)
+
+
+# For each factor column, print the values that occur in the test split but
+# not in the training split.
+for (cc in colnames(dffeatures.labelled.tr_set)) {
+  if (!is.factor(dffeatures.labelled.tr_set[, cc])) {
+    next
+  }
+  setdiff_ <- setdiff(dffeatures.labelled.ts_set[, cc],
+                      dffeatures.labelled.tr_set[, cc])
+  if (length(setdiff_) > 0) {
+    print(cc)
+    print(setdiff_)
+  }
+}
+
+
+
+
+# GBM3 ----------------------------------------
+
+# Fit a gbm3 model on the training split with 10-fold cross-validation.
+# NOTE(review): no `distribution` is passed, so gbmt's default applies --
+# confirm that this is intended for the two-class label.
+par_detail <- gbmParallel(num_threads = 4) # Pass to par_details in gbmt
+gbmt_fit <- gbmt(label ~ .,
+                  data = dffeatures.labelled.tr_set,
+                  cv_folds = 10,
+                  # training_params = training_params(num_trees = 100, 
+                  #                                   interaction_depth = 1,
+                  #                                 min_num_obs_in_node = 10, 
+                  #                                 shrinkage = 0.005, 
+                  #                                 bag_fraction = 0.5,
+                  #                                 num_features = 2),
+                  keep_gbm_data = TRUE,
+                  # Full argument name; `par_detail=` only worked through
+                  # partial argument matching.
+                  par_details = par_detail)
+
+# Best number of trees according to cross-validation.
+best_iter_cv <- gbmt_performance(gbmt_fit, method='cv')
+plot(best_iter_cv)
+
+best.iter.oob <- gbmt_performance(gbmt_fit,method="OOB")  # returns out-of-bag estimated best number of trees
+plot(best.iter.oob)
+
+# File name records the CV-selected tree count and today's date.
+saveRDS(gbmt_fit, sprintf("gbm3_ntrees_%d_%s.rds", best_iter_cv, Sys.Date()))
+
+## Feature Importance Plotting ----------------
+
+# Relative influence of each variable at the CV-selected number of trees.
+infl_gbmt <- (as.data.frame(relative_influence(gbmt_fit, best_iter_cv, rescale=TRUE)))
+colnames(infl_gbmt) <- "relative influence"
+infl_gbmt[,"variable"] <- rownames(infl_gbmt)
+
+# Variables with zero influence cannot be drawn on the log scale; drop them.
+infl_gbmt = infl_gbmt[infl_gbmt$`relative influence` >0,]
+
+# Horizontal bars (segments from 2e-6 to the influence value), ordered by
+# influence, on a log10 axis.
+plimp <- ggplot(data=infl_gbmt) +
+  geom_segment(size=5, colour='blue') + 
+  aes(x=reorder(variable,`relative influence`),
+      xend = variable,
+      y = 2e-6,
+      yend=`relative influence`,
+      label=`relative influence`) +
+  scale_y_log10() + 
+  # coord_cartesian(ylim= c(0.8e-6, 1.05)) +
+  ylab("relative influence") + xlab("") +
+  coord_flip() +
+  theme(axis.text.y = element_text(colour="black",size=16,angle=0,face="plain"),
+        axis.text.x = element_text(colour="black",size=16,angle=0,face="plain"),
+        axis.title.x = element_text(colour="black",size=16,angle=0,face="plain"),
+        # panel.background = element_rect(fill = "transparent"), # bg of the panel
+        #plot.background = element_rect(fill = "transparent"), # bg of the plot
+        # panel.grid.major = element_blank(), # get rid of major grid
+         # , panel.grid.minor = element_blank() # get rid of minor grid
+          , legend.background = element_rect(fill = "transparent") # get rid of legend bg
+          , legend.box.background = element_rect(fill = "transparent") # get rid of legend panel bg
+        )
+
+plimp + coord_trans(limy= c(0.5e-6, 1.05)) + coord_flip()
+  
+# Save the plot explicitly. `plimp + ggsave(...)` depended on last_plot() and
+# is rejected by current ggplot2 releases.
+ggsave("img/xgbt_importances.eps", plot = plimp, device = 'eps', bg = "transparent",
+       width = 8, height = 6, dpi = 300, units = "in" )
+ggsave("img/xgbt_importances.png", plot = plimp, device = 'png', bg = "transparent",
+       width = 8, height = 6, dpi = 300, units = "in" )
+
+
+# Score every row of the header table with the gbmt model.
+dffeatures[,"predictions_gbmt"] = predict(gbmt_fit, newdata = dffeatures,
+                                          n.trees = best_iter_cv,
+                                          type = "response", na.action = na.pass)
+
+# GBM-CARET ---------------------------------------------------
+
+# 10-fold cross-validation with class probabilities and the two-class summary.
+control <- trainControl(
+  method = "cv",
+  number = 10,
+  p = .8,
+  savePredictions = TRUE,
+  classProbs = TRUE,
+  summaryFunction = twoClassSummary
+)
+
+# Hyper-parameter grid evaluated by train().
+tuneGrid <- expand.grid(
+  n.trees = c(80, 100, 120, 140, 160),
+  shrinkage = c(0.025, 0.05, 0.1, 0.2),
+  interaction.depth = c(1, 2),
+  n.minobsinnode = c(10, 15)
+)
+
+gbmFit1 <- train(
+  label ~ .,
+  data = dffeatures.labelled.tr_set,
+  method = "gbm",
+  metric = "ROC",
+  trControl = control,
+  tuneGrid = tuneGrid,
+  na.action = na.pass,
+  # importance = TRUE,
+  # `verbose` is an option of gbm() that passes through train().
+  verbose = FALSE
+)
+gbmFit1
+
+## Feature Importance Plotting ---------------------------------------------
+
+# Normalised relative influence from the caret GBM fit; keep non-zero entries.
+gbmsmmry <- summary(gbmFit1, normalize = TRUE, plotit = FALSE)
+gbmsmmry <- gbmsmmry[gbmsmmry$rel.inf > 0, ]
+
+
+# Horizontal bars (segments starting at 0.002) on a log10 axis.
+ggplot(
+  gbmsmmry,
+  aes(x = reorder(var, rel.inf, sum),
+      xend = var,
+      y = 0.002,
+      yend = (rel.inf),
+      label = rel.inf)
+) +
+  geom_segment(size = 3, colour = "red") +
+  scale_y_log10() +
+  xlab("") +
+  ylab("relative influence") +
+  coord_flip()
+
+saveRDS(gbmFit1, "gbm_ntrees80_interactiondepth2_shrinkage0.2_nminobsinnode15_trainset_2018-02-18.rds")
+
+# Probability of class 'special' for every row of the header table.
+dffeatures[, "predictions_gbm"] <- predict(
+  gbmFit1,
+  newdata = dffeatures,
+  type = "prob",
+  na.action = na.pass
+)[["special"]]
+
+# RPART -----------------------------------------------------------------
+
+# Complexity-parameter values evaluated by train().
+tuneGrid <- expand.grid(cp = c(0.0, 0.0125, 0.025, 0.05, 0.1, 0.2))
+
+rpartFit1 <- train(
+  label ~ .,
+  data = dffeatures.labelled.tr_set,
+  method = "rpart",
+  metric = "ROC",
+  trControl = control,
+  tuneGrid = tuneGrid,
+  na.action = na.pass
+)
+varImp(rpartFit1)
+
+
+# Class probabilities on the test split.
+predictions.ts_set <- predict(
+  rpartFit1,
+  newdata = dffeatures.labelled.ts_set,
+  type = "prob",
+  na.action = na.pass
+)
+
+# Probability of class 'special' for every row of the header table.
+dffeatures[, "predictions_rpart"] <- predict(
+  rpartFit1,
+  newdata = dffeatures,
+  type = "prob",
+  na.action = na.pass
+)[["special"]]
+
+# XGB ---------------------------------------------------------------------
+# Plain 10-fold CV; this replaces the `control` object used by the fits above.
+control <- trainControl(method = "cv", number = 10)
+#classProbs = TRUE
+
+#tuneGrid <- expand.grid(cp=c(0.0, 0.0125, 0.025, 0.05, 0.1, 0.2))
+xgbFit <- train(
+  label ~ .,
+  data = dffeatures.labelled.tr_set,
+  method = "xgbTree",
+  metric = "Accuracy",
+  trControl = control,
+  #tuneGrid=tuneGrid,
+  na.action = na.pass
+)
+
+varImp(xgbFit, scale = TRUE)
+
+as.data.frame(xgbFit$finalModel$params)
+
+xgbFit$bestTune
+
+# File name records today's date.
+saveRDS(xgbFit, sprintf("xgbtree_maxdepth1_subsample1_eta0.3_%s.rds", Sys.Date()))
+
+# Class probabilities on the test split.
+predictions.ts_set <- predict(
+  xgbFit,
+  newdata = dffeatures.labelled.ts_set,
+  type = "prob",
+  na.action = na.pass
+)
+
+
+## Save all predictions  ---------------------------------------------------------
+
+dffeatures[, "predictions_xgb"] <- predict(
+  xgbFit,
+  newdata = dffeatures,
+  type = "prob",
+  na.action = na.pass
+)[["special"]]
+
+# Write every column whose name contains 'prediction' plus the two view
+# fields, tab-separated and unquoted.
+write.table(
+  dffeatures[, c(grep("prediction", colnames(dffeatures), value = TRUE),
+                 "ViewModifierCodeMeaning", "ViewCodeValue")],
+  file = "all_predictions_allmodels_trained_on_train.tab",
+  quote = FALSE,
+  sep = "\t"
+)
+
@@ -0,0 +1,170 @@
+# coding: utf-8
+############################################################################
+# stratify by BT column: those are 100% sure digital, others can be either
+############################################################################
+rm(list=ls())
+setwd(dir = "~/repos/mammo/learn_spotmag_from_dicom_headers")
+#cell#
+library(caret)
+library(data.table)
+
+library(pROC)
+# install.packages(c("pROC"))
+library(ggplot2)
+library(fastmatch)
+
+#' Read a gzip-compressed delimited table into a plain data.frame.
+#'
+#' The file is decompressed on the fly through a `zcat` shell command whose
+#' output is parsed by `fread`.
+#'
+#' @param filename Path to the gzip-compressed file.
+#' @param ... Further arguments forwarded to `fread` (e.g. `sep`, `select`).
+#' @return The parsed table as a `data.frame`.
+read.gz <- function(filename, ...){
+  # Quote the path so spaces / shell metacharacters cannot break the command;
+  # path.expand() keeps a leading "~" working once the path is quoted.
+  as.data.frame(fread(paste("zcat <", shQuote(path.expand(filename))),
+                      header=TRUE,  fill = TRUE, ...))
+}
+
+
+# ids used to select rows of the header table.
+fn_ids <- "../tables/2017-06-mammo_tables/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl.csv.gz"
+ids <- read.gz(fn_ids, select = "id")[["id"]]
+
+# Table of selected DICOM header fields ("nona" variant).
+fn_features <- "../tables/mammo_dicom_headers/df_all_mammos_dicom_headers_selected_nona.tab.gz"
+dffeatures <- read.gz(fn_features, sep = "\t")
+
+# rownames(dffeatures) <- dffeatures$filename
+print(nrow(dffeatures))
+print(length(ids))
+
+# Pick the rows whose filename matches a unique id, in id order; ids without
+# a match produce all-NA rows, which are removed right afterwards.
+matched_rows <- fmatch(unique(ids), dffeatures$filename)
+dffeatures <- dffeatures[matched_rows, ]
+dffeatures <- dffeatures[!is.na(dffeatures$filename), ]
+
+rm(ids, matched_rows)
+
+# Columns coerced to numeric.
+collist <- c(
+  "BodyPartThickness", "XRayTubeCurrentInuA", "ContentTime",
+  "DetectorTemperature", "WindowCenter", "FieldOfViewRotation"
+)
+for (cc in collist) {
+  dffeatures[, cc] <- as.numeric(dffeatures[, cc])
+}
+
+
+
+# (head(as.numeric(dffeatures$BodyPartThickness)))
+dtypes <- sapply(dffeatures, class)
+
+# Use the filename as row name, then drop it together with the other
+# excluded columns.
+rownames(dffeatures) <- dffeatures$filename
+excludeCols <- c(
+  "filename",
+  "CollimatorLeftVerticalEdge",
+  "CollimatorLowerHorizontalEdge",
+  "DistanceSourceToEntrance",
+  "ExposuresOnDetectorSinceLastCalibration",
+  "ExposuresOnDetectorSinceManufactured",
+  "ShutterLowerHorizontalEdge",
+  "ShutterRightVerticalEdge",
+  "XRayTubeCurrentInuA"
+  # "ManufacturerModelName"
+)
+dffeatures <- dffeatures[, !(names(dffeatures) %in% excludeCols)]
+
+
+# Columns converted to factors; every value is prefixed with "=" first.
+catcols <- c(
+  "ViewModifierCodeMeaning",
+  "ViewCodeValue",
+  "DetectorActiveDimensionsMissing",
+  "FieldOfViewOriginMissing",
+  "Grid",
+  "Manufacturer",
+  "ManufacturerModelName"
+)
+
+for (cc in catcols) {
+  dffeatures[, cc] <- as.factor(paste0("=", dffeatures[, cc]))
+}
+
+dffeatures[, "HighBit"] <- as.numeric(dffeatures[, "HighBit"])
+
+# Number of missing values per column.
+colSums(sapply(dffeatures, is.na))
+
+# Read labels ---------------------------------
+
+fn.labelledset <- "../tables/spotmag_predictions/train_test_split-2018-02-15-within7e5.csv"
+df.labelled <- as.data.frame(fread(fn.labelledset))
+rownames(df.labelled) <- df.labelled$id
+vec.labelled <- df.labelled$id
+df.labelled$label <- as.factor(df.labelled$label)
+
+#cell#
+
+# ids (row names) of each split.
+vec.labelled.valset <- rownames(df.labelled[df.labelled$set == "val", ])
+vec.labelled.tr_set <- rownames(df.labelled[df.labelled$set == "train", ])
+vec.labelled.ts_set <- rownames(df.labelled[df.labelled$set == "test", ])
+############################################################
+# Feature rows of the labelled ids (same order as df.labelled) plus the label.
+dffeatures.labelled <- dffeatures[vec.labelled, ]
+dffeatures.labelled$label <- df.labelled$label
+
+# devset = every labelled row that is not in the 'val' split.
+dffeatures.labelled.devset <-
+  dffeatures.labelled[!(rownames(dffeatures.labelled) %in% vec.labelled.valset), ]
+dffeatures.labelled.tr_set <- dffeatures.labelled[vec.labelled.tr_set, ]
+dffeatures.labelled.ts_set <- dffeatures.labelled[vec.labelled.ts_set, ]
+
+table(dffeatures.labelled.tr_set$label)
+
+
+# Fraction of non-missing training rows per column; list columns below 10%.
+goodrows <- 1 - colSums(sapply(dffeatures.labelled.tr_set, is.na)) /
+  nrow(dffeatures.labelled.tr_set)
+
+names(goodrows[goodrows < 0.1])
+
+
+# For each factor column, print the values that occur in the test split but
+# not in the training split.
+for (cc in colnames(dffeatures.labelled.tr_set)) {
+  if (!is.factor(dffeatures.labelled.tr_set[, cc])) {
+    next
+  }
+  setdiff_ <- setdiff(dffeatures.labelled.ts_set[, cc],
+                      dffeatures.labelled.tr_set[, cc])
+  if (length(setdiff_) > 0) {
+    print(cc)
+    print(setdiff_)
+  }
+}
+
+
+# GLMNET ---------------------------------------------------------------------
+
+library(glmnet)
+# Using glmnet to directly perform CV
+set.seed(0)
+
+# Design matrix without intercept, built from every column except the label.
+x_train <- model.matrix(
+  ~ . - 1,
+  dffeatures.labelled.tr_set[, colnames(dffeatures.labelled.tr_set) != "label"]
+)
+dim(x_train)
+
+# 5-fold CV over a fixed lambda grid with alpha = 1, scored by AUC.
+cvob1 <- cv.glmnet(
+  x = x_train,
+  y = dffeatures.labelled.tr_set[, "label"],
+  family = "binomial",
+  alpha = 1,
+  type.measure = "auc",
+  nfolds = 5,
+  lambda = seq(0.001, 0.1, by = 0.001),
+  standardize = FALSE
+)
+plot(cvob1)
+
+# 5-fold CV with class probabilities and the two-class summary.
+control <- trainControl(method="cv", number=5, returnResamp="all",
+                        classProbs=TRUE, summaryFunction=twoClassSummary)
+#classProbs = TRUE
+
+# Grid of elastic-net mixing (alpha) and penalty (lambda) values.
+tuneGrid <- expand.grid(alpha=c(0.00, 0.25, 0.50, 0.75, 0.99, 1.00), lambda = 10^seq(-5,-2,0.5))
+tune = list()
+fits = list()
+rocs = list()
+# Repeat the tuning five times, keeping each fit, its best tuning parameters
+# and its best ROC value. `glmnetFit` ends up holding the last fit.
+for (ii in seq_len(5)){
+    glmnetFit <- train(label ~ ., data = dffeatures.labelled.tr_set, 
+                       method = "glmnet",
+                       na.action = na.pass,
+                       tuneGrid=tuneGrid,
+                       metric = "ROC",
+                       trControl = control)
+    fits[[ii]] <- glmnetFit
+    tune[[ii]] <- glmnetFit$bestTune
+    rocs[[ii]] <- max(glmnetFit$results$ROC)
+}
+
+tune
+
+varImp(glmnetFit, scale=TRUE)
+as.data.frame(glmnetFit$bestTune)
+
+# The former sprintf("glmnet.rds", Sys.Date()) had no conversion spec, so the
+# date was never used; the resulting file name is unchanged.
+saveRDS(glmnetFit, "glmnet.rds")
+
+## Save predictions  ---------------------------------------------------------
+
+# Probability of class 'special' for every row of the header table.
+dffeatures[,"predictions_glmnet"] = predict(glmnetFit, newdata = dffeatures, type = "prob", na.action = na.pass)$special
+
+write.table(dffeatures[,c("predictions_glmnet"), drop=FALSE],
+            file="all_predictions_glmnet.tab", quote=FALSE, sep='\t')
+
@@ -0,0 +1,763 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Applications/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
+      "  return f(*args, **kwds)\n",
+      "/Applications/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
+      "  return f(*args, **kwds)\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Read labels"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "tabledir = \"../tables/\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(772423, 1)"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fn = f\"{tabledir}/2017-06-mammo_tables/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl.csv.gz\"\n",
+    "df_bt = pd.read_csv(fn, usecols=[\"id\", \"BT_case\"])\n",
+    "df_bt.set_index(\"id\", inplace=True)\n",
+    "df_bt = ~df_bt.isnull()\n",
+    "df_bt.columns = [\"digital\"]\n",
+    "df_bt.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style>\n",
+       "    .dataframe thead tr:only-child th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>set</th>\n",
+       "      <th>label</th>\n",
+       "      <th>view</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>id</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1013372709_1.2.840.113654.2.70.1.175625299786291545159233542096043464711_3_1</th>\n",
+       "      <td>test</td>\n",
+       "      <td>normal</td>\n",
+       "      <td>N</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1028995243_1.2.840.113654.2.70.1.56947963181878834591544466761404805157_45576_2</th>\n",
+       "      <td>test</td>\n",
+       "      <td>normal</td>\n",
+       "      <td>N</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1105112884_1.2.840.113654.2.70.1.178729598744204462442695104630823323474_8905_2</th>\n",
+       "      <td>test</td>\n",
+       "      <td>normal</td>\n",
+       "      <td>N</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1185125156_1.2.840.113654.2.70.1.45840593750642722243371816041014016032_2_4</th>\n",
+       "      <td>test</td>\n",
+       "      <td>normal</td>\n",
+       "      <td>N</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1496452586_1.2.840.113654.2.70.1.5582568668770891599992528318631583880_1351_4</th>\n",
+       "      <td>test</td>\n",
+       "      <td>normal</td>\n",
+       "      <td>N</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                     set   label view\n",
+       "id                                                                   \n",
+       "1013372709_1.2.840.113654.2.70.1.17562529978629...  test  normal    N\n",
+       "1028995243_1.2.840.113654.2.70.1.56947963181878...  test  normal    N\n",
+       "1105112884_1.2.840.113654.2.70.1.17872959874420...  test  normal    N\n",
+       "1185125156_1.2.840.113654.2.70.1.45840593750642...  test  normal    N\n",
+       "1496452586_1.2.840.113654.2.70.1.55825686687708...  test  normal    N"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "infile = f\"{tabledir}/spotmag_predictions/train_test_split-2018-02-16-within7e5-label.csv\"\n",
+    "dflab = pd.read_csv(infile, index_col='id')\n",
+    "dflab[:5]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Read header-based predictions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(772367, 1)"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "infile = f\"{tabledir}/spotmag_predictions/all_predictions_glmnet.tab\"\n",
+    "dfpred_glmnet = pd.read_table(infile, index_col=0)\n",
+    "dfpred_glmnet.columns = [cc.replace(\"predictions\", \"score\") for cc in dfpred_glmnet.columns]\n",
+    "dfpred_glmnet.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(772367, 5)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style>\n",
+       "    .dataframe thead tr:only-child th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>score_gbm</th>\n",
+       "      <th>score_xgb</th>\n",
+       "      <th>score_rpart</th>\n",
+       "      <th>score_xgbt</th>\n",
+       "      <th>ViewModifierCodeMeaning</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>id</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149405_2104556</th>\n",
+       "      <td>0.009005</td>\n",
+       "      <td>0.020207</td>\n",
+       "      <td>0.006882</td>\n",
+       "      <td>0.059474</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149405_2104557</th>\n",
+       "      <td>0.013337</td>\n",
+       "      <td>0.016762</td>\n",
+       "      <td>0.006882</td>\n",
+       "      <td>0.059660</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149484_2141538</th>\n",
+       "      <td>0.013337</td>\n",
+       "      <td>0.016762</td>\n",
+       "      <td>0.006882</td>\n",
+       "      <td>0.061051</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149484_2141537</th>\n",
+       "      <td>0.013337</td>\n",
+       "      <td>0.016762</td>\n",
+       "      <td>0.006882</td>\n",
+       "      <td>0.061051</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3337971863_1.2.840.113654.2.70.1.337982194343327746313656933304494759333_1_1</th>\n",
+       "      <td>0.031560</td>\n",
+       "      <td>0.059142</td>\n",
+       "      <td>0.006882</td>\n",
+       "      <td>0.157488</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                    score_gbm  score_xgb  \\\n",
+       "id                                                                         \n",
+       "2454166001_1.2.840.113654.2.70.1.26994792635520...   0.009005   0.020207   \n",
+       "2454166001_1.2.840.113654.2.70.1.26994792635520...   0.013337   0.016762   \n",
+       "2454166001_1.2.840.113654.2.70.1.26994792635520...   0.013337   0.016762   \n",
+       "2454166001_1.2.840.113654.2.70.1.26994792635520...   0.013337   0.016762   \n",
+       "3337971863_1.2.840.113654.2.70.1.33798219434332...   0.031560   0.059142   \n",
+       "\n",
+       "                                                    score_rpart  score_xgbt  \\\n",
+       "id                                                                            \n",
+       "2454166001_1.2.840.113654.2.70.1.26994792635520...     0.006882    0.059474   \n",
+       "2454166001_1.2.840.113654.2.70.1.26994792635520...     0.006882    0.059660   \n",
+       "2454166001_1.2.840.113654.2.70.1.26994792635520...     0.006882    0.061051   \n",
+       "2454166001_1.2.840.113654.2.70.1.26994792635520...     0.006882    0.061051   \n",
+       "3337971863_1.2.840.113654.2.70.1.33798219434332...     0.006882    0.157488   \n",
+       "\n",
+       "                                                   ViewModifierCodeMeaning  \n",
+       "id                                                                          \n",
+       "2454166001_1.2.840.113654.2.70.1.26994792635520...                     NaN  \n",
+       "2454166001_1.2.840.113654.2.70.1.26994792635520...                     NaN  \n",
+       "2454166001_1.2.840.113654.2.70.1.26994792635520...                     NaN  \n",
+       "2454166001_1.2.840.113654.2.70.1.26994792635520...                     NaN  \n",
+       "3337971863_1.2.840.113654.2.70.1.33798219434332...                     NaN  "
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "infile = f\"{tabledir}/spotmag_predictions/all_predictions_allmodels_trained_on_train.tab\"\n",
+    "dfpred = pd.read_table(infile, index_col=0)\n",
+    "dfpred.columns = [cc.replace(\"predictions\", \"score\") for cc in dfpred.columns]\n",
+    "dfpred.index.name = 'id'\n",
+    "print(dfpred.shape)\n",
+    "dfpred[:5]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(772367, 8)\n"
+     ]
+    }
+   ],
+   "source": [
+    "if 'set' not in dfpred.columns:\n",
+    "    dfpred = dfpred.merge(dflab,  left_index=True, right_index=True, how='left')\n",
+    "    print(dfpred.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "normal          3526\n",
+       "magn/spot        572\n",
+       "wire loc          57\n",
+       "stereotactic      25\n",
+       "other              9\n",
+       "Name: view, dtype: int64"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "colmap = {\"N\":\"normal\", \"M\": \"magn/spot\",\n",
+    "          \"T\":\"stereotactic\", \"W\":\"wire loc\", \"X\":\"other\"}\n",
+    "view_counts = dfpred[~dfpred.view.isnull()].view.map(lambda x: colmap[x]).value_counts()\n",
+    "view_counts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style>\n",
+       "    .dataframe thead tr:only-child th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th>set</th>\n",
+       "      <th>train</th>\n",
+       "      <th>test</th>\n",
+       "      <th>val</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>view</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>magn/spot</th>\n",
+       "      <td>380</td>\n",
+       "      <td>96</td>\n",
+       "      <td>96</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>normal</th>\n",
+       "      <td>2310</td>\n",
+       "      <td>612</td>\n",
+       "      <td>604</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>other</th>\n",
+       "      <td>4</td>\n",
+       "      <td>3</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>stereotactic</th>\n",
+       "      <td>17</td>\n",
+       "      <td>4</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>wire loc</th>\n",
+       "      <td>37</td>\n",
+       "      <td>11</td>\n",
+       "      <td>9</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "set           train  test  val\n",
+       "view                          \n",
+       "magn/spot       380    96   96\n",
+       "normal         2310   612  604\n",
+       "other             4     3    2\n",
+       "stereotactic     17     4    4\n",
+       "wire loc         37    11    9"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pd.crosstab(dfpred[~dfpred.view.isnull()].view.map(lambda x: colmap[x]), dfpred.set)[[\"train\", \"test\", \"val\"]]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Read image-based predictions (general)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "../tables//spotmag_predictions/predictions_images_4189-epoch55-e5ce2d69b035975cb5336cec0da9a32a.csv\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Index(['score_image', 'score_image_max'], dtype='object')"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tag = \"e5ce2d69b035975cb5336cec0da9a32a\"\n",
+    "epoch = 55\n",
+    "infile = f\"{tabledir}/spotmag_predictions/predictions_images_4189-epoch{epoch}-{tag}.csv\"\n",
+    "# infile = f\"{tabledir}/spotmag_predictions/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl-spotmag_img_prediction-{tag}.csv\"\n",
+    "print(infile)\n",
+    "dfpred_img = pd.read_csv(infile, index_col=0)\n",
+    "dfpred_img = dfpred_img[['score_image', 'score_image_max']]\n",
+    "dfpred_img = dfpred_img.groupby(level=0).mean()\n",
+    "dfpred_img.columns"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Read image-based predictions (wire localization)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "infile = f\"{tabledir}/spotmag_predictions/predictions_wire_combined_e8e71fc090141d7c6fb334359152d295.csv\"\n",
+    "\n",
+    "dfpred_imgwire = pd.read_csv(infile, index_col=0)\n",
+    "dfpred_imgwire[\"score_wire_max\"] = 1-dfpred_imgwire[[\"scores_0_or\",\"scores_0_fl\"]].min(1)\n",
+    "dfpred_imgwire = dfpred_imgwire.drop([\"scores_0_or\",\"scores_0_fl\", \"label\"], axis=1)\n",
+    "dfpred_imgwire.columns = [cc.replace(\"scores\", \"score_wire\") for cc in dfpred_imgwire.columns]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(772367, 13)"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "if 'score_image' not in dfpred.columns:\n",
+    "    dfpred = pd.concat([dfpred, dfpred_img], axis=1)\n",
+    "    dfpred.index.name = 'id'\n",
+    "    del dfpred_img\n",
+    "    \n",
+    "if 'score_glmnet' not in dfpred.columns:\n",
+    "    dfpred = pd.concat([dfpred, dfpred_glmnet], axis=1)\n",
+    "    dfpred.index.name = 'id'\n",
+    "    del dfpred_glmnet\n",
+    "    \n",
+    "if 'score_wire' not in dfpred.columns:\n",
+    "    dfpred = pd.concat([dfpred, dfpred_imgwire], axis=1)\n",
+    "    dfpred.index.name = 'id'\n",
+    "    del dfpred_imgwire\n",
+    "\n",
+    "dfpred.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "if 'label' not in dfpred.columns:\n",
+    "    dfpred = pd.concat([dfpred, dflab], axis=1)\n",
+    "if 'digital' not in dfpred.columns:\n",
+    "    dfpred = pd.concat([dfpred, df_bt], axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style>\n",
+       "    .dataframe thead tr:only-child th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th>score_image</th>\n",
+       "      <th>False</th>\n",
+       "      <th>True</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>score_wire</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>False</th>\n",
+       "      <td>3584</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>True</th>\n",
+       "      <td>605</td>\n",
+       "      <td>768234</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "score_image  False   True \n",
+       "score_wire                \n",
+       "False         3584       0\n",
+       "True           605  768234"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pd.crosstab(dfpred[\"score_wire\"].isnull(), dfpred[\"score_image\"].isnull())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "dfpred.rename(columns={\"score_xgbt\":\"score_gbmt\"}, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Add ensembled (max, avg) scores"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "dfpred['score_wire'] = dfpred['score_wire'].fillna(0)\n",
+    "dfpred['score_wire_max'] = dfpred['score_wire_max'].fillna(0)\n",
+    "dfpred['score_image+glmnet'] = (dfpred['score_image'] + dfpred['score_glmnet'])/2\n",
+    "dfpred['score_image+gbmt'] = (dfpred['score_image'] + dfpred['score_gbmt'])/2\n",
+    "\n",
+    "dfpred['score_max(image;gbmt)'] = dfpred[['score_image','score_gbmt']].max(1)\n",
+    "\n",
+    "dfpred['score_image*glmnet'] = np.sqrt(dfpred['score_image'] * dfpred['score_glmnet'])\n",
+    "dfpred['score_image*gbmt'] = np.sqrt(dfpred['score_image'] * dfpred['score_gbmt'])\n",
+    "dfpred['score_max_image_wire'] = np.nanmax(dfpred[['score_image','score_wire']].values, axis=1)\n",
+    "dfpred['score_max_image_wire_max'] = np.nanmax(dfpred[['score_image','score_wire_max']].values, axis=1)\n",
+    "# dfpred['score_wire'].isnull()\n",
+    "dfpred['score_max_image_wire+gbmt'] =(dfpred['score_max_image_wire'] + dfpred['score_gbmt'])/2\n",
+    "\n",
+    "dfpred['score_max_image_wire_max+gbmt'] =(dfpred['score_max_image_wire_max'] + dfpred['score_gbmt'])/2\n",
+    "\n",
+    "dfpred['score_max(image;wire_max;gbmt)'] = dfpred[['score_wire_max','score_gbmt', 'score_image']].max(1)\n",
+    "\n",
+    "dfpred['score_max_wire_image+gbmt'] = np.nanmax(dfpred[['score_image+gbmt','score_wire']].values, axis=1)\n",
+    "\n",
+    "dfpred['score_max_wire_max_image+gbmt'] = np.nanmax(dfpred[['score_image+gbmt','score_wire_max']].values, axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "dfpred.rename(columns={\"ViewModifierCodeMeaning\":\"ViewModifier\"}, inplace=True)\n",
+    "dfpred.index.name = 'id'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Save the combined table"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "772423"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(dfpred)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "dfpred.to_csv(f'{tabledir}/all_predictions_with_images-{tag}.tab', sep='\\t')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [default]",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -0,0 +1,98 @@
+# coding: utf-8
+import numpy as np
+import pandas as pd
+import dicom
+from warnings import warn
+
+def get_tuples(plan, outlist = None, key = ""):
+    """Flatten a DICOM header into a list of ``(field_name, value)`` pairs.
+
+    ``plan`` must expose ``dir()`` and attribute access (presumably a
+    ``dicom`` dataset -- confirm against callers).  ``PixelData`` is skipped.
+    Sequence-valued attributes are expanded recursively; each item is named
+    ``<index>_<AttributeName>`` and that name becomes the prefix of the
+    item's own fields.  Because a non-empty ``key`` already gets a trailing
+    underscore, items of a sequence nested inside another sequence end up
+    with a double underscore in their name.
+
+    ``outlist`` is used as the accumulator only when it is non-empty;
+    otherwise a fresh list is created.  The accumulated list is returned.
+    """
+    prefix = key + "_" if len(key) > 0 else key
+    pairs = outlist if outlist else []
+    for attr_name in plan.dir():
+        if not (hasattr(plan, attr_name) and attr_name != 'PixelData'):
+            continue
+        value = getattr(plan, attr_name)
+        if type(value) is dicom.sequence.Sequence:
+            # One recursive call per sequence item, keyed by its position.
+            for position, item in enumerate(list(value)):
+                name_parts = [("%d" % position), attr_name]
+                if len(prefix):
+                    name_parts.insert(0, prefix)
+                pairs.extend(get_tuples(item, outlist = None, key = "_".join(name_parts)))
+            continue
+        # Convert DICOM value wrappers into plain Python values.
+        if type(value) is dicom.valuerep.DSfloat:
+            value = float(value)
+        elif type(value) is dicom.valuerep.IS:
+            value = str(value)
+        elif type(value) is dicom.valuerep.MultiValue:
+            value = tuple(value)
+        elif type(value) is dicom.UID.UID:
+            value = str(value)
+        pairs.append((prefix + attr_name, value))
+    return pairs
+
+
+def filter_row_common_field(row, common_fields):
+    """Drop, in place, every key of ``row`` that is not in ``common_fields``.
+
+    Returns the same ``row`` object after pruning.
+    """
+    unwanted = [name for name in row.keys() if name not in common_fields]
+    for name in unwanted:
+        row.pop(name)
+    return row
+
+
+
+"""
+fn_allheaders = '/home/dlituiev/data_dlituiev/manuallabeller/filelist/dicom_headers_all_fields_filelist_nonscreening_4000_seed42.csv'
+
+df_allheaders = pd.read_csv(fn_allheaders, index_col=0)
+
+
+"at least 5% of rows are there"
+thr = 0.05
+valid_fields = (~df_allheaders.isnull()).mean() > thr
+valid_fields = valid_fields[valid_fields].index.tolist()
+print(len(valid_fields))
+"""
+
+# Header fields to export: read from a headerless table and squeezed to a
+# single Series (presumably one field name per row -- verify against the file).
+valid_fields = pd.read_table("/data/dlituiev/learn_spotmag_from_dicom_headers/LogisticRegression_common_fields_names.tab", 
+                             header=None,
+                            squeeze=True).values
+
+
+#filelist_fn = '/home/dlituiev/data_dlituiev/tables/df_newest_mammos.pickle'
+filelist_fn = "/home/dlituiev/data_dlituiev/tables/2017-06-mammo_tables/df_original_mammos.pickle"
+# Unique values of the "Filename" column of the pickled table; each one is
+# later passed to dicom.read_file.
+filelist = pd.read_pickle(filelist_fn, )["Filename"].unique().tolist()
+len(filelist)
+
+# Rows are buffered in memory and written out in batches of BUFFER_N_LINES files.
+BUFFER_N_LINES = 100
+SEP = '\t'
+# The output table is written next to the input pickle.
+outpath = filelist_fn.replace('.pickle','') + '_dicom_headers_selected.tab'
+final_columns = ['filename'] + list(valid_fields)
+print("len(final_columns)", len(final_columns) )
+print('saving to %s' % outpath)
+with open(outpath, 'w+') as outfh:
+    # Column header is written once; every batch below is appended without a header.
+    outfh.write(SEP.join(final_columns) + '\n')
+    headerlist = []
+    for nn, ff in enumerate(filelist):
+        # Flush the buffer just before processing every BUFFER_N_LINES-th file
+        # (nn = 99, 199, ...), then start a new empty buffer.
+        if nn% BUFFER_N_LINES == (BUFFER_N_LINES-1):
+            df_hl = pd.DataFrame( headerlist, columns=final_columns)
+            df_hl.to_csv(outfh, sep=SEP, header=None, index=None, mode = 'a')
+            outfh.flush()
+            del df_hl
+            print(nn+1)
+            headerlist = []
+        try:
+            plan = dicom.read_file(ff)
+            row = get_tuples(plan)
+            row = dict(row)
+            # File name first, then one value per requested field (NaN when absent).
+            row = tuple([ff] + [(row[kk] if (kk in row) else np.nan) for kk in valid_fields ])
+            print("len(row)", len(row))
+            headerlist.append(row)
+        except Exception as ex:
+#             raise ex
+            # Best effort: a file that fails is reported and produces no row.
+            warn('header extraction failed on #\t%s\t%s\t%s' % (nn, ff, ex))
+    # in the end, print the rest:
+    df_hl = pd.DataFrame( headerlist, columns=final_columns)
+    df_hl.to_csv(outfh, sep=SEP, header=None, index=None, mode = 'a')
+    outfh.flush()
+
+print("DONE")
@@ -0,0 +1,798 @@
+
+# coding: utf-8
+
+import numpy as np
+import pandas as pd
+import os
+from functools import partial
+from itertools import chain
+
+def entropy(x):
+    """Return ``sum(n * log2(n))`` over the value counts ``n`` of ``x``.
+
+    Missing values are not counted (``value_counts`` default).  Note that
+    this works on raw counts, not probabilities, and carries no minus sign,
+    so it is not the normalised Shannon entropy: it grows with the number of
+    rows.  Thresholds applied to the result must be read on that scale.
+    """
+    counts = x.value_counts()
+    log_counts = counts.map(np.log2)
+    weighted = counts * log_counts
+    return weighted.sum()
+
+
+def select_text_fields(df_allheaders):
+    """Return the names of the informative text (object-dtype) columns.
+
+    A column is kept when all of the following hold:
+
+    * its dtype is ``object``;
+    * more than 5% of its rows are non-null;
+    * its count-weighted score from ``entropy`` is at least 1000;
+    * it has more than one distinct value;
+    * it is not nearly unique per row (distinct values <= 75% of the rows).
+
+    Parameters
+    ----------
+    df_allheaders : pandas.DataFrame
+        Table of header fields, one row per file.
+
+    Returns
+    -------
+    list
+        Column names, in the column order of ``df_allheaders``.
+    """
+    # `pd.np` is gone in recent pandas; use numpy directly.
+    is_text = df_allheaders.dtypes.map(lambda x: x == np.dtype(object))
+    text_fields = is_text[is_text].index.tolist()
+
+    well_filled = (~df_allheaders[text_fields].isnull()).mean() > 0.05
+    text_fields = well_filled[well_filled].index.tolist()
+
+    n_rows = df_allheaders.shape[0]
+    selected = []
+    for tt in text_fields:
+        numunique = len(df_allheaders[tt].unique())
+        entr = entropy(df_allheaders[tt])
+        # The conditions are combined with `or`.  The previous
+        # `entr<1000 | (...) | (...)` parsed as `entr < (1000 | a | b)`
+        # because `|` binds tighter than `<`, so the uniqueness tests were
+        # never applied.  The "too many distinct values" test is taken
+        # relative to the number of rows; the column count used before
+        # has no relation to the number of distinct values in a column.
+        if (entr < 1000) or (numunique == 1) or (numunique > 0.75 * n_rows):
+            continue
+        selected.append(tt)
+    return selected
+
+
+def get_good_numeric_fields(df_allheaders, thr_stderr = 1e-6):
+    """Return the numeric columns whose relative spread exceeds ``thr_stderr``.
+
+    The spread is ``std / mean`` per column (coefficient of variation).
+    Columns where the ratio is NaN (e.g. zero mean and zero std) or not
+    greater than the threshold are left out; a column with a negative mean
+    has a negative ratio and is therefore left out as well.
+
+    Parameters
+    ----------
+    df_allheaders : pandas.DataFrame
+        Table that may mix numeric and non-numeric columns.
+    thr_stderr : float
+        Minimum ``std / mean`` for a column to be kept.
+
+    Returns
+    -------
+    list
+        Names of the retained columns.
+    """
+    # numeric_only=True keeps text columns out of the reduction explicitly;
+    # without it recent pandas raises TypeError on object columns.
+    spread = df_allheaders.std(numeric_only=True) / df_allheaders.mean(numeric_only=True)
+    return spread[spread > thr_stderr].index.tolist()
+
+
+def get_index_from_int_tuple(x, ind):
+    """Return element ``ind`` of a tuple stored as text, as an ``int``.
+
+    ``x`` is expected to be the string form of a tuple or list, e.g.
+    ``"(3, 4)"`` or ``"('3.0', '4')"``.  The selected element is converted
+    through ``float`` and then ``int``.  Anything that is not a ``str``
+    (for example a NaN placeholder) is returned unchanged.
+
+    Raises ``ValueError``/``SyntaxError`` when the text is not a Python
+    literal.
+    """
+    if type(x) is not str:
+        return x
+    import ast
+    # The text comes from file headers, so it is parsed as a literal only;
+    # `eval` would execute arbitrary expressions found in the data.
+    parsed = ast.literal_eval(x)
+    return int(float(parsed[ind]))
+
+
+def clean_up_field_list(field_list, 
+     prefices_remove = ["date", "accession", "number", 
+         "Filename",
+         "ImageLaterality",
+         "GantryID",
+         #"0_ViewCodeSequence_CodeMeaning",
+         "ViewCodeSequence_CodeMeaning",
+         "ViewModifierCodeSequence_CodeValue",
+         "EthnicGroup",
+         "BodyPartExamined",
+         "LossyImageCompression",
+         "DeidentificationMethodCodeSequence",
+         "UID",
+         'EntranceDoseInmGy',
+         'ProcedureCodeSequence_CodeMeaning',
+         'CommentsOnRadiationDose',
+         'DetectorID',
+         'SeriesDescription', # potentially informative but too many values
+         'SoftwareVersions',
+         'PatientAge',
+         ],
+     fields_remove = [ 'PatientID', 'PatientName', "BitsStored",
+         'AcquisitionTime', 
+         'AdmittingTime', 
+         'ScheduledStudyStartTime',
+         'InstanceCreationTime',
+         'PerformedProcedureStepStartTime',
+         'PregnancyStatus',
+         'StudyArrivalTime',
+         'StudyCompletionTime',
+         'StudyTime',
+         'TimeOfLastCalibration',
+         'TimeOfLastDetectorCalibration',
+         'TimeOfSecondaryCapture',]):
+    """Remove unwanted field names from ``field_list`` and return it.
+
+    A field is removed when it is listed in ``fields_remove`` or when its
+    lower-cased name contains any lower-cased entry of ``prefices_remove``
+    (a substring match anywhere in the name, not only at the start).
+
+    ``field_list`` is modified in place (only the first occurrence of each
+    name is removed) and the same list is returned.  Names in
+    ``fields_remove`` that are absent from ``field_list`` are reported with
+    ``print`` and otherwise ignored.
+    """
+    prefices_remove = [x.lower() for x in prefices_remove]
+    # Work on a private copy: appending to the argument itself would grow
+    # the shared default list on every call and leak names between calls.
+    fields_remove = list(fields_remove)
+
+    for ff in field_list:
+        ff_lower = ff.lower()
+        if ff not in fields_remove and any(pp in ff_lower for pp in prefices_remove):
+            fields_remove.append(ff)
+
+    for ff in fields_remove:
+        try:
+            field_list.remove(ff)
+        except ValueError as ve:
+            print(ff, ve)
+    return field_list
+
+
+def make_lowercase_text_fields(df_allheaders):
+    """Lower-case every object-dtype column except the first one.
+
+    The first column is skipped by position.  Columns are replaced in the
+    frame that was passed in, and that same frame is returned.
+    """
+    object_dtype = np.dtype(object)
+    for cname in df_allheaders.columns[1:]:
+        if df_allheaders[cname].dtype is object_dtype:
+            df_allheaders[cname] = df_allheaders[cname].str.lower()
+    return df_allheaders
+
+
+def format_PixelSpacing(x):
+    """Reduce a pixel-spacing value to a single number.
+
+    A ``float`` is returned unchanged.  Any other input is treated as the
+    text form of a tuple such as ``"('0.07', '0.07')"``: parentheses, quotes
+    and spaces are stripped, the comma-separated parts are converted to
+    floats, and the smallest distinct value is returned.
+    """
+    if type(x) is float:
+        return x
+    cleaned = x.lstrip("(").rstrip(")").replace("'", "").replace(" ","")
+    values = tuple([float(token) for token in cleaned.split(",")])
+    # np.unique sorts ascending, so element 0 is the minimum.
+    return np.unique(values)[0]
+
+def parse_float(x):
+    """Clean the text form of a number so that it can be cast to float.
+
+    Single quotes and the letter ``b`` are removed (as left over from a
+    bytes repr like ``b'1.5'``) and ``"None"`` becomes ``"nan"``.  The
+    cleaned string is returned; an empty result is returned as ``np.nan``.
+    """
+    text = str(x)
+    text = text.replace("'","")
+    text = text.replace("b","")
+    text = text.replace("None","nan")
+    return np.nan if text == "" else text
+
+def parse_float_tuples(x, to_int=False):
+    """Extract every number found in the text form of ``x`` as a tuple.
+
+    Every character that is neither a digit nor ``'.'`` acts as a
+    separator, so signs and exponents are not interpreted (``"-5"`` gives
+    ``(5.0,)``).  With ``to_int=True`` each number is truncated to ``int``.
+    Input without any digits yields an empty tuple.
+    """
+    masked = "".join(
+        [ch if (ch.isdigit() or ch == '.') else ';' for ch in str(x)])
+    tokens = [tok for tok in masked.split(';') if len(tok)]
+    if to_int:
+        numbers = tuple([int(float(tok)) for tok in tokens])
+    else:
+        numbers = tuple([float(tok) for tok in tokens])
+    if type(numbers) is not tuple:
+        raise TypeError("returned non-list: {}".format(str(numbers)))
+    return numbers
+
+def parse_float_tuples_prod(x):
+    """Return the product of all numbers found in ``x``, or NaN if none.
+
+    ``x`` may be the text form of a tuple (e.g. ``"(240, 300)"``), an actual
+    tuple/list, or a plain number; it is converted with ``str`` and parsed
+    by ``parse_float_tuples``.  ``None``, any NaN float, an empty string and
+    text without digits all give ``np.nan``.
+    """
+    if x is None:
+        return np.nan
+    # `x != x` is true for every NaN object.  A membership test against
+    # np.nan only matches that one singleton, so other NaN floats used to
+    # fall through to len() and raise TypeError.
+    if isinstance(x, float) and x != x:
+        return np.nan
+    text = str(x)
+    if len(text) == 0:
+        return np.nan
+    values = parse_float_tuples(text)
+    if len(values) == 0:
+        # No numbers at all: report "missing" rather than the empty
+        # product 1.0.
+        return np.nan
+    return np.prod(values)
+
+def parse_int_tuples_median(x):
+    """Return the median of the numbers that ``parse_float_tuples`` finds in ``x``."""
+    return np.median(parse_float_tuples(x))
+"""
+def parse_float_tuples(x):
+    x = eval(x) if type(x) is str else x
+    if type(x) in [tuple, list]:
+        x = tuple([float(y) for y in x])
+    return x
+"""
+
+def parse_str_tuples(x):
+    """Turn the text form of a tuple into a Python object.
+
+    A string that is a valid Python literal (e.g. ``"('DERIVED', 'PRIMARY')"``)
+    is parsed into that literal.  Any other string is split on single spaces
+    and returned as a tuple of words.  Non-string input is returned unchanged.
+    """
+    if type(x) is not str:
+        return x
+    import ast
+    try:
+        # Literal parsing only: the text comes from file headers, and `eval`
+        # would both execute arbitrary expressions and resolve bare words
+        # such as "id" to Python builtins.
+        return ast.literal_eval(x)
+    except Exception:
+        # Not a literal: fall back to whitespace-separated words.  Catching
+        # Exception (not a bare except) lets KeyboardInterrupt through.
+        return tuple(x.split(" "))
+#############################33
+def extract_list_text_field(df_allheaders, colprefix = "ViewModifierCodeSequence_CodeMeaning"):
+    """Replace the columns matching ``colprefix`` by one boolean column per value.
+
+    Source columns are those whose name contains ``colprefix`` but is not
+    exactly equal to it.  For every distinct non-null value found in them a
+    boolean column named ``colprefix + "_" + value`` (spaces removed from the
+    value) is created; a row is True when the value occurs as a substring of
+    the string cell in any source column.  The source columns are dropped
+    from the frame that was passed in (in place), and a new frame with the
+    boolean columns appended is returned.
+    """
+    allcols = df_allheaders.columns
+    # Columns that contain the prefix somewhere in their name, excluding an exact match.
+    cols = allcols[np.asarray(allcols.map(lambda x: colprefix in x and x!=colprefix), dtype=bool)]
+
+    # Collect the distinct non-null values over all source columns.
+    ViewModifierCodeSequence_CodeMeaning = set()
+    for cc in cols:
+        ViewModifierCodeSequence_CodeMeaning |= set(df_allheaders[cc].dropna().unique())
+
+    # Boolean entries are not category labels; discard them if present.
+    for vv in (True, False):
+        if (vv in ViewModifierCodeSequence_CodeMeaning):
+            ViewModifierCodeSequence_CodeMeaning.remove(vv)
+        
+    # From here on the name refers to a dict: value -> indicator Series.
+    ViewModifierCodeSequence_CodeMeaning = dict(zip(
+            ViewModifierCodeSequence_CodeMeaning,
+           [None]*len(ViewModifierCodeSequence_CodeMeaning)))
+    
+    for kk in ViewModifierCodeSequence_CodeMeaning.keys():
+        # Start from an all-False Series that shares the index of the first source column.
+        ViewModifierCodeSequence_CodeMeaning[kk] = df_allheaders[cols[0]].copy()
+        ViewModifierCodeSequence_CodeMeaning[kk][:] = False
+        ViewModifierCodeSequence_CodeMeaning[kk] = \
+            ViewModifierCodeSequence_CodeMeaning[kk].astype(bool)
+        # OR together the matches from every source column; non-string cells never match.
+        for cc in cols:
+            ViewModifierCodeSequence_CodeMeaning[kk] |= df_allheaders[cc].map(lambda x: kk in x if type(x) is str else False) 
+
+
+    ViewModifierCodeSequence_CodeMeaning = pd.DataFrame(ViewModifierCodeSequence_CodeMeaning)
+    ViewModifierCodeSequence_CodeMeaning.columns = \
+        ViewModifierCodeSequence_CodeMeaning.columns.map(lambda x: colprefix + "_" + x.replace(" ",""))
+    
+    # The raw source columns are removed from the caller's frame in place.
+    for cc in cols:
+        df_allheaders.drop(cc, axis=1, inplace=True)
+    df_allheaders = pd.concat([df_allheaders, ViewModifierCodeSequence_CodeMeaning], axis=1)
+    return df_allheaders
+
+#############################33
+def normalize_fields(df_allheaders):
+    """Clean up and expand individual header columns of ``df_allheaders``.
+
+    Every step is applied only when its column is present.  Tuple-like text
+    is reduced to numbers, yes/no style fields become booleans, list-valued
+    text fields (view modifiers, image type) are expanded into one boolean
+    column per value, and gaps in a few columns are filled with the mode or
+    the median.  Most steps assign into the frame that was passed in; the
+    returned frame is the one to use afterwards.
+
+    NOTE(review): several tests compare against lower-case text ("yes",
+    "mg", "cc", "mlo") -- confirm that the input is lower-cased beforehand.
+    """
+    # ## Clean up
+    if "PatientAge" in df_allheaders.columns:
+        # "045y" -> 45; non-string cells (e.g. NaN) are passed through
+        # instead of crashing on `.lower()`.
+        df_allheaders.PatientAge = df_allheaders.PatientAge.map(
+            lambda x: int(x.lower().rstrip('y')) if type(x) is str else x)
+    if "DetectorActiveDimensions" in  df_allheaders.columns:
+        df_allheaders.DetectorActiveDimensions = df_allheaders.DetectorActiveDimensions.map(parse_float_tuples_prod)
+
+    # ### PixelSpacing
+    if "PixelSpacing" in  df_allheaders.columns:
+        df_allheaders.PixelSpacing = df_allheaders["PixelSpacing"].map(format_PixelSpacing)
+    if "ImagerPixelSpacing" in df_allheaders.columns:
+        df_allheaders.ImagerPixelSpacing = df_allheaders["ImagerPixelSpacing"].map(format_PixelSpacing)
+    if "ModalitiesInStudy" in df_allheaders.columns:
+        df_allheaders["ModalitiesInStudy"] = df_allheaders["ModalitiesInStudy"].map(lambda x: "mg" in str(x))
+    if "HalfValueLayer" in df_allheaders.columns:
+        df_allheaders["HalfValueLayer"] = df_allheaders["HalfValueLayer"].map(lambda x: x if type(x) is float else float(str(x).replace('b','').replace("'", '')))
+
+    # ### FieldOfViewDimensions
+    # computing area and filling in the gaps with the mode **worsens** the FNR
+
+    # df_allheaders['FieldOfViewDimensions'] = df_allheaders['FieldOfViewDimensions'].map(lambda x: np.prod([int(y) for y in eval(x)]) if type(x) is str else x)
+    # df_allheaders.loc[df_allheaders['FieldOfViewDimensions'].isnull(), 'FieldOfViewDimensions'] = df_allheaders['FieldOfViewDimensions'].value_counts().argmax()
+
+    if "ViewPosition" in df_allheaders.columns:
+        df_allheaders["ViewPosition"] = df_allheaders["ViewPosition"].map(lambda x: x in ['cc', 'mlo'])
+
+    df_allheaders = extract_list_text_field(df_allheaders, 
+        colprefix = "ViewModifierCodeSequence_CodeMeaning")
+
+    # ### BreastImplantPresent
+    if "BreastImplantPresent" in df_allheaders.columns:
+        df_allheaders['BreastImplantPresent'] = (
+            df_allheaders["BreastImplantPresent"].map(str).map(lambda x: "yes" in x))
+    if "PartialView" in df_allheaders:
+        df_allheaders["PartialView"] = df_allheaders["PartialView"].map(lambda x : "yes" in x if type(x) is str else False)
+
+    for kk in ["WindowWidth", "WindowCenter"]:
+        if kk in df_allheaders.columns:
+            df_allheaders[kk] = df_allheaders[kk].map(parse_int_tuples_median)
+
+    if "PatientOrientation" in df_allheaders.columns:
+        df_allheaders.PatientOrientation = df_allheaders.PatientOrientation.map(parse_str_tuples)
+    if "DetectorElementPhysicalSize" in df_allheaders.columns:
+        df_allheaders["DetectorElementPhysicalSize"] = df_allheaders.DetectorElementPhysicalSize.map(parse_float_tuples)
+
+    # ### Grid
+    if "Grid" in df_allheaders.columns:
+        # Strip tuple punctuation and fix the "parrallel" misspelling.  No
+        # separate fix-up of "('reciprocating', 'parrallel')" is needed:
+        # after this chain no value contains parentheses or quotes.
+        df_allheaders["Grid"] = (df_allheaders["Grid"]
+                             .map(str)
+                             .map(lambda x: x.replace('(','')
+                                             .replace(')','')
+                                             .replace("'","")
+                                             .replace(',','')
+                                             .replace("parrallel", "parallel")))
+
+    if "FieldOfViewOrigin" in df_allheaders.columns:
+        df_allheaders["FieldOfViewOrigin_x"] = df_allheaders.FieldOfViewOrigin.map(lambda x : get_index_from_int_tuple(x, 0))
+        df_allheaders["FieldOfViewOrigin_y"] = df_allheaders.FieldOfViewOrigin.map(lambda x : get_index_from_int_tuple(x, 1))
+        df_allheaders.drop("FieldOfViewOrigin", axis=1, inplace=True)
+
+    if "FocalSpots" in df_allheaders.columns: 
+        focal_counts = df_allheaders["FocalSpots"].value_counts()
+        if len(focal_counts) > 0:
+            # Fill gaps with the most frequent value.  idxmax() returns the
+            # label; argmax() returns a position on current pandas, which
+            # would fill every gap with 0.
+            df_allheaders.loc[df_allheaders["FocalSpots"].isnull(), "FocalSpots"] = focal_counts.idxmax()
+    for kk in ["PixelSpacing", "EstimatedRadiographicMagnificationFactor", "XRayTubeCurrent", "DistanceSourceToPatient"]:
+        if kk in df_allheaders.columns:
+            df_allheaders.loc[df_allheaders[kk].isnull(), kk] = df_allheaders[kk].median()
+    if "ImageType" in df_allheaders.columns:
+        keywords = set(chain(*(df_allheaders.ImageType.map(lambda x: parse_str_tuples(x)).tolist())))
+        # discard() instead of remove(): the empty keyword is not always present.
+        keywords.discard("")
+        for kk in  keywords:
+            key = "ImageType"+"_"+kk
+            # Substring test against the raw cell text.
+            df_allheaders[key] = df_allheaders.ImageType.map(lambda x: kk in x)
+        df_allheaders.drop("ImageType", axis=1, inplace=True)
+
+    return df_allheaders
+
+
+def move_digits_back(allcolumns):
+    """Move a leading index token to the end of each column name.
+
+    Names whose first character is an ASCII digit are split on ``"_"`` and
+    their first part is appended after the rest, e.g.
+    ``"0_ViewCodeSequence_CodeMeaning"`` becomes
+    ``"ViewCodeSequence_CodeMeaning_0"``.  Other names are kept as they are.
+    A new list is returned.
+    """
+    digits = set(list('0123456789'))
+    renamed = []
+    for name in list(allcolumns):
+        if name[0] in digits:
+            parts = name.split("_")
+            name = "_".join(parts[1:] + parts[:1])
+        renamed.append(name)
+    return renamed
+
+def get_features(df_allheaders, thr_stderr = 1e-6):
+    # df_allheaders.columns = move_digits_back(df_allheaders.columns)
+
+    df_allheaders = normalize_fields(df_allheaders.copy())
+    text_fields = select_text_fields(df_allheaders)
+    # df_allheaders[text_fields].apply(entropy).hist()
+
+    if  thr_stderr >0:
+        field_list = get_good_numeric_fields(df_allheaders,thr_stderr=thr_stderr)
+    field_list = list(set(clean_up_field_list(field_list + text_fields)))
+
+    df_allheaders = make_lowercase_text_fields(df_allheaders)
+
+    # pd.crosstab(df_allheaders['0_ViewCodeSequence_CodeMeaning'], df_allheaders['ViewPosition'])
+    # informative_cols = ['Filename', 'AccessionNumber','BreastImplantPresent','DistanceSourceToPatient','EstimatedRadiographicMagnificationFactor',
+    #                  'FocalSpots','Grid','PixelSpacing','XRayTubeCurrent', 'ViewPosition', 'PartialView']
+
+    informative_cols = ['Filename', 'AccessionNumber'] + field_list
+
+    feature_columns = informative_cols[2:]
+
+    noncategorical = ['ContentTime',
+                     'FieldOfViewOrigin_x',
+                     'FieldOfViewOrigin_y',
+                     'HalfValueLayer',
+                     'WindowWidth',
+                     'CompressionForce',
+                    'DetectorActiveDimensions',
+                    'RelativeXRayExposure',
+                    'ExposureTime',
+                    'Exposure',
+                    'BodyPartThickness',
+                    'FieldOfViewOrigin_y',
+                    'CollimatorLowerHorizontalEdge',
+                    'WindowCenter',
+                    'FieldOfViewRotation',
+                    'KVP',
+                    'DistanceSourceToDetector',
+                    'DistanceSourceToEntrance',
+                    'CollimatorLeftVerticalEdge',
+                    'DetectorTemperature',
+                    'HighBit'] 
+    categorical = ['Manufacturer',
+                    'ManufacturerModelName',
+                    'Grid_htc',
+                    'ViewModifierCodeSequence_CodeMeaning',
+                    'ViewModifierCodeSequence_CodeMeaning']
+
+    noncategorical = list(set(feature_columns) & set(noncategorical))
+    potentially_categorical = (set(feature_columns) - set(noncategorical))
+    potentially_categorical |= set(categorical) & set(df_allheaders.columns)
+    potentially_categorical = list(potentially_categorical)
+    print("potentially_categorical", len(potentially_categorical))
+    print("non_categorical", len(noncategorical))
+    for cc in noncategorical:
+        if str(df_allheaders[cc].dtype) == 'object':
+            df_allheaders[cc] = df_allheaders[cc].map(parse_float).astype(float)
+    if len(potentially_categorical)>0:
+        df_allheaders[potentially_categorical] = df_allheaders[potentially_categorical].fillna('unknown')
+        features_onehot = pd.get_dummies(df_allheaders[potentially_categorical], 
+                            drop_first=True, prefix_sep='=')
+        features_onehot = pd.concat([features_onehot, df_allheaders[noncategorical]], axis=1) 
+    else:
+        print("no features to binarise!")
+        features_onehot = df_allheaders[non_categorical].copy()
+
+    #features_onehot = pd.concat([df_allheaders.Filename, features_onehot],axis=1,).set_index("Filename")
+
+    features_onehot.shape, features_onehot.dropna().shape
+
+    # ### Map DICOM  file name to PNG file name (remove directories)
+    #features_onehot.index = features_onehot.index.map(lambda x: "_".join(x.split("/")[-4:]).replace(".dcm", ".png")).tolist()
+    for cc in features_onehot.columns[features_onehot.isnull().any()]:
+        print("filling in with median:\t%s" % cc)
+        features_onehot.loc[features_onehot[cc].isnull(),cc] = \
+                features_onehot[cc].median()
+    features_onehot = features_onehot.loc[:,~features_onehot.isnull().any()]
+
+    onehotcols = np.asarray(features_onehot.columns[features_onehot.dtypes.map(lambda x : x is pd.np.dtype("uint8"))].tolist())
+    thr_frac = 0.01
+    bad_feature_cols = onehotcols[(features_onehot[onehotcols].sum(0) < 5) |
+                                  (features_onehot[onehotcols].mean(0) < thr_frac) |
+                                  (features_onehot[onehotcols].mean(0) > (1-thr_frac))]
+    len(bad_feature_cols)
+    features_onehot.drop(bad_feature_cols, axis=1, inplace=True)
+    if "FocalSpots" in features_onehot:
+        features_onehot.loc[features_onehot["FocalSpots"].isnull(), "FocalSpots"] = \
+                features_onehot["FocalSpots"].value_counts().argmax()
+
+    return features_onehot
+
+
+#############################
+if __name__ == '__main__':
+    PREFIX="allfeatures"
+
+    # !sudo pip3 install dicom 
+    # # read a table of DICOM headers
+    filelist_fn = '/home/dlituiev/data_dlituiev/manuallabeller/filelist/filelist_nonscreening_4000_seed42.csv'
+    outpath = os.path.join(os.path.dirname(filelist_fn), "dicom_headers_all_fields_" + os.path.basename(filelist_fn))
+    print(outpath)
+    df_allheaders = pd.read_csv(outpath, index_col=0)
+    features_onehot = get_features(df_allheaders)
+
+    # ## Read labels
+    fn_man_labels = "/data/dlituiev/tables/cleaned_manual_labels_valset_4000.txt"
+    df = pd.read_table(fn_man_labels, index_col=0)
+    df.index = df.index.map(lambda x : x.split("/")[-1])
+
+    # process labels
+    df["special_view"] = df["regular_view"].map(lambda x: not x)
+
+
+    dfm = pd.merge(df[["special_view"]], features_onehot, how='left', left_index=True, right_index=True)
+    dfm.shape
+
+    import seaborn as sns
+    import matplotlib.pyplot as plt
+    from statsmodels.graphics.mosaicplot import mosaic
+    plt.matplotlib.rcParams["hatch.color"] = [0.7]*3
+
+    dfm.var()
+    dfm.isnull().sum()
+    dfm.plot(x='special_view', y='XRayTubeCurrent', kind='scatter', alpha=0.05)
+    dfm.plot(x='special_view', y='DistanceSourceToPatient', kind='scatter', alpha=0.05)
+    dfm["special_view"].isnull().sum()
+
+
+    target = dfm["special_view"]
+    features = dfm.drop("special_view", axis=1)
+
+
+    from sklearn.utils import shuffle
+    # for building and visualizing the decision tree
+    from sklearn.naive_bayes import GaussianNB, BernoulliNB
+    # from sklearn.svm import SVC
+    from sklearn.tree import DecisionTreeClassifier
+    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
+    # visualization
+    from vis_tree import visualize_tree
+    from sklearn.model_selection import train_test_split, cross_val_score
+    from sklearn.metrics import (accuracy_score, auc, confusion_matrix, f1_score,
+                                 precision_score, roc_curve, precision_recall_curve)
+
+
+
+
+    y_dev,  y_val, X_dev, X_val = train_test_split(target, features, random_state=0, test_size=1/6)
+
+    y_tr,  y_ts, X_tr, X_ts = train_test_split(y_dev, X_dev, random_state=0, test_size=1/5)
+
+
+
+
+    # dtree = DecisionTreeClassifier(max_depth=10, min_samples_leaf=5, criterion="entropy")
+    # dtree = RandomForestClassifier(min_samples_split=10, min_samples_leaf=5)
+    # dtree = AdaBoostClassifier(base_estimator=dtree, n_estimators=60, learning_rate=0.01)
+    # dtree = AdaBoostClassifier(base_estimator=GaussianNB(), n_estimators=50, learning_rate=0.01)
+
+
+
+
+    dtree = GradientBoostingClassifier(max_depth=8, n_estimators=40, learning_rate=0.05, min_samples_leaf=12)
+    modelname = str((dtree).__class__).split(".")[-1].rstrip(""" "'> """).lstrip('"')
+
+
+
+
+    dtree.fit(X_tr, y_tr)
+    pred_y_ts = dtree.predict(X_ts)
+    pred_yscore_ts = dtree.predict_proba(X_ts)
+
+
+
+
+    get_ipython().magic('pinfo auc')
+
+
+
+
+    pr_, rec_, thresholds = precision_recall_curve(y_ts.tolist(), pred_yscore_ts[:,1], pos_label=1)
+    # auc_pr = auc(pr_, rec_)
+
+    plt.plot(pr_, rec_)
+    plt.xlabel('Precision')
+    plt.ylabel('Recall')
+    # plt.title('auPRC = {0:.2f}%'.format(auc_pr))
+    plt.xlim([0,1])
+    plt.ylim([0,1])
+    plt.axis('equal')
+    plt.axis('square')
+
+    print("%.2f" % (100*auc_))
+    frmt = 'png'
+    plt.savefig("{}_{}_auc.{}".format(PREFIX, modelname, frmt), dpi=300, format=frmt)
+
+
+
+
+    fpr_, tpr_, thresholds = roc_curve(y_ts.tolist(), pred_yscore_ts[:,1], pos_label=1)
+    auc_ = auc(fnr_, tpr_)
+
+    plt.plot(fpr_, tpr_)
+    plt.xlabel('False Positive Rate')
+    plt.ylabel('True Positive Rate')
+    plt.title('AUC = {0:.2f}%'.format(auc_))
+    plt.axis('equal')
+    plt.axis('square')
+
+    print("%.2f" % (100*auc_))
+    frmt = 'png'
+    plt.savefig("{}_{}_auc.{}".format(PREFIX, modelname, frmt), dpi=300, format=frmt)
+
+
+
+
+    # pd.DataFrame(dict(FNR=fnr_, TPR=tpr_, threshold=thresholds))
+    features.plot(x="EstimatedRadiographicMagnificationFactor", y="PixelSpacing", kind='scatter')
+
+
+
+
+    fig,ax = plt.subplots(1, figsize=(6,14))
+    feat_imp = pd.Series(dtree.feature_importances_, index=features.columns)
+    feat_imp = feat_imp[feat_imp>0.0].sort_values()[::-1]
+    feat_imp[::-1].plot(kind='barh', ax=ax)
+    print(feat_imp)
+    # plt.xlim([0,0.5])
+    # plt.tight_layout()
+    frmt = 'png'
+    plt.savefig("{}_{}_feature_importances.{}".format(PREFIX, modelname, frmt), dpi=300, format=frmt)
+
+
+
+
+
+
+
+
+
+    len(thresholds)
+
+
+
+
+    # pd.DataFrame(dict(
+    #     FNR=fnr_,
+    #     TPR=tpr_,
+    #     threshold = thresholds))
+
+
+
+
+    df_confusion = pd.crosstab(pd.Series(y_ts.as_matrix(), name="observed"), pd.Series(pred_y_ts, name="predicted"))
+    df_confusion
+    confusion_matrix(y_ts, pred_y_ts)
+    cm = confusion_matrix(y_ts, pred_y_ts)
+    cm[1,0]/cm[1,:].sum()
+    def fnr(dtree, X_val, y_val, thr = None):
+        if not thr:
+            pred_y_val = dtree.predict(X_val)
+        else:
+            pred_y_val = dtree.predict_proba(X_val)[:,1] > thr
+    #     df_confusion = pd.crosstab(pd.Series(np.asarray(y_val), name="observed"),
+    #                                pd.Series(pred_y_val, name="predicted"))
+    #     out = df_confusion[False][True] / (df_confusion[False][True] + df_confusion[True][True])
+        
+        cm = confusion_matrix(y_val, pred_y_val)
+        out = cm[1,0]/cm[1,:].sum()
+        return out
+
+
+
+
+    def fpr(dtree, X_val, y_val, thr = None):
+        if not thr:
+            pred_y_val = dtree.predict(X_val)
+        else:
+            pred_y_val = dtree.predict_proba(X_val)[:,1] > thr
+    #     df_confusion = pd.crosstab(pd.Series(np.asarray(y_val), name="observed"),
+    #                                pd.Series(pred_y_val, name="predicted"))
+    #     out = df_confusion[True][False] / (df_confusion[False][False] + df_confusion[True][False])
+        
+        
+        cm = confusion_matrix(y_val, pred_y_val)
+        if cm[0,:].sum() !=0:
+            out = cm[0,1]/cm[0,:].sum()
+        else:
+            out = 0.0
+        return out
+
+
+
+
+
+
+
+
+
+    THR = 0.15
+
+
+    #          True | False
+    #     True   TP |  FN
+    #     False  FP |  TN
+    # 
+    # 
+    #     FPR = FP / (FP + TN)
+    # 
+
+
+
+    pred_y_ts = dtree.predict_proba(X_ts)[:,1] > THR
+    df_confusion = pd.crosstab(pd.Series(y_ts.as_matrix(), name="observed"), pd.Series(pred_y_ts, name="predicted"))
+    print(df_confusion.to_csv(sep='|'))
+
+
+
+
+    THR = 0.05
+
+    modelname = str((dtree).__class__).split(".")[-1].rstrip(""" "'> """).lstrip('"')
+    cv_fnr = cross_val_score(dtree, X_dev, y_dev, groups=None, scoring=partial(fnr, thr=THR), cv=5, n_jobs=1, pre_dispatch='2*n_jobs')
+    cv_fpr = cross_val_score(dtree, X_dev, y_dev, groups=None, scoring=partial(fpr, thr=THR), cv=5, n_jobs=1, pre_dispatch='2*n_jobs')
+
+    tmpstr = """model: {}
+    threshold = {}
+    + on the hold-out set:\tFNR = {:.2f}%, FPR = {:.2f}%
+    + in 5-fold cross-validation (mean):\tFNR = {:.2f}%, FPR = {:.2f}%""".format(
+        modelname, THR, 
+        100*fnr(dtree, X_ts, y_ts, thr = THR), 100*fpr(dtree, X_ts, y_ts, thr = THR),
+        100*cv_fnr.mean(), 100*cv_fpr.mean())
+    print(tmpstr)
+
+
+
+
+    THR = 0.5
+    modelname = str((dtree).__class__).split(".")[-1].rstrip(""" "'> """).lstrip('"')
+    cv_fnr = cross_val_score(dtree, X_dev, y_dev, groups=None, scoring=partial(fnr, thr=THR), cv=5, n_jobs=1, pre_dispatch='2*n_jobs')
+    cv_fpr = cross_val_score(dtree, X_dev, y_dev, groups=None, scoring=partial(fpr, thr=THR), cv=5, n_jobs=1, pre_dispatch='2*n_jobs')
+
+    tmpstr = """model: {}
+    threshold = {}
+    + on the hold-out set:\tFNR = {:.2f}%, FPR = {:.2f}%
+    + in 5-fold cross-validation (mean):\tFNR = {:.2f}%, FPR = {:.2f}%""".format(
+        modelname, THR, 
+        100*fnr(dtree, X_ts, y_ts, thr = THR), 100*fpr(dtree, X_ts, y_ts, thr = THR),
+        100*cv_fnr.mean(), 100*cv_fpr.mean())
+    print(tmpstr)
+
+
+
+
+    # Leftover notebook cells.  Bare expressions below were displayed by the
+    # notebook; when run as a script their values are discarded.
+    6/72
+
+
+    # ## fnr
+    # 0.1443 -- AdaBoostClassifier(50, lr=0.1) with:
+    # 
+    # 
+    #     DecisionTreeClassifier(max_depth=7, min_samples_leaf=5, criterion="entropy")
+    #     GaussianNB()
+    # 
+    # 0.1134 -- AdaBoostClassifier(50, lr=0.01) with:
+    #     GaussianNB()
+
+
+
+    # NOTE(review): `pred_y_val` is only ever assigned as a local inside
+    # fnr()/fpr(); no assignment at this level is visible, so the next three
+    # metric calls presumably raise NameError -- confirm and define it
+    # (e.g. from dtree.predict(X_val)) if these cells are still wanted.
+    accuracy_score(y_true=y_val, y_pred=pred_y_val)
+
+
+
+
+    f1_score(y_true=y_val, y_pred=pred_y_val)
+
+
+
+
+
+
+
+
+
+    confusion_matrix(y_true=y_val, y_pred=pred_y_val)
+
+
+
+
+
+    # NOTE(review): `pred_yscore_dev` is not assigned anywhere in the visible
+    # script.  It is combined row-by-row with y_val here, so it presumably
+    # held predict_proba output for X_val -- confirm.
+    # NOTE(review): Series.as_matrix() is unavailable in pandas >= 1.0.
+    df_confusion = pd.crosstab(pd.Series(y_val.as_matrix(), name="observed"), 
+                               pd.Series(pred_yscore_dev[:,1]>0.15, name="predicted"))
+    df_confusion
+
+
+
+
+    # crosstab columns are the predicted values, rows the observed values:
+    # df_confusion[False][True] is observed True / predicted False, so this
+    # ratio is FN / (FN + TP)
+    df_confusion[False][True] / (df_confusion[False][True] + df_confusion[True][True])
+
+
+
+
+    # FP / (TN + FP)
+    df_confusion[True][False] / (df_confusion[False][False] + df_confusion[True][False])
+
+
+
+
+    109/(385+109)
+
+
+    # ## Misclassified: examples and comments
+
+
+
+    # pred_false = (pd.Series(pred_y_val, name="predicted")==False)
+    # NOTE(review): `(score < 0.15) == False` is True for scores >= 0.15, i.e.
+    # for samples predicted positive, so despite their names `pred_false` and
+    # `false_negatives` select observed-positive AND predicted-positive rows.
+    # Looks inverted -- confirm.
+    pred_false = (pd.Series(pred_yscore_dev[:,1]<0.15, name="predicted")==False)
+    false_negatives = (pd.Series(y_val.as_matrix(), name="observed")) & pred_false
+    false_negatives.index=y_val.index
+    false_negatives.shape, df.shape
+    # y_val[false_negatives.tolist()].shape
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+    # Hand-written review notes, one "<png file name> -- <comment>" per line
+    xstr = """1805162996_1.2.840.113654.2.70.1.75424722723272471565664976911416714890_2_37.png -- implant?
+    1433463766_1.2.840.113654.2.70.1.243422935316700791950696878743366703411_6_6.png -- male?
+    3395322213_1.2.840.113654.2.70.1.161905211577383187509354224390811944382_1161_7.png -- overexposed with scale grid
+    1383662805_1.2.840.113654.2.70.1.194667288082835549565211946781626641146_1_88.png -- mag? bars in the image
+    5717508670_1.2.840.113654.2.70.1.135196805563780165444562848954663016070_2_6.png -- spot
+    1582554801_1.2.840.113654.2.70.1.202883517655342643705007475928329105895_1_1.png -- strange shape; plate
+    3248534628_1.2.840.113654.2.70.1.153327658320065917717726871735320153117_14_8.png -- RLMID, implant
+    1050998385_1.2.840.113654.2.70.1.294672228525412928579179278566440354700_168_12.png -- RMLO, underexposed, plate
+    2431514667_1.2.840.113654.2.70.1.132697486450403983700631264913146412468_1_1.png -- regular CC
+    2836025574_1.2.840.113654.2.70.1.94728406891527814842052605970255602447_31728_4.png  -- regular CC, wire?
+    2774547752_1.2.840.113654.2.70.1.152335331945150793610356395498084601027_47428_6.png  -- poor exposure?
+    6784971236_1.2.840.113654.2.70.1.276140387730485551768768734852859745761_21705_2.png  -- regular CC
+    6120027884_1.2.840.113654.2.70.1.202389441802705593488291262945242015864_28128_3.png -- spot
+    2127109953_1.2.840.113654.2.70.1.136443797025605972119376095795980286524_5_26.png  -- RML, scar
+    5015120217_1.2.840.113654.2.70.1.8576402180164318136049174781190805706_19615_3.png -- regular MLO, underexposure
+    2915273528_1.2.840.113654.2.70.1.50904067248781976561131370015339684052_3_51.png -- RLM
+    2859796079_1.2.840.113654.2.70.1.248757700026158935826319533755178408586_3_51.png -- LMLO, scar""".split("\n")
+
+
+
+
+    # Split every note on " -- ", strip surrounding whitespace, and keep the
+    # comments as a Series indexed by file name
+    df_misclassified_comments = pd.DataFrame([x.split(" -- ") for x in xstr], columns=["Filename", "comment"]).applymap(lambda x: x.rstrip().lstrip()).set_index("Filename")["comment"]
+    df_misclassified_comments
+
+
+
+
+    # NOTE(review): these selections index X_val by the raw column names
+    # 'ViewPosition' / 'ViewModifierCodeSequence'; whether the feature table
+    # still has such columns at this point is not visible here -- verify.
+    df_misclassified_comments[false_negatives & X_val[false_negatives]['ViewPosition'] & ~X_val[false_negatives]['ViewModifierCodeSequence'] ]
+
+
+
+
+    df_misclassified_comments[false_negatives & X_val[false_negatives]['ViewPosition'] & ~X_val[false_negatives]['ViewModifierCodeSequence'] ]
+
+
+
+
+    X_val.columns
+
+
+
+
+    # X_val[false_negatives][['ViewPosition_ccid', 'ViewPosition_lm', 'ViewPosition_lmid',
+    #        'ViewPosition_ml', 'ViewPosition_mlo', 'ViewPosition_mloid',
+    #        'ViewPosition_xccl', "FieldOfViewDimensions_('145', '105')"]]
+
+    X_val[false_negatives][['ViewPosition', 
+                           'ViewModifierCodeSequence']]
+
@@ -0,0 +1,97 @@
+
+# coding: utf-8
+
+#cell#
+
+import pandas as pd
+import sys
+from header_cleaner import get_features, normalize_fields, parse_float_tuples, parse_float
+
+#cell#
+fn_features = "../tables/df_all_mammos_dicom_headers_selected.tab.gz"
+outfn = "../tables/df_all_mammos_dicom_headers_selected_norm.tab"
+
+dffeatures = pd.read_table(fn_features, index_col="filename")
+
+#cell#
+mask_nonnumeric = ~dffeatures["ContentTime"].map(lambda x: isinstance(x, float) | isinstance(x, int))
+dffeatures.loc[mask_nonnumeric, "ContentTime"] = dffeatures["ContentTime"][mask_nonnumeric].map(lambda x: float(x.replace(':','').replace('--',"30")))
+
+#cell#
+print("shape", dffeatures.shape)
+
+#cell#
+normalize_fun = {"0_ViewCodeSequence__0_ViewModifierCodeSequence_CodeMeaning":
+                lambda x: str(x).lower(),
+                "0_ViewCodeSequence_CodeValue": lambda x: str(x),
+                "Grid": lambda x: str(x).replace("'","")
+                                       .replace("(","").replace(")","")
+                                       .replace(",","").replace("/"," ")
+                                       .replace('PARRALLEL',"PARALLEL")
+                                       .lower(),
+                "HighBit": lambda x: str(int(x)) if (isinstance(x, float) and x*1==x) else str(x),
+                "WindowCenter": lambda x: np.median(parse_float_tuples(x)),
+                "FieldOfViewOrigin":parse_float_tuples,
+                "EstimatedRadiographicMagnificationFactor": lambda x: x,
+                "ContentTime": lambda x: x,
+                "FieldOfViewRotation": lambda x: float(parse_float(x)),
+                "KVP": lambda x: float(parse_float(x)),
+                 "ShutterLowerHorizontalEdge":  lambda x: float(parse_float(x)),
+                 "ShutterRightVerticalEdge":   lambda x: float(parse_float(x)),
+                 "XRayTubeCurrentInuA": lambda x: float(parse_float(x)),
+                 "RelativeXRayExposure": lambda x: float(parse_float(x)),
+                 "ManufacturerModelName": lambda x: str(x).lower().replace('"',''),
+                 "Manufacturer": lambda x: str(x).lower().replace('"','').replace(',', '').replace(" inc", "").rstrip('.'),
+                 "BodyPartThickness":lambda x: float(parse_float(x)),
+                 "CollimatorLeftVerticalEdge": lambda x: float(parse_float(x)),
+                 "CollimatorLowerHorizontalEdge": lambda x: float(parse_float(x)),
+                 "DetectorActiveDimensions" : lambda x: parse_float_tuples(x.replace("\\", ", ") if isinstance(x, str) else x),
+                 "ExposureTime": lambda x: x,
+                 "ExposuresOnDetectorSinceLastCalibration": lambda x: x,
+                 "ExposuresOnDetectorSinceManufactured": lambda x: x,
+                 "DistanceSourceToEntrance":  lambda x: x,
+                 "DetectorTemperature":lambda x: float(parse_float(x)),
+                 "DistanceSourceToDetector":  lambda x: x,
+                 
+}
+
+dtypes = {"0_ViewCodeSequence__0_ViewModifierCodeSequence_CodeMeaning": str,
+                "0_ViewCodeSequence_CodeValue": str,
+                "Grid": str,
+                "HighBit": str, # int
+                "WindowCenter": int,
+                "FieldOfViewOrigin": 'O',
+                "EstimatedRadiographicMagnificationFactor": float,
+                "ContentTime": float, #NaN
+                "FieldOfViewRotation": float,
+                "KVP": float,
+                 "ShutterLowerHorizontalEdge": float,
+                 "ShutterRightVerticalEdge": float,
+                 "XRayTubeCurrentInuA": float,
+                 "RelativeXRayExposure": float,
+                 "ManufacturerModelName": str,
+                 "Manufacturer": str,
+                 "BodyPartThickness": float,
+                 "CollimatorLeftVerticalEdge": float,
+                 "CollimatorLowerHorizontalEdge": float,
+                 "DetectorActiveDimensions" : 'O',
+                 "ExposureTime": float,
+                 "ExposuresOnDetectorSinceLastCalibration": float, # NaNs
+                 "ExposuresOnDetectorSinceManufactured": float, # NaNs
+                 "DistanceSourceToEntrance": float,
+                 "DetectorTemperature": float,
+                 "DistanceSourceToDetector": float,
+                 
+}
+
+#cell#
+
+set(dffeatures.columns) - set(normalize_fun.keys())
+
+#cell#
+
+for kk, vv in dffeatures.items():
+    print(kk)
+    dffeatures.loc[:,kk] = vv.map(normalize_fun[kk]).astype(dtypes[kk])
+
+dffeatures.to_csv(outfn, sep='\t',  compression='gzip')
@@ -0,0 +1,48 @@
+ReduceLROnPlateau:
+  cooldown: 32
+  epsilon: 0.001
+  factor: 0.5
+  min_lr: 1.0e-08
+  mode: auto
+  monitor: val_loss
+  patience: 32
+  verbose: 0
+base_trainable: true
+batch_size: 256
+class_mode: binary
+class_weights: null
+classes:
+- normal
+- special
+contrast: null
+data_augmentation: true
+data_train: /data/UCSF_MAMMO/2018-02-png/withx_valset_4000_train
+data_val: /data/UCSF_MAMMO/2018-02-png/withx_valset_4000_test
+dropout: 0.5
+fill_mode: reflect
+final_activation: sigmoid
+height_shift_range: 0.125
+horizontal_flip: true
+init_epoch: 0
+loss_weights: null
+lr: 0.0001
+n_classes: 1
+nb_epoch: 500
+ndense: 0
+oversampling: false
+pretrained: true
+rotation_range: 15
+samplewise_center: false
+seed: 2
+target_side: 99
+target_size:
+- 99
+- 99
+truncate_quantile: null
+vertical_flip: false
+weightfile: null
+width_shift_range: 0.125
+zoom_range:
+- 0.8
+- 1.2
+ztransform: false
@@ -0,0 +1 @@
+../inception_short.py
@@ -0,0 +1,185 @@
+
+# coding: utf-8
+import sys
+import pandas as pd
+sys.path.append('../..')
+
+from inception_short import get_model, get_num_files, get_class_weights
+from keras.optimizers import Adam
+from image import ImageDataGenerator
+# from keras.preprocessing.image import ImageDataGenerator
+from keras.models import load_model
+from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau
+from checkpoint_utils import CSVWallClockLogger, lr_cyclic_schedule
+from shutil import copy2
+from functools import partial
+
+class AttrDict(dict):
+    def __init__(self, *args, **kwargs):
+        super(AttrDict, self).__init__(*args, **kwargs)
+        self.__dict__ = self
+
+import os
+import yaml
+import numpy as np
+import keras
+from hashlib import md5
+os.environ["PYTHONHASHSEED"]='0'
+os.environ['KERAS_BACKEND'] = 'tensorflow'
+os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
+
+if os.environ["CUDA_VISIBLE_DEVICES"] == '':
+    os.environ["CUDA_VISIBLE_DEVICES"] = '1'
+
+
+indir = "./"
+
+import yaml
+with open(os.path.join(indir, "checkpoint.info")) as chkpt_fh:
+    prms = AttrDict(yaml.load(chkpt_fh))
+    print("\n".join(["%s\t%s" %(kk,vv) for kk,vv in prms.items()]),)
+
+weightfile = os.environ["WFILE"]
+#weightfile = "model.175-0.068012.hdf5"
+prms['weightfile'] =  weightfile
+prms['weightfile'] = os.path.join(indir, prms['weightfile'])
+prms['weightfile']
+
+
+# In[6]:
+
+
+prms["loss"] = '{}_crossentropy'.format( prms.class_mode )
+print("loss:", prms["loss"])
+
+# CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
+
+SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
+STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
+
+print('='*50)
+print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
+print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
+print('='*50)
+#########################################
+
+if prms.weightfile:
+    print("LOADING WEIGHTS FROM:\t%s" % prms.weightfile)
+#     model.load_weights(prms.weightfile)
+    model = load_model(prms.weightfile)
+
+
+# In[22]:
+
+
+flowfromdir_params = dict(
+#     color_mode = "grayscale",
+    target_size=prms.target_size,
+    batch_size=prms.batch_size,
+    class_mode=prms.class_mode,
+    classes=prms.classes,
+    seed=prms.seed)
+
+norm_params = dict(
+        #rescale=prms.scaleup,
+        samplewise_center=prms.samplewise_center,
+        samplewise_std_normalization=prms.samplewise_center,
+        featurewise_center=False,
+        featurewise_std_normalization=False,
+        zca_whitening=False,
+        )
+
+
+# In[23]:
+
+
+train_datagen = ImageDataGenerator(**norm_params)
+
+train_datagen.preprocessing_function = lambda x: x[...,::-1,:]#*2**-8
+datagen_train_output = train_datagen.flow_from_directory(
+    prms.data_train,
+    #stratify = prms.oversampling,
+    #sampling_factor=prms.sampling_factor,
+    #oversampling=prms.oversampling,
+    shuffle=False, **flowfromdir_params)
+SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
+STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
+
+##########################################
+def get_predictions(data_dir, 
+                    preprocessing_function = lambda x:x,
+                    model=model):
+    if isinstance(preprocessing_function, str):
+        if preprocessing_function == 'fliplr':
+            preprocessing_function = lambda x: x[...,::-1,:]
+        elif preprocessing_function in ('identity', 'orig'):
+            preprocessing_function = lambda x:x
+        else:
+            raise ValueError('unknown preprocessing_function:\t%s' 
+                             % preprocessing_function)
+
+    val_datagen = ImageDataGenerator(**norm_params)
+    val_datagen.preprocessing_function = preprocessing_function
+    datagen_val_output = val_datagen.flow_from_directory(
+            data_dir,
+            shuffle=False, **flowfromdir_params)
+
+    gen_ = datagen_val_output 
+    yhat = model.predict_generator(gen_,
+                          steps=len(gen_),
+                          verbose=1,)
+
+    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
+    dfdict.update({ "files":gen_.filenames, "label": gen_.classes})
+    dfres = pd.DataFrame(dfdict)
+    return dfres
+##########################################
+#                HOLDOUT 
+##########################################
+data_holdout = '/data/UCSF_MAMMO/2018-02-png/withx_valset_4000_val'
+dfres = get_predictions(
+                data_holdout, 
+                preprocessing_function = lambda x:x,
+                model=model)
+dfres.to_csv("predictions_val.csv", index=False)
+##########################################
+preprocessing_function = lambda x: x[...,::-1,:]
+dfres = get_predictions(
+                data_holdout, 
+                preprocessing_function = preprocessing_function,
+                model=model)
+
+dfres.to_csv("predictions_val_fliplr.csv", index=False)
+##########################################
+#                Test 
+##########################################
+
+dfres = get_predictions(
+                prms.data_val,
+                preprocessing_function = lambda x:x,
+                model=model)
+dfres.to_csv("predictions_test.csv", index=False)
+##########################################
+
+preprocessing_function = lambda x: x[...,::-1,:]
+dfres = get_predictions(
+                prms.data_val,
+                preprocessing_function = preprocessing_function,
+                model=model)
+dfres.to_csv("predictions_test_fliplr.csv", index=False)
+##########################################
+#                 TRAIN
+##########################################
+dfres = get_predictions(
+                prms.data_train,
+                preprocessing_function = lambda x:x,
+                model=model)
+dfres.to_csv("predictions_train.csv", index=False)
+##########################################
+preprocessing_function = lambda x: x[...,::-1,:]
+dfres = get_predictions(
+                prms.data_train,
+                preprocessing_function = preprocessing_function,
+                model=model)
+dfres.to_csv("predictions_train_fliplr.csv", index=False)
+
@@ -0,0 +1,239 @@
+from inception_short import get_model, get_num_files, get_class_weights
+from keras.optimizers import Adam
+from image import ImageDataGenerator
+#from keras.preprocessing.image import ImageDataGenerator
+from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau
+from checkpoint_utils import CSVWallClockLogger, lr_cyclic_schedule
+from shutil import copy2
+from functools import partial
+
+class AttrDict(dict):
+    def __init__(self, *args, **kwargs):
+        super(AttrDict, self).__init__(*args, **kwargs)
+        self.__dict__ = self
+
+if __name__ == '__main__':
+    # Training entry point: everything below runs only when the file is
+    # executed as a script.
+    import sys
+    import os
+    import yaml
+    import numpy as np
+    import keras
+    from hashlib import md5
+    # NOTE(review): PYTHONHASHSEED is normally consulted at interpreter
+    # start-up, so assigning it here presumably does not change hashing in
+    # this process — confirm.
+    os.environ["PYTHONHASHSEED"]='0'
+    # NOTE(review): keras.optimizers / keras.callbacks are imported at the top
+    # of this file, i.e. before this assignment runs, so the backend choice
+    # may come too late to take effect — confirm.
+    os.environ['KERAS_BACKEND'] = 'tensorflow'
+    os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
+    # Expose only CUDA device index 1 to this process.
+    os.environ["CUDA_VISIBLE_DEVICES"] = '1'
+
+    prms = AttrDict(
+        dropout=0.5,
+        base_trainable=True,
+        horizontal_flip = True,
+        vertical_flip = False,
+        zoom_range = [0.8, 1.2],
+        rotation_range = 15,
+        fill_mode='reflect',
+        ndense=0,
+        batch_size = 128*2,
+        init_epoch=0,
+        nb_epoch = 500,
+        data_augmentation = True,
+        contrast = None, #0.8,
+        truncate_quantile = None,#0.001,
+        ztransform = False,
+        oversampling = False,
+        #sampling_factor = None, [1, 6, 16, 64, 4],
+        seed=2,
+        width_shift_range = 0.125,
+        height_shift_range = 0.125,
+        class_mode =  'binary', # 'binary', #
+        n_classes = 1,
+        final_activation = 'sigmoid',
+        lr = 1e-4,
+        samplewise_center = False, #True
+        target_side = 99,
+        weightfile = None,
+        pretrained = True,
+        data_train = '/data/UCSF_MAMMO/2018-02-png/withx_valset_4000_train',
+        data_val = '/data/UCSF_MAMMO/2018-02-png/withx_valset_4000_test',
+        classes = ['normal', 'special'],
+        class_weights=None,#[1, 1, 4, 8, 4],
+        loss_weights = None,
+        ReduceLROnPlateau = dict(
+            monitor='val_loss',
+            factor=1/2,
+            patience=32,
+            verbose=0,
+            mode='auto', epsilon=0.001,
+            cooldown=32,
+            min_lr=1e-8,
+            ),
+#        lr_cyclic_schedule = dict(
+#                #lr_init = 1.0e-3,
+#                drop = 2/5,
+#                epochs_drop = 20,
+#                cycle_len = 200.0
+#            )
+        )
+    
+
+    # The run is identified by the MD5 of the parameter dict's string form.
+    # The hash is taken BEFORE "target_size" and "loss" are added below, so
+    # those two derived entries do not influence the directory name.
+    paramhash = md5(str(prms).encode()).hexdigest()
+
+    # Square input: [side, side].
+    prms["target_size"] = [ prms.target_side ]*2
+
+    CHECKPOINT_DIR = "checkpoints/" + paramhash + "/"
+    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
+    print("SAVING TO:\t%s" % CHECKPOINT_DIR)
+    # copy the script to the checkpoint directory
+    copy2(os.path.abspath(__file__), CHECKPOINT_DIR)
+    # Dump the parameters next to the checkpoints.  "loss" is not part of this
+    # dump because it is only added to prms afterwards.
+    with open(os.path.join(CHECKPOINT_DIR, "checkpoint.info"), "w+") as outfh:
+        yaml.dump(dict(prms), outfh, default_flow_style=False)
+
+    # Loss name is derived from the class mode, e.g. "binary_crossentropy".
+    prms["loss"] = '{}_crossentropy'.format( prms.class_mode )
+    print("loss:", prms["loss"])
+
+    # Template with {epoch} / {val_loss} placeholders.  The spec "2f" is a
+    # minimum width of 2 with the default six decimals (it is not ".2f").
+    CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
+
+    SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
+    # Floor division: a trailing partial batch is not counted as a step.
+    STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
+
+    print('='*50)
+    print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
+    print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
+    print('='*50)
+    #########################################
+    # Callbacks.  The checkpoint callback is configured with
+    # save_best_only=False and period=1 and stores whole models rather than
+    # weights only.
+    checkpoint = ModelCheckpoint(CHECKPOINT_PATH, monitor='val_loss', verbose=1,
+            save_best_only=False, save_weights_only=False, mode='auto', period=1)
+
+    # Progress log inside the checkpoint directory (presumably one row per
+    # epoch — CSVWallClockLogger is a project helper, verify there).
+    csv_path = os.path.join(CHECKPOINT_DIR, "progresslog.csv")
+    csv_callback = CSVWallClockLogger(csv_path, separator=',', append=False)
+
+
+    callback_list = [checkpoint, csv_callback]
+
+    # Learning-rate control: a non-empty "ReduceLROnPlateau" entry wins; the
+    # cyclic schedule is only considered when that entry is missing or falsy.
+    if ("ReduceLROnPlateau" in prms) and prms["ReduceLROnPlateau"]:
+        callback_list.append(ReduceLROnPlateau(**prms["ReduceLROnPlateau"]))
+
+    elif "lr_cyclic_schedule" in prms:
+        # The schedule starts from prms.lr; remaining settings come from the
+        # "lr_cyclic_schedule" parameter dict.
+        callback_list.append(
+                LearningRateScheduler(
+                    partial(lr_cyclic_schedule,
+                        lr_init = prms.lr,
+                        **prms.lr_cyclic_schedule)
+                                )
+                            )
+    #########################################
+    # Build the network via the project helper.  ImageNet weights are
+    # requested when prms.pretrained is truthy; the input shape is
+    # target_size plus a 3-element last dimension.
+    model = get_model(n_classes=prms.n_classes,
+                      final_activation=prms.final_activation,
+                      ndense=prms.ndense,
+                      dropout=prms.dropout,
+                      base_trainable=prms.base_trainable,
+                      weights = 'imagenet' if prms.pretrained else None,
+                      input_shape = prms.target_size + [3])
+
+
+    #from keras.utils import plot_model
+    #plot_model(model, to_file='model.png')
+
+    # Adam with the configured learning rate; accuracy is the only extra metric.
+    model.compile(optimizer=Adam(lr=prms.lr), loss=prms.loss,
+                  metrics=['accuracy'],
+                  )
+    #########################################
+    # Optionally continue from previously saved weights.
+    if prms.weightfile:
+        print("loading weights from:\t%s" % prms.weightfile)
+        model.load_weights(prms.weightfile)
+    
+    #########################################
+    print('Using real-time data augmentation.')
+
+    flowfromdir_params = dict(
+        #color_mode = "grayscale",
+        target_size=prms.target_size,
+        batch_size=prms.batch_size,
+        class_mode=prms.class_mode,
+        classes=prms.classes,
+        seed=prms.seed)
+    norm_params = dict(
+            #rescale=prms.scaleup,
+            samplewise_center=prms.samplewise_center,
+            samplewise_std_normalization=prms.samplewise_center,
+            featurewise_center=False,
+            featurewise_std_normalization=False,
+            zca_whitening=False,
+            )
+
+    def _ztransform(x):
+        return (x-np.mean(x)) / np.std(x)
+
+    if 'preprocessing_function' in prms:
+        if prms.preprocessing_function=='ztransform':
+            preprocessing_function = _ztransform
+        elif prms.preprocessing_function=='m1p1':
+            preprocessing_function = lambda x: x/128.0 - 1
+        else:
+            raise ValueError("unknown preprocessing_function")
+    else:
+        preprocessing_function = lambda x: x
+
+
+    if prms.data_augmentation:
+
+        print('Using real-time data augmentation.')
+        train_datagen = ImageDataGenerator(
+            zoom_range=prms.zoom_range,
+            fill_mode=prms.fill_mode,
+            rotation_range = prms.rotation_range,
+            width_shift_range = prms.width_shift_range,
+            height_shift_range = prms.height_shift_range,
+            horizontal_flip=prms.horizontal_flip,
+            vertical_flip=prms.vertical_flip,
+            contrast = prms.contrast,
+            z_transform = prms.ztransform,
+            truncate_quantile = prms.truncate_quantile,
+            #histeq_alpha=prms.histeq_alpha,
+            **norm_params)
+    else:
+        train_datagen = ImageDataGenerator(**norm_params)
+
+    val_datagen = ImageDataGenerator(**norm_params)
+
+    datagen_train_output = train_datagen.flow_from_directory(
+        prms.data_train, 
+        stratify = prms.oversampling,
+        sampling_factor=prms.sampling_factor if (prms.oversampling) else None,
+        oversampling=prms.oversampling,
+        shuffle=True, **flowfromdir_params)
+
+    datagen_val_output = val_datagen.flow_from_directory(
+        prms.data_val, shuffle=False, **flowfromdir_params)
+
+    #VALIDATION_STEPS = get_num_files(prms.data_val) // prms.batch_size
+    VALIDATION_STEPS = len(datagen_val_output.filenames)/prms['batch_size']
+    print("validation steps", VALIDATION_STEPS)
+    #########################################
+    if prms.class_weights == 'auto':
+        class_weights = get_class_weights(datagen_val_output)
+    else:
+        class_weights = prms.class_weights
+
+    # Train from the directory iterator, validating against the unshuffled
+    # validation iterator after each epoch.
+    model.fit_generator(datagen_train_output,
+                          steps_per_epoch=STEPS_PER_EPOCH,
+                          epochs=prms.nb_epoch, verbose=1,
+                          validation_data=datagen_val_output,
+                          validation_steps=VALIDATION_STEPS,
+                          #class_weight='auto',
+                          class_weight=class_weights,
+                          callbacks=callback_list,
+                          initial_epoch=prms.init_epoch)
+
+    # Build a fresh validation iterator for the final evaluation (presumably
+    # so it starts again from the first file — confirm).
+    datagen_val_output = val_datagen.flow_from_directory(
+        prms.data_val, shuffle=False, **flowfromdir_params)
+
+    # The model was compiled with a single metric ('accuracy'), so the
+    # evaluation result supplies exactly the two values formatted here.
+    print("""loss\t%.4f
+    accuracy\t%.4f\n""" %
+      tuple(model.evaluate_generator(datagen_val_output,
+                                     steps=VALIDATION_STEPS,
+                                     workers=1,
+                                    pickle_safe=True)))
+
+
+    #model.predict()
@@ -0,0 +1,48 @@
+ReduceLROnPlateau:
+  cooldown: 8
+  epsilon: 0.001
+  factor: 0.5
+  min_lr: 1.0e-12
+  mode: auto
+  monitor: val_loss
+  patience: 64
+  verbose: 0
+base_trainable: false
+batch_size: 16
+class_mode: categorical
+class_weights:
+- 1
+- 1
+classes:
+- normal
+- wire
+data_augmentation: true
+data_train: /data/UCSF_MAMMO/2018-02-png/each_class_4189_train/
+data_val: /data/UCSF_MAMMO/2018-02-png/each_class_4189_test/
+dropout: 0.5
+fill_mode: reflect
+final_activation: softmax
+height_shift_range: 0.125
+horizontal_flip: true
+init_epoch: 0
+lr: 0.001
+n_classes: 2
+nb_epoch: 500
+ndense: 0
+oversampling: false
+rescale: 1
+rotation_range: 30
+samplewise_center: false
+seed: 1
+target_side: 299
+target_size:
+- 299
+- 299
+truncate_quantile: null
+vertical_flip: false
+weightfile: null
+width_shift_range: 0.125
+zoom_range:
+- 0.8
+- 1.2
+ztransform: true
@@ -0,0 +1,49 @@
+ReduceLROnPlateau:
+  cooldown: 8
+  epsilon: 0.001
+  factor: 0.5
+  min_lr: 1.0e-12
+  mode: auto
+  monitor: val_loss
+  patience: 64
+  verbose: 0
+base_trainable: false
+batch_size: 16
+class_mode: categorical
+class_weights:
+- 1
+- 1
+classes:
+- normal
+- wire
+data_augmentation: true
+data_holdout: /data/UCSF_MAMMO/2018-02-png/each_class_augm_xw_4189_val/
+data_train: /data/UCSF_MAMMO/2018-02-png/each_class_4189_train/
+data_val: /data/UCSF_MAMMO/2018-02-png/each_class_4189_test/
+dropout: 0.5
+fill_mode: reflect
+final_activation: softmax
+height_shift_range: 0.125
+horizontal_flip: true
+init_epoch: 0
+lr: 0.001
+n_classes: 2
+nb_epoch: 500
+ndense: 0
+oversampling: false
+rescale: 1
+rotation_range: 30
+samplewise_center: false
+seed: 2
+target_side: 299
+target_size:
+- 299
+- 299
+truncate_quantile: null
+vertical_flip: false
+weightfile: model.147-0.000774.hdf5
+width_shift_range: 0.125
+zoom_range:
+- 0.8
+- 1.2
+ztransform: true
@@ -0,0 +1,315 @@
+import sys
+import pandas as pd
+sys.path.append('../..')
+
+from inception_short import get_model, get_num_files, get_class_weights
+from keras.optimizers import Adam
+from image import ImageDataGenerator
+#from keras.preprocessing.image import ImageDataGenerator
+from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
+from checkpoint_utils import CSVWallClockLogger
+from shutil import copy2
+from losses import acc_0, acc_1, acc_2, acc_3, acc_4
+
+class AttrDict(dict):
+    def __init__(self, *args, **kwargs):
+        super(AttrDict, self).__init__(*args, **kwargs)
+        self.__dict__ = self
+
+import sys
+import os
+import yaml
+import numpy as np
+import keras
+from hashlib import md5
+os.environ["PYTHONHASHSEED"]='0'
+os.environ['KERAS_BACKEND'] = 'tensorflow'
+os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
+os.environ["CUDA_VISIBLE_DEVICES"]="0"
+
+prms = AttrDict(
+    dropout=0.5,
+    base_trainable=False,
+    horizontal_flip = True,
+    vertical_flip = False,
+    zoom_range = [0.8, 1.2],
+    rotation_range = 30,
+    fill_mode='reflect',
+    ndense=0,
+    batch_size = 16,
+    init_epoch=0,
+    nb_epoch = 500,
+    data_augmentation = True,
+    rescale = 1, #2**-8,
+    #contrast = 0.9,
+    truncate_quantile = None,#0.001,
+    ztransform = True,
+    oversampling = False,
+    #sampling_factor = [1, 4],
+    seed=2,
+    width_shift_range = 0.125,
+    height_shift_range = 0.125,
+    class_mode =  'categorical', # 'binary', #
+    n_classes = 2,
+    final_activation = "softmax", # 'sigmoid',
+    lr = 1e-3,
+    samplewise_center = False, #True
+    target_side = 299,
+    #weights = None,
+    weightfile = "model.147-0.000774.hdf5",
+    data_train = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_train/',
+    data_val = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_test/',
+    data_holdout = "/data/UCSF_MAMMO/2018-02-png/each_class_augm_xw_4189_val/",
+    classes = ["normal", "wire"],
+    class_weights=[1, 1],
+    ReduceLROnPlateau = dict(
+        monitor='val_loss',
+        factor=1/2,
+        patience=32*2,
+        verbose=0,
+        mode='auto', epsilon=0.001,
+        cooldown=8,
+        min_lr=1e-12,
+        ),
+)
+
+
+# Identify this run by the MD5 of the parameter dict's string form.  The hash
+# is computed before "target_size" and "loss" are added, so neither derived
+# entry affects the directory name.
+paramhash = md5(str(prms).encode()).hexdigest()
+
+# Square input: [side, side].
+prms["target_size"] = [ prms.target_side ]*2
+
+# NOTE(review): this runs at module level, so a checkpoint directory is
+# created, and the script plus parameters are copied into it, even though the
+# rest of this file only computes predictions.
+CHECKPOINT_DIR = "checkpoints/" + paramhash + "/"
+os.makedirs(CHECKPOINT_DIR, exist_ok=True)
+print("SAVING TO:\t%s" % CHECKPOINT_DIR)
+# copy the script to the checkpoint directory
+copy2(os.path.abspath(__file__), CHECKPOINT_DIR)
+with open(os.path.join(CHECKPOINT_DIR, "checkpoint.info"), "w+") as outfh:
+    yaml.dump(dict(prms), outfh, default_flow_style=False)
+# w_categorical_crossentropy
+# Template with {epoch} / {val_loss} placeholders; "2f" means minimum width 2
+# with the default six decimals.
+CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
+
+# These two values only feed the banner below; both names are recomputed from
+# the training iterator before the train-set predictions are made.
+SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
+STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
+
+print('='*50)
+print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
+print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
+print('='*50)
+#########################################
+# NOTE(review): the callbacks assembled below are never passed to any Keras
+# call in this file (there is no fit step); they look like leftovers from the
+# training script.
+checkpoint = ModelCheckpoint(CHECKPOINT_PATH, monitor='val_loss', verbose=1,
+        save_best_only=True, save_weights_only=False, mode='auto', period=1)
+
+csv_path = os.path.join(CHECKPOINT_DIR, "progresslog.csv")
+csv_callback = CSVWallClockLogger(csv_path, separator=',', append=False)
+
+# Loss name derived from the class mode, e.g. "categorical_crossentropy";
+# it is consumed by model.compile further down.
+prms["loss"] = '{}_crossentropy'.format( prms.class_mode )
+
+callback_list = [checkpoint, csv_callback]
+
+
+if ("ReduceLROnPlateau" in prms) and prms["ReduceLROnPlateau"]:
+            callback_list.append(ReduceLROnPlateau(**prms["ReduceLROnPlateau"]))
+
+#########################################
+model = get_model(n_classes=prms.n_classes,
+                  final_activation=prms.final_activation,
+                  ndense=prms.ndense,
+                  #weights = prms.weights,
+                  dropout=prms.dropout,
+                  base_trainable=prms.base_trainable)
+
+
+#from keras.utils import plot_model
+#plot_model(model, to_file='model.png')
+if __name__ == '__main__':
+    model.compile(optimizer=Adam(lr=prms.lr), loss=prms.loss,
+                  metrics=['accuracy', acc_0, acc_1,# acc_2, acc_3, acc_4
+                      ],
+                  )
+    #########################################
+    if prms.weightfile:
+        print("loading weights from:\t%s" % prms.weightfile)
+        model.load_weights(prms.weightfile)
+    
+    #########################################
+    print('Using real-time data augmentation.')
+
+    flowfromdir_params = dict(
+        #color_mode = "grayscale",
+        target_size=prms.target_size,
+        batch_size=prms.batch_size,
+        class_mode=prms.class_mode,
+        classes=prms.classes,
+        seed=prms.seed)
+    norm_params = dict(
+            rescale=prms.rescale,
+            samplewise_center=prms.samplewise_center,
+            samplewise_std_normalization=prms.samplewise_center,
+            featurewise_center=False,
+            featurewise_std_normalization=False,
+            zca_whitening=False,
+            z_transform = prms.ztransform,
+            )
+
+    def _ztransform(x):
+        return (x-np.mean(x)) / np.std(x)
+
+    if 'preprocessing_function' in prms:
+        if prms.preprocessing_function=='ztransform':
+            preprocessing_function = _ztransform
+        elif prms.preprocessing_function=='m1p1':
+            preprocessing_function = lambda x: x/128.0 - 1
+        else:
+            raise ValueError("unknown preprocessing_function")
+    else:
+        preprocessing_function = lambda x: x
+
+    if prms.data_augmentation:
+
+        print('Using real-time data augmentation.')
+        train_datagen = ImageDataGenerator(
+            zoom_range=prms.zoom_range,
+            fill_mode=prms.fill_mode,
+            rotation_range = prms.rotation_range,
+            width_shift_range = prms.width_shift_range,
+            height_shift_range = prms.height_shift_range,
+            horizontal_flip=prms.horizontal_flip,
+            vertical_flip=prms.vertical_flip,
+            contrast = prms.contrast if "contrast" in prms else None,
+            truncate_quantile = prms.truncate_quantile,
+            #histeq_alpha=prms.histeq_alpha,
+            **norm_params)
+    else:
+        train_datagen = ImageDataGenerator(**norm_params)
+
+
+    datagen_train_output = train_datagen.flow_from_directory(
+        prms.data_train, 
+        shuffle=False, **flowfromdir_params)
+
+    #VALIDATION_STEPS = get_num_files(prms.data_val) // prms.batch_size
+
+    ##########################################
+    # HOLDOUT
+    ##########################################
+    val_datagen = ImageDataGenerator(**norm_params)
+    datagen_val_output = val_datagen.flow_from_directory(
+        prms.data_holdout, shuffle=False, **flowfromdir_params)
+    VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
+    print("validation steps", VALIDATION_STEPS)
+
+    yhat = model.predict_generator(datagen_val_output,
+                          steps=VALIDATION_STEPS,
+                          verbose=1,)
+
+
+    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
+    dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
+    dfres = pd.DataFrame(dfdict)
+    dfres.to_csv("predictions_holdout.csv", index=False)
+    ##########################################
+    # HOLDOUT FLIPPED
+    ##########################################
+    val_datagen = ImageDataGenerator(**norm_params, )
+    val_datagen.preprocessing_function = lambda x: x[...,::-1,:]
+    datagen_val_output = val_datagen.flow_from_directory(
+        prms.data_holdout, shuffle=False, **flowfromdir_params)
+    VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
+    print("validation steps", VALIDATION_STEPS)
+
+    yhat = model.predict_generator(datagen_val_output,
+                          steps=VALIDATION_STEPS,
+                          verbose=1,)
+
+
+    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
+    dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
+    dfres = pd.DataFrame(dfdict)
+    dfres.to_csv("predictions_holdout_fliplr.csv", index=False)
+    #########################################
+    # VAL
+    ##########################################
+    val_datagen = ImageDataGenerator(**norm_params, )
+    datagen_val_output = val_datagen.flow_from_directory(
+        prms.data_val, shuffle=False, **flowfromdir_params)
+    VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
+    print("validation steps", VALIDATION_STEPS)
+
+    yhat = model.predict_generator(datagen_val_output,
+                          steps=VALIDATION_STEPS,
+                          verbose=1,)
+
+
+    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
+    dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
+    dfres = pd.DataFrame(dfdict)
+    dfres.to_csv("predictions_test.csv", index=False)
+    #########################################
+    # VAL FLIPPED
+    ##########################################
+    val_datagen = ImageDataGenerator(**norm_params, )
+    val_datagen.preprocessing_function = lambda x: x[...,::-1,:]
+    datagen_val_output = val_datagen.flow_from_directory(
+        prms.data_val, shuffle=False, **flowfromdir_params)
+    VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
+    print("validation steps", VALIDATION_STEPS)
+
+    yhat = model.predict_generator(datagen_val_output,
+                          steps=VALIDATION_STEPS,
+                          verbose=1,)
+
+
+    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
+    dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
+    dfres = pd.DataFrame(dfdict)
+    dfres.to_csv("predictions_test_fliplr.csv", index=False)
+    #########################################
+    SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
+    STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
+
+    print('='*50)
+    print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
+    print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
+    print('='*50)
+    if prms.class_weights == 'auto':
+        class_weights = get_class_weights(datagen_val_output)
+    else:
+        class_weights = prms.class_weights
+
+    yhat = model.predict_generator(datagen_train_output,
+                          steps=STEPS_PER_EPOCH,
+                          verbose=1,)
+
+    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
+    dfdict.update({"files":datagen_train_output.filenames, "label": datagen_train_output.classes})
+    ##ipdb.set_trace()
+    dfres = pd.DataFrame(dfdict)
+    dfres.to_csv("predictions_train.csv", index=False)
+
+    #########################################
+    SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
+    STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
+
+    print('='*50)
+    print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
+    print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
+    print('='*50)
+    if prms.class_weights == 'auto':
+        class_weights = get_class_weights(datagen_val_output)
+    else:
+        class_weights = prms.class_weights
+
+    train_datagen.preprocessing_function = lambda x: x[...,::-1,:]
+    datagen_train_output = train_datagen.flow_from_directory(
+        prms.data_train, 
+        shuffle=False, **flowfromdir_params)
+
+    yhat = model.predict_generator(datagen_train_output,
+                          steps=STEPS_PER_EPOCH,
+                          verbose=1,)
+
+    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
+    dfdict.update({"files":datagen_train_output.filenames, "label": datagen_train_output.classes})
+    ##ipdb.set_trace()
+    dfres = pd.DataFrame(dfdict)
+    dfres.to_csv("predictions_train_filplr.csv", index=False)
@@ -0,0 +1,50 @@
+ReduceLROnPlateau:
+  cooldown: 8
+  epsilon: 0.001
+  factor: 0.5
+  min_lr: 1.0e-12
+  mode: auto
+  monitor: val_loss
+  patience: 64
+  verbose: 0
+base_trainable: false
+batch_size: 16
+class_mode: categorical
+class_weights:
+- 1
+- 1
+classes:
+- normal
+- wire
+data_augmentation: true
+data_everything: /media/exx/tron/2017-07-png-jae/
+data_holdout: /data/UCSF_MAMMO/2018-02-png/each_class_augm_xw_4189_val/
+data_train: /data/UCSF_MAMMO/2018-02-png/each_class_4189_train/
+data_val: /data/UCSF_MAMMO/2018-02-png/each_class_4189_test/
+dropout: 0.5
+fill_mode: reflect
+final_activation: softmax
+height_shift_range: 0.125
+horizontal_flip: true
+init_epoch: 0
+lr: 0.001
+n_classes: 2
+nb_epoch: 500
+ndense: 0
+oversampling: false
+rescale: 1
+rotation_range: 30
+samplewise_center: false
+seed: 2
+target_side: 299
+target_size:
+- 299
+- 299
+truncate_quantile: null
+vertical_flip: false
+weightfile: model.147-0.000774.hdf5
+width_shift_range: 0.125
+zoom_range:
+- 0.8
+- 1.2
+ztransform: true
@@ -0,0 +1,398 @@
+import sys
+import pandas as pd
+sys.path.append('../..')
+sys.path.append("/data/dlituiev/kerastrainutils/")
+
+from inception_short import get_model, get_num_files, get_class_weights
+from keras.optimizers import Adam
+from _image import ImageDataGenerator
+#from keras.preprocessing.image import ImageDataGenerator
+from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
+from checkpoint_utils import CSVWallClockLogger
+from shutil import copy2
+from losses import acc_0, acc_1, acc_2, acc_3, acc_4
+
+class AttrDict(dict):
+    def __init__(self, *args, **kwargs):
+        super(AttrDict, self).__init__(*args, **kwargs)
+        self.__dict__ = self
+
+import sys
+import os
+import yaml
+import numpy as np
+import keras
+from hashlib import md5
+os.environ["PYTHONHASHSEED"]='0'
+os.environ['KERAS_BACKEND'] = 'tensorflow'
+os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
+os.environ["CUDA_VISIBLE_DEVICES"]="3"
+
+prms = AttrDict(
+    dropout=0.5,
+    base_trainable=False,
+    horizontal_flip = True,
+    vertical_flip = False,
+    zoom_range = [0.8, 1.2],
+    rotation_range = 30,
+    fill_mode='reflect',
+    ndense=0,
+    batch_size = 16,
+    init_epoch=0,
+    nb_epoch = 500,
+    data_augmentation = True,
+    rescale = 1, #2**-8,
+    #contrast = 0.9,
+    truncate_quantile = None,#0.001,
+    ztransform = True,
+    oversampling = False,
+    #sampling_factor = [1, 4],
+    seed=2,
+    width_shift_range = 0.125,
+    height_shift_range = 0.125,
+    class_mode =  'categorical', # 'binary', #
+    n_classes = 2,
+    final_activation = "softmax", # 'sigmoid',
+    lr = 1e-3,
+    samplewise_center = False, #True
+    target_side = 299,
+    #weights = None,
+    weightfile = "model.147-0.000774.hdf5",
+    data_train = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_train/',
+    data_val = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_test/',
+    data_holdout = "/data/UCSF_MAMMO/2018-02-png/each_class_augm_xw_4189_val/",
+    data_everything = "/media/exx/tron/2017-07-png-jae/",
+    classes = ["normal", "wire"],
+    class_weights=[1, 1],
+    ReduceLROnPlateau = dict(
+        monitor='val_loss',
+        factor=1/2,
+        patience=32*2,
+        verbose=0,
+        mode='auto', epsilon=0.001,
+        cooldown=8,
+        min_lr=1e-12,
+        ),
+)
+
+
+paramhash = md5(str(prms).encode()).hexdigest()
+
+prms["target_size"] = [ prms.target_side ]*2
+
+CHECKPOINT_DIR = "checkpoints/" + paramhash + "/"
+os.makedirs(CHECKPOINT_DIR, exist_ok=True)
+print("SAVING TO:\t%s" % CHECKPOINT_DIR)
+# copy the script to the checkpoint directory
+copy2(os.path.abspath(__file__), CHECKPOINT_DIR)
+with open(os.path.join(CHECKPOINT_DIR, "checkpoint.info"), "w+") as outfh:
+    yaml.dump(dict(prms), outfh, default_flow_style=False)
+# w_categorical_crossentropy
+CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
+
+SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
+STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
+
+print('='*50)
+print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
+print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
+print('='*50)
+#########################################
+checkpoint = ModelCheckpoint(CHECKPOINT_PATH, monitor='val_loss', verbose=1,
+        save_best_only=True, save_weights_only=False, mode='auto', period=1)
+
+csv_path = os.path.join(CHECKPOINT_DIR, "progresslog.csv")
+csv_callback = CSVWallClockLogger(csv_path, separator=',', append=False)
+
+prms["loss"] = '{}_crossentropy'.format( prms.class_mode )
+
+callback_list = [checkpoint, csv_callback]
+
+
+if ("ReduceLROnPlateau" in prms) and prms["ReduceLROnPlateau"]:
+            callback_list.append(ReduceLROnPlateau(**prms["ReduceLROnPlateau"]))
+
+#########################################
+model = get_model(n_classes=prms.n_classes,
+                  final_activation=prms.final_activation,
+                  ndense=prms.ndense,
+                  #weights = prms.weights,
+                  dropout=prms.dropout,
+                  base_trainable=prms.base_trainable)
+
+
+#from keras.utils import plot_model
+#plot_model(model, to_file='model.png')
+if __name__ == '__main__':
+    model.compile(optimizer=Adam(lr=prms.lr), loss=prms.loss,
+                  metrics=['accuracy', acc_0, acc_1,# acc_2, acc_3, acc_4
+                      ],
+                  )
+    #########################################
+    if prms.weightfile:
+        print("loading weights from:\t%s" % prms.weightfile)
+        model.load_weights(prms.weightfile)
+    
+    #########################################
+    print('Using real-time data augmentation.')
+
+    flowfromdir_params = dict(
+        #color_mode = "grayscale",
+        target_size=prms.target_size,
+        batch_size=prms.batch_size,
+        class_mode=prms.class_mode,
+        classes=prms.classes,
+        seed=prms.seed)
+    norm_params = dict(
+            rescale=prms.rescale,
+            samplewise_center=prms.samplewise_center,
+            samplewise_std_normalization=prms.samplewise_center,
+            featurewise_center=False,
+            featurewise_std_normalization=False,
+            zca_whitening=False,
+            z_transform = prms.ztransform,
+            )
+
+    def _ztransform(x):
+        return (x-np.mean(x)) / np.std(x)
+
+    if 'preprocessing_function' in prms:
+        if prms.preprocessing_function=='ztransform':
+            preprocessing_function = _ztransform
+        elif prms.preprocessing_function=='m1p1':
+            preprocessing_function = lambda x: x/128.0 - 1
+        else:
+            raise ValueError("unknown preprocessing_function")
+    else:
+        preprocessing_function = lambda x: x
+
+    if prms.data_augmentation:
+
+        print('Using real-time data augmentation.')
+        train_datagen = ImageDataGenerator(
+            zoom_range=prms.zoom_range,
+            fill_mode=prms.fill_mode,
+            rotation_range = prms.rotation_range,
+            width_shift_range = prms.width_shift_range,
+            height_shift_range = prms.height_shift_range,
+            horizontal_flip=prms.horizontal_flip,
+            vertical_flip=prms.vertical_flip,
+            #contrast = prms.contrast if "contrast" in prms else None,
+            #truncate_quantile = prms.truncate_quantile,
+            #histeq_alpha=prms.histeq_alpha,
+            **norm_params)
+    else:
+        train_datagen = ImageDataGenerator(**norm_params)
+    ##########################################
+    # Everything
+    ##########################################
+
+    val_datagen = ImageDataGenerator(**norm_params)
+    flowfromdir_params['classes'] = [os.path.basename(prms.data_everything.rstrip('/'))]
+    datagen_val_output = val_datagen.flow_from_directory(
+        os.path.dirname(prms.data_everything.rstrip('/')),
+        shuffle=False, **flowfromdir_params)
+
+    VALIDATION_STEPS = len(datagen_val_output) 
+    pred_fn = "predictions_everything.csv"
+    with open(pred_fn, 'w+') as fh:
+        print("files", *["scores_%d"%ii for ii in range(2)], sep=',', file=fh)
+        for ii, batch in enumerate(datagen_val_output):
+            if ii> VALIDATION_STEPS:
+                break
+            yhat =  model.predict_on_batch(batch[0])
+            filenames = datagen_val_output.filenames[ii*prms.batch_size:(ii+1)*prms.batch_size]
+            for fnimg, yhat_ in zip(filenames, yhat):
+                print(fnimg, *yhat_, sep=',', file = fh)
+
+    ##########################################
+    ##########################################
+    val_datagen.preprocessing_function = lambda x: x[...,::-1,:]
+
+    datagen_val_output = val_datagen.flow_from_directory(
+        os.path.dirname(prms.data_everything.rstrip('/')),
+        shuffle=False, **flowfromdir_params)
+
+    VALIDATION_STEPS = len(datagen_val_output) 
+    pred_fn = "predictions_everything_fliplr.csv"
+    with open(pred_fn, 'w+') as fh:
+        print("files", *["scores_%d"%ii for ii in range(2)], sep=',', file=fh)
+        for ii, batch in enumerate(datagen_val_output):
+            if ii> VALIDATION_STEPS:
+                break
+            yhat =  model.predict_on_batch(batch[0])
+            filenames = datagen_val_output.filenames[ii*prms.batch_size:(ii+1)*prms.batch_size]
+            for fnimg, yhat_ in zip(filenames, yhat):
+                print(fnimg, *yhat_, sep=',', file = fh)
+
+    ##########################################
+    val_datagen.preprocessing_function = lambda x: x[...,::-1,:,:]
+
+    datagen_val_output = val_datagen.flow_from_directory(
+        os.path.dirname(prms.data_everything.rstrip('/')),
+        shuffle=False, **flowfromdir_params)
+
+    VALIDATION_STEPS = len(datagen_val_output) 
+    pred_fn = "predictions_everything_flipud.csv"
+    with open(pred_fn, 'w+') as fh:
+        print("files", *["scores_%d"%ii for ii in range(2)], sep=',', file=fh)
+        for ii, batch in enumerate(datagen_val_output):
+            if ii> VALIDATION_STEPS:
+                break
+            yhat =  model.predict_on_batch(batch[0])
+            filenames = datagen_val_output.filenames[ii*prms.batch_size:(ii+1)*prms.batch_size]
+            for fnimg, yhat_ in zip(filenames, yhat):
+                print(fnimg, *yhat_, sep=',', file = fh)
+
+    ##########################################
+    val_datagen.preprocessing_function = lambda x: x[...,::-1,::-1,:]
+
+    datagen_val_output = val_datagen.flow_from_directory(
+        os.path.dirname(prms.data_everything.rstrip('/')),
+        shuffle=False, **flowfromdir_params)
+
+    VALIDATION_STEPS = len(datagen_val_output) 
+    pred_fn = "predictions_everything_fliplrud.csv"
+    with open(pred_fn, 'w+') as fh:
+        print("files", *["scores_%d"%ii for ii in range(2)], sep=',', file=fh)
+        for ii, batch in enumerate(datagen_val_output):
+            if ii> VALIDATION_STEPS:
+                break
+            yhat =  model.predict_on_batch(batch[0])
+            filenames = datagen_val_output.filenames[ii*prms.batch_size:(ii+1)*prms.batch_size]
+            for fnimg, yhat_ in zip(filenames, yhat):
+                print(fnimg, *yhat_, sep=',', file = fh)
+    ##########################################
+    # DONE
+    ##########################################
+    sys.exit(1)
+    datagen_train_output = train_datagen.flow_from_directory(
+        prms.data_train, 
+        shuffle=False, **flowfromdir_params)
+
+    #VALIDATION_STEPS = get_num_files(prms.data_val) // prms.batch_size
+
+    ##########################################
+    # HOLDOUT
+    ##########################################
+    val_datagen = ImageDataGenerator(**norm_params)
+    datagen_val_output = val_datagen.flow_from_directory(
+        prms.data_holdout, shuffle=False, **flowfromdir_params)
+    VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
+    print("validation steps", VALIDATION_STEPS)
+
+    yhat = model.predict_generator(datagen_val_output,
+                          steps=VALIDATION_STEPS,
+                          verbose=1,)
+
+
+    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
+    dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
+    dfres = pd.DataFrame(dfdict)
+    dfres.to_csv("predictions_holdout.csv", index=False)
+    ##########################################
+    # HOLDOUT FLIPPED
+    ##########################################
+    val_datagen = ImageDataGenerator(**norm_params, )
+    val_datagen.preprocessing_function = lambda x: x[...,::-1,:]
+    datagen_val_output = val_datagen.flow_from_directory(
+        prms.data_holdout, shuffle=False, **flowfromdir_params)
+    VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
+    print("validation steps", VALIDATION_STEPS)
+
+    yhat = model.predict_generator(datagen_val_output,
+                          steps=VALIDATION_STEPS,
+                          verbose=1,)
+
+
+    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
+    dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
+    dfres = pd.DataFrame(dfdict)
+    dfres.to_csv("predictions_holdout_fliplr.csv", index=False)
+    #########################################
+    # VAL
+    ##########################################
+    val_datagen = ImageDataGenerator(**norm_params, )
+    datagen_val_output = val_datagen.flow_from_directory(
+        prms.data_val, shuffle=False, **flowfromdir_params)
+    VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
+    print("validation steps", VALIDATION_STEPS)
+
+    yhat = model.predict_generator(datagen_val_output,
+                          steps=VALIDATION_STEPS,
+                          verbose=1,)
+
+
+    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
+    dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
+    dfres = pd.DataFrame(dfdict)
+    dfres.to_csv("predictions_test.csv", index=False)
+    #########################################
+    # VAL FLIPPED
+    ##########################################
+    val_datagen = ImageDataGenerator(**norm_params, )
+    val_datagen.preprocessing_function = lambda x: x[...,::-1,:]
+    datagen_val_output = val_datagen.flow_from_directory(
+        prms.data_val, shuffle=False, **flowfromdir_params)
+    VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
+    print("validation steps", VALIDATION_STEPS)
+
+    yhat = model.predict_generator(datagen_val_output,
+                          steps=VALIDATION_STEPS,
+                          verbose=1,)
+
+
+    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
+    dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
+    dfres = pd.DataFrame(dfdict)
+    dfres.to_csv("predictions_test_fliplr.csv", index=False)
+    #########################################
+    SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
+    STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
+
+    print('='*50)
+    print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
+    print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
+    print('='*50)
+    if prms.class_weights == 'auto':
+        class_weights = get_class_weights(datagen_val_output)
+    else:
+        class_weights = prms.class_weights
+
+    yhat = model.predict_generator(datagen_train_output,
+                          steps=STEPS_PER_EPOCH,
+                          verbose=1,)
+
+    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
+    dfdict.update({"files":datagen_train_output.filenames, "label": datagen_train_output.classes})
+    ##ipdb.set_trace()
+    dfres = pd.DataFrame(dfdict)
+    dfres.to_csv("predictions_train.csv", index=False)
+
+    #########################################
+    SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
+    STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
+
+    print('='*50)
+    print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
+    print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
+    print('='*50)
+    if prms.class_weights == 'auto':
+        class_weights = get_class_weights(datagen_val_output)
+    else:
+        class_weights = prms.class_weights
+
+    train_datagen.preprocessing_function = lambda x: x[...,::-1,:]
+    datagen_train_output = train_datagen.flow_from_directory(
+        prms.data_train, 
+        shuffle=False, **flowfromdir_params)
+
+    yhat = model.predict_generator(datagen_train_output,
+                          steps=STEPS_PER_EPOCH,
+                          verbose=1,)
+
+    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
+    dfdict.update({"files":datagen_train_output.filenames, "label": datagen_train_output.classes})
+    ##ipdb.set_trace()
+    dfres = pd.DataFrame(dfdict)
+    dfres.to_csv("predictions_train_fliplr.csv", index=False)
@@ -0,0 +1 @@
+../inception_short.py
@@ -0,0 +1,398 @@
+import sys
+import pandas as pd
+sys.path.append('../..')
+sys.path.append("/data/dlituiev/kerastrainutils/")
+
+from inception_short import get_model, get_num_files, get_class_weights
+from keras.optimizers import Adam
+from _image import ImageDataGenerator
+#from keras.preprocessing.image import ImageDataGenerator
+from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
+from checkpoint_utils import CSVWallClockLogger
+from shutil import copy2
+from losses import acc_0, acc_1, acc_2, acc_3, acc_4
+
+class AttrDict(dict):
+    """Dictionary whose items can also be read and written as attributes."""
+
+    def __init__(self, *args, **kwargs):
+        dict.__init__(self, *args, **kwargs)
+        # Alias the attribute namespace to the mapping itself so that
+        # `obj.key` and `obj["key"]` address the same storage.
+        self.__dict__ = self
+
+import sys
+import os
+import yaml
+import numpy as np
+import keras
+from hashlib import md5
+os.environ["PYTHONHASHSEED"]='0'
+os.environ['KERAS_BACKEND'] = 'tensorflow'
+os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
+os.environ["CUDA_VISIBLE_DEVICES"]="3"
+
+# Run settings for this prediction script.
+# The md5 of str(prms) is used further down to name the checkpoint
+# directory, so changing any value here changes that directory name.
+prms = AttrDict(
+    dropout=0.5,
+    base_trainable=False,
+    horizontal_flip = True,
+    vertical_flip = False,
+    zoom_range = [0.8, 1.2],
+    rotation_range = 30,
+    fill_mode='reflect',
+    ndense=0,
+    batch_size = 16,
+    init_epoch=0,
+    nb_epoch = 500,
+    data_augmentation = True,
+    rescale = 1, #2**-8,
+    #contrast = 0.9,
+    truncate_quantile = None,#0.001,
+    ztransform = True,
+    oversampling = False,
+    #sampling_factor = [1, 4],
+    seed=2,
+    width_shift_range = 0.125,
+    height_shift_range = 0.125,
+    class_mode =  'categorical', # 'binary', #
+    n_classes = 2,
+    final_activation = "softmax", # 'sigmoid',
+    lr = 1e-3,
+    samplewise_center = False, #True
+    # expanded to target_size = [target_side, target_side] after the hash is taken
+    target_side = 299,
+    #weights = None,
+    # passed to model.load_weights() in the main section when truthy
+    weightfile = "model.147-0.000774.hdf5",
+    data_train = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_train/',
+    data_val = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_test/',
+    data_holdout = "/data/UCSF_MAMMO/2018-02-png/each_class_augm_xw_4189_val/",
+    # the main section passes this folder's parent to flow_from_directory and
+    # uses the folder's basename as the only class name
+    data_everything = "/media/exx/tron/2017-07-png-jae/",
+    classes = ["normal", "wire"],
+    class_weights=[1, 1],
+    # expanded as keyword arguments into the ReduceLROnPlateau callback
+    ReduceLROnPlateau = dict(
+        monitor='val_loss',
+        factor=1/2,
+        patience=32*2,
+        verbose=0,
+        mode='auto', epsilon=0.001,
+        cooldown=8,
+        min_lr=1e-12,
+        ),
+)
+
+
+# Fingerprint of the settings as written above; the keys added after this
+# point ("target_size", "loss") therefore do not influence it.
+paramhash = md5(str(prms).encode()).hexdigest()
+
+prms["target_size"] = [prms.target_side, prms.target_side]
+
+CHECKPOINT_DIR = "checkpoints/" + paramhash + "/"
+os.makedirs(CHECKPOINT_DIR, exist_ok=True)
+print("SAVING TO:\t%s" % CHECKPOINT_DIR)
+# Archive this script and the current settings next to the outputs.
+copy2(os.path.abspath(__file__), CHECKPOINT_DIR)
+with open(os.path.join(CHECKPOINT_DIR, "checkpoint.info"), "w+") as outfh:
+    yaml.dump(dict(prms), outfh, default_flow_style=False)
+
+# File-name template handed to ModelCheckpoint below.
+CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
+
+SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
+# Floor division: a trailing partial batch does not count as a step.
+STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
+
+print('='*50,
+      "samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH,
+      "steps per epoch in the train set: %d" % STEPS_PER_EPOCH,
+      '='*50,
+      sep='\n')
+#########################################
+checkpoint = ModelCheckpoint(
+    CHECKPOINT_PATH,
+    monitor='val_loss',
+    verbose=1,
+    save_best_only=True,
+    save_weights_only=False,
+    mode='auto',
+    period=1,
+)
+
+csv_path = os.path.join(CHECKPOINT_DIR, "progresslog.csv")
+csv_callback = CSVWallClockLogger(csv_path, separator=',', append=False)
+
+prms["loss"] = '{}_crossentropy'.format(prms.class_mode)
+
+callback_list = [checkpoint, csv_callback]
+
+# The plateau scheduler is added only when its settings exist and are truthy.
+if prms.get("ReduceLROnPlateau"):
+    callback_list.append(ReduceLROnPlateau(**prms["ReduceLROnPlateau"]))
+
+#########################################
+model = get_model(
+    n_classes=prms.n_classes,
+    final_activation=prms.final_activation,
+    ndense=prms.ndense,
+    #weights = prms.weights,
+    dropout=prms.dropout,
+    base_trainable=prms.base_trainable,
+)
+
+
+#from keras.utils import plot_model
+#plot_model(model, to_file='model.png')
+if __name__ == '__main__':
+    # acc_0 / acc_1 are imported from the project's `losses` module
+    # (presumably per-class accuracies -- confirm there).
+    tracked_metrics = ['accuracy', acc_0, acc_1]  # acc_2, acc_3, acc_4 unused
+    model.compile(
+        optimizer=Adam(lr=prms.lr),
+        loss=prms.loss,
+        metrics=tracked_metrics,
+    )
+
+    # Restore trained weights when a weight file is configured.
+    if prms.weightfile:
+        print("loading weights from:\t%s" % prms.weightfile)
+        model.load_weights(prms.weightfile)
+
+    print('Using real-time data augmentation.')
+
+    # Keyword arguments shared by every flow_from_directory() call.
+    flowfromdir_params = {
+        # "color_mode": "grayscale",
+        "target_size": prms.target_size,
+        "batch_size": prms.batch_size,
+        "class_mode": prms.class_mode,
+        "classes": prms.classes,
+        "seed": prms.seed,
+    }
+    # Normalisation settings shared by every ImageDataGenerator.
+    # NOTE(review): samplewise_std_normalization is fed from
+    # prms.samplewise_center -- confirm that coupling is intended.
+    norm_params = {
+        "rescale": prms.rescale,
+        "samplewise_center": prms.samplewise_center,
+        "samplewise_std_normalization": prms.samplewise_center,
+        "featurewise_center": False,
+        "featurewise_std_normalization": False,
+        "zca_whitening": False,
+        "z_transform": prms.ztransform,
+    }
+
+    def _ztransform(x):
+        """Shift `x` to zero mean and scale it by its standard deviation."""
+        centered = x - np.mean(x)
+        return centered / np.std(x)
+
+    # Select the preprocessing function named in the settings; without such
+    # a setting it is the identity.
+    # NOTE(review): the selected function is not passed on anywhere in the
+    # rest of this script.
+    if 'preprocessing_function' not in prms:
+        preprocessing_function = lambda x: x
+    elif prms.preprocessing_function=='ztransform':
+        preprocessing_function = _ztransform
+    elif prms.preprocessing_function=='m1p1':
+        preprocessing_function = lambda x: x/128.0 - 1
+    else:
+        raise ValueError("unknown preprocessing_function")
+
+    # The augmentation settings are only read when augmentation is enabled.
+    if prms.data_augmentation:
+        print('Using real-time data augmentation.')
+        augment_params = dict(
+            zoom_range=prms.zoom_range,
+            fill_mode=prms.fill_mode,
+            rotation_range=prms.rotation_range,
+            width_shift_range=prms.width_shift_range,
+            height_shift_range=prms.height_shift_range,
+            horizontal_flip=prms.horizontal_flip,
+            vertical_flip=prms.vertical_flip,
+            #contrast = prms.contrast if "contrast" in prms else None,
+            #truncate_quantile = prms.truncate_quantile,
+            #histeq_alpha=prms.histeq_alpha,
+        )
+    else:
+        augment_params = {}
+    train_datagen = ImageDataGenerator(**augment_params, **norm_params)
+    ##########################################
+    # Everything
+    ##########################################
+
+    val_datagen = ImageDataGenerator(**norm_params)
+    # flow_from_directory is given the parent of the "everything" folder and
+    # that folder's basename as the only class name, so the iterator lists
+    # the images stored inside it.
+    everything_dir = prms.data_everything.rstrip('/')
+    flowfromdir_params['classes'] = [os.path.basename(everything_dir)]
+
+    def _score_everything(pred_fn, preprocessing_function=None):
+        """Score every image of the "everything" folder and write a CSV.
+
+        pred_fn: output path; a header row, then one row per image holding
+            the file name followed by the model scores.
+        preprocessing_function: when given, it is installed on
+            `val_datagen` before the directory iterator is created (used by
+            the flipped passes); None leaves the generator untouched.
+
+        Returns the directory iterator that was consumed.
+        """
+        if preprocessing_function is not None:
+            val_datagen.preprocessing_function = preprocessing_function
+        flow = val_datagen.flow_from_directory(
+            os.path.dirname(everything_dir),
+            shuffle=False, **flowfromdir_params)
+        n_batches = len(flow)
+        with open(pred_fn, 'w+') as fh:
+            print("files", *["scores_%d"%ii for ii in range(2)], sep=',', file=fh)
+            # The iterator is expected to keep cycling past the last batch
+            # (the original code needed a break guard). Pairing it with
+            # range() makes zip() stop after exactly n_batches batches, so no
+            # wrapped-around extra batch is loaded or pushed through the model.
+            for ii, batch in zip(range(n_batches), flow):
+                yhat = model.predict_on_batch(batch[0])
+                start = ii * prms.batch_size
+                filenames = flow.filenames[start:start + prms.batch_size]
+                for fnimg, yhat_ in zip(filenames, yhat):
+                    print(fnimg, *yhat_, sep=',', file=fh)
+        return flow
+
+    ##########################################
+    # Everything: original orientation, then three flipped variants.
+    # The slices reverse the second-to-last and/or third-to-last axis;
+    # assuming channels-last images these are the left/right and up/down
+    # flips the output file names refer to -- TODO confirm the data format.
+    ##########################################
+    datagen_val_output = _score_everything("predictions_everything.csv")
+    datagen_val_output = _score_everything(
+        "predictions_everything_fliplr.csv", lambda x: x[...,::-1,:])
+    datagen_val_output = _score_everything(
+        "predictions_everything_flipud.csv", lambda x: x[...,::-1,:,:])
+    datagen_val_output = _score_everything(
+        "predictions_everything_fliplrud.csv", lambda x: x[...,::-1,::-1,:])
+    VALIDATION_STEPS = len(datagen_val_output)
+    ##########################################
+    # DONE
+    ##########################################
+    # All requested prediction files were written: stop here with a success
+    # status (a non-zero status would tell calling shells the run failed).
+    sys.exit(0)
+    # NOTE: nothing from here to the end of the script runs while the
+    # sys.exit() call directly above is in place.
+    # NOTE(review): flowfromdir_params['classes'] was replaced by the single
+    # "everything" folder name earlier in this script, so the flows below
+    # would use that class list -- confirm before re-enabling this section.
+    datagen_train_output = train_datagen.flow_from_directory(
+        prms.data_train, 
+        shuffle=False, **flowfromdir_params)
+
+    #VALIDATION_STEPS = get_num_files(prms.data_val) // prms.batch_size
+
+    ##########################################
+    # HOLDOUT
+    ##########################################
+    # Scores for prms.data_holdout -> predictions_holdout.csv
+    val_datagen = ImageDataGenerator(**norm_params)
+    datagen_val_output = val_datagen.flow_from_directory(
+        prms.data_holdout, shuffle=False, **flowfromdir_params)
+    # ceil so that a final partial batch is included
+    VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
+    print("validation steps", VALIDATION_STEPS)
+
+    yhat = model.predict_generator(datagen_val_output,
+                          steps=VALIDATION_STEPS,
+                          verbose=1,)
+
+
+    # one "scores_<k>" column per column of yhat, plus file names and labels
+    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
+    dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
+    dfres = pd.DataFrame(dfdict)
+    dfres.to_csv("predictions_holdout.csv", index=False)
+    ##########################################
+    # HOLDOUT FLIPPED
+    ##########################################
+    # Same as above with the second-to-last axis of each image reversed.
+    val_datagen = ImageDataGenerator(**norm_params, )
+    val_datagen.preprocessing_function = lambda x: x[...,::-1,:]
+    datagen_val_output = val_datagen.flow_from_directory(
+        prms.data_holdout, shuffle=False, **flowfromdir_params)
+    VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
+    print("validation steps", VALIDATION_STEPS)
+
+    yhat = model.predict_generator(datagen_val_output,
+                          steps=VALIDATION_STEPS,
+                          verbose=1,)
+
+
+    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
+    dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
+    dfres = pd.DataFrame(dfdict)
+    dfres.to_csv("predictions_holdout_fliplr.csv", index=False)
+    #########################################
+    # VAL
+    ##########################################
+    # Scores for prms.data_val; note the output file is named "test".
+    val_datagen = ImageDataGenerator(**norm_params, )
+    datagen_val_output = val_datagen.flow_from_directory(
+        prms.data_val, shuffle=False, **flowfromdir_params)
+    VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
+    print("validation steps", VALIDATION_STEPS)
+
+    yhat = model.predict_generator(datagen_val_output,
+                          steps=VALIDATION_STEPS,
+                          verbose=1,)
+
+
+    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
+    dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
+    dfres = pd.DataFrame(dfdict)
+    dfres.to_csv("predictions_test.csv", index=False)
+    #########################################
+    # VAL FLIPPED
+    ##########################################
+    val_datagen = ImageDataGenerator(**norm_params, )
+    val_datagen.preprocessing_function = lambda x: x[...,::-1,:]
+    datagen_val_output = val_datagen.flow_from_directory(
+        prms.data_val, shuffle=False, **flowfromdir_params)
+    VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
+    print("validation steps", VALIDATION_STEPS)
+
+    yhat = model.predict_generator(datagen_val_output,
+                          steps=VALIDATION_STEPS,
+                          verbose=1,)
+
+
+    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
+    dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
+    dfres = pd.DataFrame(dfdict)
+    dfres.to_csv("predictions_test_fliplr.csv", index=False)
+    #########################################
+    # TRAIN: scores for prms.data_train, read through train_datagen
+    SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
+    STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
+
+    print('='*50)
+    print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
+    print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
+    print('='*50)
+    # NOTE(review): class_weights is assigned here but not used by any later
+    # statement of this script.
+    if prms.class_weights == 'auto':
+        class_weights = get_class_weights(datagen_val_output)
+    else:
+        class_weights = prms.class_weights
+
+    yhat = model.predict_generator(datagen_train_output,
+                          steps=STEPS_PER_EPOCH,
+                          verbose=1,)
+
+    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
+    dfdict.update({"files":datagen_train_output.filenames, "label": datagen_train_output.classes})
+    ##ipdb.set_trace()
+    dfres = pd.DataFrame(dfdict)
+    dfres.to_csv("predictions_train.csv", index=False)
+
+    #########################################
+    # TRAIN FLIPPED: same pass with the second-to-last image axis reversed
+    SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
+    STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
+
+    print('='*50)
+    print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
+    print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
+    print('='*50)
+    if prms.class_weights == 'auto':
+        class_weights = get_class_weights(datagen_val_output)
+    else:
+        class_weights = prms.class_weights
+
+    train_datagen.preprocessing_function = lambda x: x[...,::-1,:]
+    datagen_train_output = train_datagen.flow_from_directory(
+        prms.data_train, 
+        shuffle=False, **flowfromdir_params)
+
+    yhat = model.predict_generator(datagen_train_output,
+                          steps=STEPS_PER_EPOCH,
+                          verbose=1,)
+
+    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
+    dfdict.update({"files":datagen_train_output.filenames, "label": datagen_train_output.classes})
+    ##ipdb.set_trace()
+    dfres = pd.DataFrame(dfdict)
+    dfres.to_csv("predictions_train_fliplr.csv", index=False)
@@ -0,0 +1,224 @@
+from inception_short import get_model, get_num_files, get_class_weights
+from keras.optimizers import Adam
+from image import ImageDataGenerator
+#from keras.preprocessing.image import ImageDataGenerator
+from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
+from checkpoint_utils import CSVWallClockLogger
+from shutil import copy2
+
+class AttrDict(dict):
+    """A dict that exposes its keys as attributes (`d.key` is `d["key"]`)."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Using the mapping as its own instance __dict__ keeps item access
+        # and attribute access in sync in both directions.
+        self.__dict__ = self
+
+import sys
+import os
+import yaml
+import numpy as np
+import keras
+from hashlib import md5
+os.environ["PYTHONHASHSEED"]='0'
+os.environ['KERAS_BACKEND'] = 'tensorflow'
+os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
+os.environ["CUDA_VISIBLE_DEVICES"]="1"
+
+# Run settings for this training script.
+# The md5 of str(prms) is used further down to name the checkpoint
+# directory, so changing any value here changes that directory name.
+prms = AttrDict(
+    dropout=0.5,
+    base_trainable=False,
+    horizontal_flip = True,
+    vertical_flip = False,
+    zoom_range = [0.8, 1.2],
+    rotation_range = 30,
+    fill_mode='reflect',
+    ndense=0,
+    batch_size = 16,
+    init_epoch=0,
+    nb_epoch = 500,
+    data_augmentation = True,
+    rescale = 1, #2**-8,
+    #contrast = 0.9,
+    truncate_quantile = None,#0.001,
+    ztransform = True,
+    # when True the main section also reads prms.sampling_factor, which is
+    # commented out just below
+    oversampling = False,
+    #sampling_factor = [1, 4],
+    seed=1,
+    width_shift_range = 0.125,
+    height_shift_range = 0.125,
+    class_mode =  'categorical', # 'binary', #
+    n_classes = 2,
+    final_activation = "softmax", # 'sigmoid',
+    lr = 1e-3,
+    samplewise_center = False, #True
+    # expanded to target_size = [target_side, target_side] after the hash is taken
+    target_side = 299,
+    #weights = None,
+    # None: the main section skips model.load_weights()
+    weightfile = None, #"checkpoints/6a1a17e4bcaabe458c145fd64dec0322/model.31-1.290145.hdf5",
+    #"checkpoints/6a1a17e4bcaabe458c145fd64dec0322/model.59-1.676424.hdf5",
+    data_train = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_train/',
+    data_val = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_test/',
+    classes = ["normal", "wire"],
+    class_weights=[1, 1],
+    # expanded as keyword arguments into the ReduceLROnPlateau callback
+    ReduceLROnPlateau = dict(
+        monitor='val_loss',
+        factor=1/2,
+        patience=32*2,
+        verbose=0,
+        mode='auto', epsilon=0.001,
+        cooldown=8,
+        min_lr=1e-12,
+        ),
+)
+
+
+# Fingerprint of the settings as written above; the keys added after this
+# point ("target_size", "loss") therefore do not influence it.
+paramhash = md5(str(prms).encode()).hexdigest()
+
+prms["target_size"] = [prms.target_side, prms.target_side]
+
+CHECKPOINT_DIR = "checkpoints/" + paramhash + "/"
+os.makedirs(CHECKPOINT_DIR, exist_ok=True)
+print("SAVING TO:\t%s" % CHECKPOINT_DIR)
+# Archive this script and the current settings next to the checkpoints.
+copy2(os.path.abspath(__file__), CHECKPOINT_DIR)
+with open(os.path.join(CHECKPOINT_DIR, "checkpoint.info"), "w+") as outfh:
+    yaml.dump(dict(prms), outfh, default_flow_style=False)
+# Added after the dump, so the loss name is not part of checkpoint.info.
+prms["loss"] = '{}_crossentropy'.format(prms.class_mode)
+
+# File-name template handed to ModelCheckpoint below.
+CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
+
+SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
+# Floor division: a trailing partial batch does not count as a step.
+STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
+
+print('='*50,
+      "samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH,
+      "steps per epoch in the train set: %d" % STEPS_PER_EPOCH,
+      '='*50,
+      sep='\n')
+#########################################
+checkpoint = ModelCheckpoint(
+    CHECKPOINT_PATH,
+    monitor='val_loss',
+    verbose=1,
+    save_best_only=True,
+    save_weights_only=False,
+    mode='auto',
+    period=1,
+)
+
+csv_path = os.path.join(CHECKPOINT_DIR, "progresslog.csv")
+csv_callback = CSVWallClockLogger(csv_path, separator=',', append=False)
+
+callback_list = [checkpoint, csv_callback]
+
+# The plateau scheduler is added only when its settings exist and are truthy.
+if prms.get("ReduceLROnPlateau"):
+    callback_list.append(ReduceLROnPlateau(**prms["ReduceLROnPlateau"]))
+
+#########################################
+model = get_model(
+    n_classes=prms.n_classes,
+    final_activation=prms.final_activation,
+    ndense=prms.ndense,
+    #weights = prms.weights,
+    dropout=prms.dropout,
+    base_trainable=prms.base_trainable,
+)
+
+
+#from keras.utils import plot_model
+#plot_model(model, to_file='model.png')
+
+if __name__ == '__main__':
+    tracked_metrics = ['accuracy']  # acc_0, acc_1, acc_2, acc_3, acc_4 unused
+    model.compile(
+        optimizer=Adam(lr=prms.lr),
+        loss=prms.loss,
+        metrics=tracked_metrics,
+    )
+
+    # Optionally resume from previously saved weights.
+    if prms.weightfile:
+        print("loading weights from:\t%s" % prms.weightfile)
+        model.load_weights(prms.weightfile)
+
+    print('Using real-time data augmentation.')
+
+    # Keyword arguments shared by every flow_from_directory() call.
+    flowfromdir_params = {
+        # "color_mode": "grayscale",
+        "target_size": prms.target_size,
+        "batch_size": prms.batch_size,
+        "class_mode": prms.class_mode,
+        "classes": prms.classes,
+        "seed": prms.seed,
+    }
+    # Normalisation settings shared by the train and validation generators.
+    # NOTE(review): samplewise_std_normalization is fed from
+    # prms.samplewise_center -- confirm that coupling is intended.
+    norm_params = {
+        "rescale": prms.rescale,
+        "samplewise_center": prms.samplewise_center,
+        "samplewise_std_normalization": prms.samplewise_center,
+        "featurewise_center": False,
+        "featurewise_std_normalization": False,
+        "zca_whitening": False,
+        "z_transform": prms.ztransform,
+    }
+
+    def _ztransform(x):
+        """Shift `x` to zero mean and scale it by its standard deviation."""
+        centered = x - np.mean(x)
+        return centered / np.std(x)
+
+    # Select the preprocessing function named in the settings; without such
+    # a setting it is the identity.
+    # NOTE(review): the selected function is not passed on anywhere in the
+    # rest of this script.
+    if 'preprocessing_function' not in prms:
+        preprocessing_function = lambda x: x
+    elif prms.preprocessing_function=='ztransform':
+        preprocessing_function = _ztransform
+    elif prms.preprocessing_function=='m1p1':
+        preprocessing_function = lambda x: x/128.0 - 1
+    else:
+        raise ValueError("unknown preprocessing_function")
+
+    # The augmentation settings are only read when augmentation is enabled.
+    if prms.data_augmentation:
+        print('Using real-time data augmentation.')
+        augment_params = dict(
+            zoom_range=prms.zoom_range,
+            fill_mode=prms.fill_mode,
+            rotation_range=prms.rotation_range,
+            width_shift_range=prms.width_shift_range,
+            height_shift_range=prms.height_shift_range,
+            horizontal_flip=prms.horizontal_flip,
+            vertical_flip=prms.vertical_flip,
+            contrast=prms.contrast if "contrast" in prms else None,
+            truncate_quantile=prms.truncate_quantile,
+            #histeq_alpha=prms.histeq_alpha,
+        )
+    else:
+        augment_params = {}
+    train_datagen = ImageDataGenerator(**augment_params, **norm_params)
+
+    val_datagen = ImageDataGenerator(**norm_params)
+
+    # Training stream: shuffled. stratify / sampling_factor / oversampling
+    # are keywords of the project's own `image` module; sampling_factor is
+    # only read from the settings when oversampling is switched on.
+    datagen_train_output = train_datagen.flow_from_directory(
+        prms.data_train,
+        stratify = prms.oversampling,
+        sampling_factor=prms.sampling_factor if prms.oversampling else None,
+        oversampling=prms.oversampling,
+        shuffle=True, **flowfromdir_params)
+
+    # Validation stream: fixed order, normalisation only.
+    datagen_val_output = val_datagen.flow_from_directory(
+        prms.data_val, shuffle=False, **flowfromdir_params)
+
+    #VALIDATION_STEPS = get_num_files(prms.data_val) // prms.batch_size
+    # Round up so a final partial batch is validated too. np.ceil returns a
+    # float, while Keras step arguments are step *counts*, hence the int().
+    VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
+    print("validation steps", VALIDATION_STEPS)
+    #########################################
+    # NOTE(review): 'auto' derives the weights from the *validation* stream;
+    # confirm that the training stream was not intended.
+    if prms.class_weights == 'auto':
+        class_weights = get_class_weights(datagen_val_output)
+    else:
+        class_weights = prms.class_weights
+
+    model.fit_generator(datagen_train_output,
+                          steps_per_epoch=STEPS_PER_EPOCH,
+                          epochs=prms.nb_epoch, verbose=1,
+                          validation_data=datagen_val_output,
+                          validation_steps=VALIDATION_STEPS,
+                          #class_weight='auto',
+                          class_weight=class_weights,
+                          callbacks=callback_list,
+                          initial_epoch=prms.init_epoch)
+
+    # Fresh iterator so the final evaluation starts at the first batch.
+    datagen_val_output = val_datagen.flow_from_directory(
+        prms.data_val, shuffle=False, **flowfromdir_params)
+
+    # evaluate_generator returns one value per compiled loss/metric; the
+    # model is compiled with a single metric, giving (loss, accuracy).
+    print("""loss\t%.4f
+    accuracy\t%.4f\n""" %
+      tuple(model.evaluate_generator(datagen_val_output,
+                                     steps=VALIDATION_STEPS,
+                                     workers=1,
+                                    pickle_safe=True)))
+
+
+    #model.predict()
@@ -0,0 +1,245 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jun  9 11:00:55 2017
+
+@author: dlituiev
+"""
+
+import os
+from collections import Counter
+from functools import partial
+from itertools import product
+
+import keras
+from keras.applications.inception_v3 import InceptionV3
+from keras.preprocessing import image
+from keras.models import Model
+from keras.layers import Dense, GlobalAveragePooling2D, GaussianNoise, Input
+from keras import backend as K
+from keras.preprocessing.image import ImageDataGenerator
+from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping
+from keras.layers import Dense, Dropout, Activation, Flatten, Lambda, BatchNormalization, Input
+from keras.optimizers import Adam
+
+#########################################
+def get_num_files(parentdir):
+    numfiles = 0
+    for dd in os.scandir(parentdir):
+        dd = os.path.join(parentdir, dd)
+        if os.path.isdir(dd):
+            numfiles+= sum((1 for ff in os.scandir(dd)))
+    return numfiles
+#########################################
+#########################################
+#          SET UP THE NETWORK
+#########################################
+def get_model(n_classes, final_activation,
+              ndense=512, dropout=0.5,
+              weights='imagenet',
+              input_shape = [None, None, 3],
+              gaussian_noise_sigma = None,
+              input_tensor = None,
+              base_trainable=False):
+
+    if input_shape:
+        input_tensor = Input(shape = input_shape)
+    if gaussian_noise_sigma is not None:
+        input_tensor = GaussianNoise(gaussian_noise_sigma)(input_tensor)
+    # create the base pre-trained model
+    base_model = InceptionV3(weights=weights, include_top=False,
+                             input_tensor = input_tensor,
+                            )
+    # get third Concatenation layer and crop the network on it:
+    cc=0
+    poptherest = False
+    for nn, la in enumerate(base_model.layers):
+        if type(la) is keras.layers.Concatenate:
+            if cc==3:
+                x = la.output
+                break
+            cc+=1
+    base_model.layers = base_model.layers[:nn+1]
+
+    #x = [la.output for la in base_model.layers if type(la) is keras.layers.Concatenate][3]
+    x = GlobalAveragePooling2D()(x)
+    # let's add a fully-connected layer
+    x = Dropout(dropout)(x)
+
+    if ndense>0:
+        x = Dense(ndense, activation='relu')(x)
+    # and a logistic layer -- let's say we have 200 classes
+    predictions = Dense(n_classes, activation=final_activation)(x)
+
+    # this is the model we will train
+    model = Model(inputs=base_model.input, outputs=predictions)
+
+    # first: train only the top layers (which were randomly initialized)
+    # i.e. freeze all convolutional InceptionV3 layers
+    if not base_trainable:
+        for layer in base_model.layers:
+            layer.trainable = False
+
+    last_module_index = [nn for nn,la  in enumerate(model.layers) if type(la) is keras.layers.Concatenate][-2]
+
+    for layer in model.layers[last_module_index:]:
+        layer.trainable = True
+    return model
+
+
+def get_class_weights(datagen_val_output):
+    counter = Counter(datagen_val_output.classes)
+    print("distribution of labels in {}:\n{}".format(datagen_val_output.directory, str(counter)))
+    for kk,vv in counter.items():
+        counter[kk] = vv+1
+
+    max_val = float(max(counter.values()))
+
+    class_weights = {class_id : max_val/num_images for class_id, num_images in counter.items()}                     
+    return class_weights
+
+
+
+def w_categorical_crossentropy(weights):
+    def _w_categorical_crossentropy(y_true, y_pred, weights):
+        nb_cl = len(weights)
+        final_mask = K.zeros_like(y_pred[:, 0])
+        y_pred_max = K.max(y_pred, axis=1)
+        y_pred_max = K.expand_dims(y_pred_max, 1)
+        y_pred_max_mat = K.equal(y_pred, y_pred_max)
+        for c_p, c_t in product(range(nb_cl), range(nb_cl)):
+
+            final_mask += (K.cast(weights[c_t, c_p],K.floatx()) *
+                           K.cast(y_pred_max_mat[:, c_p] ,K.floatx()) *
+                           K.cast(y_true[:, c_t],K.floatx())
+                          )
+        return K.categorical_crossentropy(y_pred, y_true) * final_mask
+
+    ncce = partial(_w_categorical_crossentropy, weights=weights)
+    ncce.__name__ ='w_categorical_crossentropy'
+    return ncce
+
+
+if __name__ == '__main__':
+    import numpy as np
+    import keras
+    #csv_path = CHECKPOINTS_BASE + ".log.csv"
+    #csv_callback = keras.callbacks.CSVLogger(csv_path, separator=',', append=False)
+    os.environ['KERAS_BACKEND'] = 'tensorflow'
+    os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
+    os.environ["CUDA_VISIBLE_DEVICES"] = '2'
+
+    NDENSE=256 #512
+    BATCH_SIZE = 128
+    NB_EPOCH = 20
+    DATA_AUGMENTATION = True
+    SEED=0
+    CLASS_MODE = 'binary' # 'categorical'
+    LOSS = '{}_crossentropy'.format(CLASS_MODE)
+    N_CLASSES = 1
+    FINAL_ACTIVATION = 'sigmoid'
+    LR = 0.0001
+    SAMPLEWISE_CENTER = False #True
+
+    TARGET_SIDE = 99
+    TARGET_SIZE = [TARGET_SIDE]*2
+
+    BASE_TRAINABLE=False
+    CHECKPOINT_DIR = "./modelstate_withx_negloglr{:d}_ndense{:d}_imsize{:d}{}/" .format(
+                    int(-np.log10(LR)),
+                    NDENSE,
+                    TARGET_SIDE,
+                    "" if not BASE_TRAINABLE else "_base_trainable"
+                    )
+    CHECKPOINT_PATH = CHECKPOINT_DIR + 'model.{epoch:02d}-{val_loss:2f}.hdf5'
+
+    WEIGHTFILE = None # "./modelstate_withx_negloglr4_ndense256/model.39-0.060567.hdf5" # None # "./modelstate_withx/model.03-0.067136.hdf5"
+    # "modelstate_laplace_inv_weights_2/model.10-0.014968.hdf5" #CHECKPOINT_DIR + "model.10-0.019602.hdf5"
+    INIT_EPOCH=0
+    # indir = "/data/dlituiev/learn_spotmag_from_images/modelstate/"
+    # find_min_loss_checkpoint(indir)
+
+
+    DATA_TRAIN = '/data/UCSF_MAMMO/2017-07-png/withx_valset_4000_train/'
+    DATA_VAL = '/data/UCSF_MAMMO/2017-07-png/withx_valset_4000_test/'
+    SAMPLES_PER_EPOCH = get_num_files(DATA_TRAIN)
+    STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // BATCH_SIZE
+
+    CLASSES = ["normal", "special"]
+
+    VALIDATION_STEPS = get_num_files(DATA_VAL) // BATCH_SIZE
+    print('='*50)
+    print("validation steps", VALIDATION_STEPS)
+    print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
+    print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
+    print('='*50)
+    #########################################
+    os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)
+    checkpoint = ModelCheckpoint(CHECKPOINT_PATH, monitor='val_loss', verbose=1,
+            save_best_only=False, save_weights_only=False, mode='auto', period=1)
+    callbacks_list =[checkpoint]
+
+    #########################################
+    model = get_model(n_classes=N_CLASSES,
+                      final_activation=FINAL_ACTIVATION,
+                      ndense=NDENSE,
+                      dropout=0.5,
+                      base_trainable=BASE_TRAINABLE)
+
+
+    #from keras.utils import plot_model
+    #plot_model(model, to_file='model.png')
+
+
+    model.compile(optimizer=Adam(lr=LR), loss=LOSS, metrics=['accuracy'],
+                  callbacks = [csv_callback])
+    #########################################
+    if WEIGHTFILE:
+        print("loading weights from:\t%s" % WEIGHTFILE)
+        model.load_weights(WEIGHTFILE)
+
+    print('Using real-time data augmentation.')
+
+    flowfromdir_params = dict(
+        #color_mode = "grayscale",
+        target_size=TARGET_SIZE,
+        batch_size=BATCH_SIZE,
+        class_mode=CLASS_MODE,
+        classes=CLASSES,
+        seed=SEED)
+
+    train_datagen = ImageDataGenerator(
+        samplewise_center=SAMPLEWISE_CENTER,
+        samplewise_std_normalization=SAMPLEWISE_CENTER,
+        featurewise_center=False,
+        featurewise_std_normalization=False,
+        zca_whitening=False,
+        rotation_range=10,
+        width_shift_range=0.125,
+        height_shift_range=0.125,
+        horizontal_flip=True,
+        vertical_flip=False)
+
+    val_datagen = ImageDataGenerator()
+
+    datagen_train_output = train_datagen.flow_from_directory(
+        DATA_TRAIN, shuffle=True, **flowfromdir_params)
+
+    datagen_val_output = val_datagen.flow_from_directory(
+        DATA_VAL, shuffle=False, **flowfromdir_params)
+
+    class_weights = get_class_weights(datagen_val_output)
+
+    model.fit_generator(datagen_train_output,
+                          steps_per_epoch=STEPS_PER_EPOCH,
+                          epochs=NB_EPOCH, verbose=1,
+                          validation_data=datagen_val_output,
+                          validation_steps=VALIDATION_STEPS,
+                          #class_weight='auto',
+                          class_weight=class_weights,
+                          callbacks=callbacks_list,
+                          initial_epoch=INIT_EPOCH)
+
+
+
+    #model.predict()
@@ -0,0 +1,23 @@
+Cython==0.27.3
+h5py==2.7.0
+imgaug==0.2.5
+Keras==2.0.8
+-e git+https://github.com/raghakot/keras-vis@40b27dfa3ecb84cdde5ec6b44251923c3266cc40#egg=keras_vis
+lime==0.1.1.29
+matplotlib==2.0.2
+mudicom==0.1.2
+numpy==1.14.0
+opencv-python==3.3.0.10
+pandas==0.20.2
+Pillow==4.1.1
+pyaml==17.7.2
+-e git+https://github.com/cocodataset/cocoapi/@727b546dd9fa4e4bb113213c98a3925829fac0bf#egg=pycocotools&subdirectory=PythonAPI
+pydicom==0.9.9
+PyYAML==3.12
+scikit-image==0.13.0
+scikit-learn==0.18.1
+scipy==0.19.1
+seaborn==0.7.1
+sklearn==0.0
+tensorflow-gpu==1.4.1
+tensorflow-tensorboard==0.4.0rc3