initial

2026-06-27 16:10:25 +08:00 · 2018-10-12 17:38:36 -07:00
commit 0fede818d7
26 changed files with 12311 additions and 0 deletions
@@ -0,0 +1,3 @@
 **/*.hdf5
 **/*.csv
@@ -0,0 +1,42 @@
 # Code for automatic labeling of special diagnostic mammography views from images and DICOM headers
 ## DICOM
 ### Extract selected fields from DICOM headers
    dicom_header_extraction/extract_dicom_headers_w_generator_150K.py
 ### Normalize / expand data
    dicom_header_extraction/normalize_selected_dcm_headers.py
 ### Machine learning on DICOM headers
     caret_on_headers.R       # tree-based models: gbm3, caret GBM, rpart, XGBoost
     caret_on_headers_nona.R  # GLMNET, trained on the NA-free header table
 ## Image pipeline
 ### General image model
 - scripts and config files: `image_classifiers/e5ce2d69b035975cb5336cec0da9a32a`
 - weight files:
 ### Wire localization model
 - scripts and config files: `image_classifiers/e8e71fc090141d7c6fb334359152d295`
 - weight files:
 ## Visualization of performance metrics 
 Scripts used to generate Fig. 1
    combine_predictions_hdr_and_img.ipynb
    visualize_predictions_hdr_and_img.ipynb
 ## Significance tests
 Scripts used to generate Supplementary Figures S1 & S2
    calc_auroc_confidence_intervals.R
    plot_auroc_difference_pvalue.ipynb
@@ -0,0 +1,169 @@
 rm(list=ls())
 library(pROC)
 library(ggplot2)
 library(ggsignif)
 library(dplyr)
 library(data.table)
 # Read a gzip-compressed delimited text file into a plain data.frame.
 #
 # The file is decompressed by shelling out to `zcat`; the resulting stream is
 # parsed by fread() with header = TRUE and fill = TRUE.
 #
 # filename: path to the .gz file. It is tilde-expanded and then shell-quoted,
 #           so paths containing spaces or shell metacharacters reach zcat
 #           verbatim instead of being re-interpreted by the shell.
 # ...:      further arguments forwarded to fread() (e.g. sep, select).
 # Returns:  a data.frame with the parsed contents.
 read.gz <- function(filename, ...){
   as.data.frame(fread(paste("zcat <", shQuote(path.expand(filename))),
                       header=TRUE,  fill = TRUE, ...))
 }
 # Load the combined prediction table for the image-model run identified by
 # `tag`, keep only the rows that carry a label, and split off the subsets
 # used below.
 tag <- "e5ce2d69b035975cb5336cec0da9a32a"
 fnall <- paste0("../tables/all_predictions_with_images-", tag,".tab")
 # header/fill used to be passed to as.data.frame(), where they cannot affect
 # parsing; they are dropped so the call says what it actually does.
 predictions <- as.data.frame(fread(fnall, sep='\t'))
 # Rows with a non-empty label string (vectorised; no per-element sapply).
 labelled <- nchar(predictions$label) > 0
 print(nrow(predictions[labelled,]))
 predictions <- predictions[labelled,]
 # ViewModifier becomes a 0/1 score: 1 when the field is non-empty.
 predictions[,'ViewModifier'] <- as.numeric(predictions[,'ViewModifier']!='')
 predictions[, "label"] <- factor(predictions[, "label"], c('normal', 'special'))
 predictions[,"view"] <- factor(predictions[,"view"], c('N','M','T','W','X'))
 head(predictions)
 # `holdout` must exist before it is plotted; its definition was commented
 # out, which made the ggplot() call below fail with "object not found".
 holdout <- predictions[predictions$set == 'val',]
 ggplot(holdout, aes(view, `score_max_wire_image+gbmt`)) + geom_point()
 validation <- predictions[predictions$set == 'test',]
 # Column bookkeeping: raw column names, the non-score columns, and the model
 # score names (after the "score_" prefix is stripped) in reporting order.
 clmns <- colnames(predictions)
 othercols <- c("id", "set", "view", "label")
 modelnames <- c(
   # header-derived scores
   "ViewModifier", "rpart", "gbm", "glmnet", "xgb", "gbmt",
   # image / wire model scores
   "image", "image_max", "wire", "wire_max",
   # combinations
   "max_image_wire_max", "image+gbmt", "max_wire_max_image+gbmt",
   "max_image_wire", "max_wire_image+gbmt"
 )
 # Remove every literal occurrence of "score_" from the given name(s).
 #
 # x:       character vector of column names.
 # Returns: character vector of the same length with "score_" deleted.
 clean_score_names <- function(x){
   gsub("score_", "", x, fixed = TRUE)
 }
 # Rename the columns of `validation` to their cleaned names and keep only the
 # columns listed in c(othercols, modelnames), in that order.
 # NOTE(review): clmns_clean is not used again in the visible code.
 clmns_clean <-  vapply(clmns, clean_score_names, '')
 # As a factor with fixed levels, every cleaned name that is not a known
 # column becomes NA.
 cols_ <-  factor(vapply(colnames(predictions) , clean_score_names, ''),
                 c(othercols,modelnames))
 # Presumably the factor is coerced to its labels here, leaving NA names for
 # unknown columns -- the next line relies on that.
 colnames(validation) <-  cols_
 # Drop the columns whose name is NA (i.e. not in othercols/modelnames).
 validation <- validation[,!is.na(colnames(validation))]
 cols_ <- cols_[!is.na(cols_)]
 # order() on a factor sorts by level, i.e. by position in c(othercols, modelnames).
 cols_ <- cols_[order(cols_)]
 validation <- validation[,as.character(cols_)]
 colnames(validation)
 # clmns <-clmns[vapply(clmns, function(x) strsplit(x, '_')[[1]][1]=='score', TRUE)]
 ## Perform McNemar's test for prediction difference ---------------------------------------------------
 # Both scores are thresholded at 0.5 and cross-tabulated on the test rows.
 # First test: combined wire/image/gbmt score vs. the image+wire-only score.
 mcnemar.test(table(validation$`max_wire_max_image+gbmt`>0.5, validation$max_image_wire_max>0.5))
 # Second test: the same combined score vs. the header-only gbmt score.
 mcnemar.test(table(validation$`max_wire_max_image+gbmt`>0.5, validation$gbmt>0.5))
 ## Calculate significance of pairwise auROC differences -----------------------------------------------
 # For every model score, plot the ROC curve of the score against the label on
 # the test rows, and store the ROC object plus the confidence interval of its
 # AUC, both keyed by model name. (The unused counter `ii` was removed.)
 cis <- list()
 rocobjects <- list()
 for (clmn in modelnames){
   print('====================')
   print(clmn)
   # percent=TRUE: the AUC and axes are expressed on a 0-100 scale.
   rocobj   <- plot.roc(  validation[, "label"],
                          validation[,clmn],
                          levels = (levels(validation[, "label"])),
                          xlim = c(100,0),
                          ylim = c(0,100),
                          percent=TRUE,
                          print.auc=TRUE)
   rocobjects[[clmn]] <- rocobj
   cis[[clmn]] <- ci(rocobj, of="auc", thresholds="best")
 }
 ## Wire model on wire cases
 # Same ROC/CI computation as for the label, but the response is whether the
 # view is 'W' rather than the normal/special label.
 # NOTE(review): the ROC objects are stored under the plain model name, so they
 # replace any existing rocobjects[['wire']] / rocobjects[['wire_max']] entries,
 # whereas the CIs get a separate ' (vs other views)' key -- confirm intended.
 is_wire_view <- validation[, "view"]=='W'
 for (wire_model in c('wire', 'wire_max')){
   print('====================')
   print(wire_model)
   wire_roc <- plot.roc(is_wire_view,
                        validation[,wire_model],
                        xlim = c(100,0),
                        ylim = c(0,100),
                        percent=TRUE,
                        print.auc=TRUE)
   rocobjects[[wire_model]] <- wire_roc
   ci_key <- paste0(wire_model, ' (vs other views)')
   cis[[ci_key]] <- ci(wire_roc, of="auc", thresholds="best")
 }
 ###
 # Models to report, in display order. This redefinition drops
 # 'max_image_wire' and 'max_wire_image+gbmt' and adds the two
 # '(vs other views)' entries produced for the wire models.
 modelnames <- c('ViewModifier', 'rpart', 'gbm', 'glmnet','xgb', 'gbmt',
                'image', "image_max",
                'wire', 'wire_max',
                'wire (vs other views)', 'wire_max (vs other views)',
                'max_image_wire_max',
                'image+gbmt',
                'max_wire_max_image+gbmt')
 ##
 # One row per entry of `cis`; the three columns are named lower / auROC / upper.
 dfcis <- as.data.frame(t(do.call(cbind.data.frame, lapply(cis, as.vector))))
 colnames(dfcis) <- c('lower', 'auROC', 'upper')
 # Entries of `cis` that are not in modelnames become NA and are dropped below.
 dfcis[,"model"] <- factor(rownames(dfcis),
                           modelnames)
 dfcis <-  dfcis[!is.na(dfcis[,"model"]),]
 rownames(dfcis) <- dfcis[,"model"] 
 # Reorder the rows by name to follow modelnames.
 dfcis <- dfcis[modelnames,]
 # dfcis <-dfcis %>% mutate(model = factor(model, levels=rev(levels(model))))
 # Variant without the label-based 'wire' / 'wire_max' rows; factor() drops
 # the now-unused levels.
 dfcis_nowire <- dfcis[!(rownames(dfcis) %in% c('wire','wire_max')),]
 dfcis_nowire$model <-  factor(dfcis_nowire$model)
 # 
 # 
 # annotation_df <- data.frame(color=c("E", "H"), 
 #                             start=c("Good", "Fair"), 
 #                             end=c("Very Good", "Good"),
 #                             y=c(3.6, 4.7),
 #                             label=c("Comp. 1", "Comp. 2"))
 # Single comparison of the ViewModifier and gbmt ROC curves.
 roc.test(rocobjects[["ViewModifier"]], rocobjects[["gbmt"]])
 ## Format pairwise comparisons
 # Fill the lower triangle (diagonal included) of a model-by-model table with
 # the p-value of roc.test(method='delong') for each pair of stored ROC
 # curves, then write the table to a CSV tagged with the run id.
 keys <- names(rocobjects)
 dfcompar <- data.frame()
 # seq_along()/seq_len() keep the loops from running when rocobjects is empty
 # (1:length(x) would iterate over c(1, 0)).
 for (a in seq_along(rocobjects)){
   for (b in seq_len(a)){
     name_a <- keys[a]
     name_b <- keys[b]
     # Pairs where either AUC equals 100 (percent scale) are recorded as NA
     # and not passed to roc.test().
     if ((as.numeric(rocobjects[[name_a]]$auc)==100)||(as.numeric(rocobjects[[name_b]]$auc)==100)){
       dfcompar[name_a, name_b] <- NA
     } else {
       dfcompar[name_a, name_b] <- roc.test(rocobjects[[name_a]], rocobjects[[name_b]], method='delong')$p.value
     }
   }
 }
 fn.comparison <- paste0("../tables/auroc_delong_comparison-", tag,".csv")
 write.csv(dfcompar, file=fn.comparison)
@@ -0,0 +1,284 @@
 # coding: utf-8
 rm(list=ls())
 library(caret)
 library(gbm3)
 library(data.table)
 library(ggplot2)
 library(fastmatch)
 # Read a gzip-compressed delimited text file into a plain data.frame.
 #
 # The file is decompressed by shelling out to `zcat`; the resulting stream is
 # parsed by fread() with header = TRUE and fill = TRUE.
 #
 # filename: path to the .gz file. It is tilde-expanded and then shell-quoted,
 #           so paths containing spaces or shell metacharacters reach zcat
 #           verbatim instead of being re-interpreted by the shell.
 # ...:      further arguments forwarded to fread() (e.g. sep, select).
 # Returns:  a data.frame with the parsed contents.
 read.gz <- function(filename, ...){
   as.data.frame(fread(paste("zcat <", shQuote(path.expand(filename))),
                       header=TRUE,  fill = TRUE, ...))
 }
 # Load the list of ids and the expanded DICOM-header feature table, then keep
 # one feature row per unique id (ids without a matching filename are dropped).
 TABLEDIR <- "../tables/"
 fn_ids <- paste(
   TABLEDIR,
   "2017-06-mammo_tables/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl.csv.gz",
   sep = "/"
 )
 ids <- read.gz(fn_ids, select = "id")$id
 fn_features <- paste(
   TABLEDIR,
   "mammo_dicom_headers/df_all_mammos_dicom_headers_selected_expanded.tab.gz",
   sep = "/"
 )
 dffeatures <- read.gz(fn_features, sep = "\t")
 print(nrow(dffeatures))
 print(length(ids))
 # Unmatched ids give NA positions, hence all-NA rows, removed right after.
 matched_rows <- fmatch(unique(ids), dffeatures$filename)
 dffeatures <- dffeatures[matched_rows, ]
 dffeatures <- dffeatures[!is.na(dffeatures$filename), ]
 rm(ids, matched_rows)
 # Data formatting -----------------------------------------
 # Coerce selected columns to numeric, index rows by filename, drop excluded
 # columns and turn the categorical columns into factors.
 collist <- c("BodyPartThickness", "XRayTubeCurrentInuA", "ContentTime",
              "DetectorTemperature", "WindowCenter", "FieldOfViewRotation")
 dffeatures[collist] <- lapply(dffeatures[collist], as.numeric)
 # Show which columns are still stored as character.
 dtypes <- sapply(dffeatures, class)
 names(dtypes[dtypes == "character"])
 rownames(dffeatures) <- dffeatures$filename
 excludeCols <- c("filename",
                  "CollimatorLeftVerticalEdge",
                  "CollimatorLowerHorizontalEdge",
                  "DistanceSourceToEntrance",
                  "ExposuresOnDetectorSinceLastCalibration",
                  "ExposuresOnDetectorSinceManufactured",
                  "ShutterLowerHorizontalEdge",
                  "ShutterRightVerticalEdge",
                  "XRayTubeCurrentInuA")
 # "ManufacturerModelName" is deliberately not excluded.
 is_kept <- !(colnames(dffeatures) %in% excludeCols)
 dffeatures <- dffeatures[, is_kept]
 catcols <- c("ViewModifierCodeMeaning",
              "ViewCodeValue",
              "DetectorActiveDimensionsMissing",
              "FieldOfViewOriginMissing",
              "Grid",
              "Manufacturer",
              "ManufacturerModelName")
 dffeatures[catcols] <- lapply(dffeatures[catcols], as.factor)
 #cell#
 # Number of missing values per column.
 colSums(sapply(dffeatures, is.na))
 # Read labels  --------------------------------
 # Load the labelled train/val/test split, attach the label to the matching
 # feature rows, and build the per-set data frames.
 fn.labelledset <- paste(TABLEDIR, "spotmag_predictions/train_test_split-2018-02-15-within7e5.csv", sep = "/")
 df.labelled <- as.data.frame(fread(fn.labelledset))
 rownames(df.labelled) <- df.labelled$id
 vec.labelled <- df.labelled$id
 df.labelled$label <- as.factor(df.labelled$label)
 #cell#
 # Ids (row names) of the labelled rows belonging to one split.
 ids_in_set <- function(set_name) {
   rownames(df.labelled[df.labelled$set == set_name, ])
 }
 vec.labelled.valset <- ids_in_set("val")
 vec.labelled.tr_set <- ids_in_set("train")
 vec.labelled.ts_set <- ids_in_set("test")
 ############################################################
 # Rows are selected in the order of vec.labelled, so the label vector lines
 # up with them position by position.
 dffeatures.labelled <- dffeatures[vec.labelled, ]
 dffeatures.labelled$label <- df.labelled$label
 #cell#
 in_valset <- rownames(dffeatures.labelled) %in% vec.labelled.valset
 dffeatures.labelled.devset <- dffeatures.labelled[!in_valset, ]
 dffeatures.labelled.tr_set <- dffeatures.labelled[vec.labelled.tr_set, ]
 dffeatures.labelled.ts_set <- dffeatures.labelled[vec.labelled.ts_set, ]
 colnames(dffeatures.labelled.tr_set)
 # Report factor values that occur in the test set but not in the train set.
 factor_cols <- Filter(function(cc) is.factor(dffeatures.labelled.tr_set[, cc]),
                       colnames(dffeatures.labelled.tr_set))
 for (cc in factor_cols) {
   setdiff_ <- setdiff(dffeatures.labelled.ts_set[, cc], dffeatures.labelled.tr_set[, cc])
   if (length(setdiff_) > 0) {
     print(cc)
     print(setdiff_)
   }
 }
 # GBM3 ----------------------------------------
 # Fit a gbm3 model on the training set with 10 CV folds and 4 threads, look
 # at the CV- and OOB-based iteration estimates, and save the fitted model.
 par_detail <- gbmParallel(num_threads = 4)
 # training_params is left at its default; the values once tried were
 #   num_trees = 100, interaction_depth = 1, min_num_obs_in_node = 10,
 #   shrinkage = 0.005, bag_fraction = 0.5, num_features = 2
 gbmt_fit <- gbmt(
   label ~ .,
   data = dffeatures.labelled.tr_set,
   cv_folds = 10,
   keep_gbm_data = TRUE,
   par_detail = par_detail
 )
 best_iter_cv <- gbmt_performance(gbmt_fit, method = "cv")
 plot(best_iter_cv)
 best.iter.oob <- gbmt_performance(gbmt_fit, method = "OOB")
 plot(best.iter.oob)
 # The file name carries the CV-selected iteration count and today's date.
 gbmt_rds <- sprintf("gbm3_ntrees_%d_%s.rds", best_iter_cv, Sys.Date())
 saveRDS(gbmt_fit, gbmt_rds)
 ## Feature Importance Plotting ----------------
 # Plot the rescaled relative influence of every feature with non-zero
 # influence as horizontal bars on a log scale, save the figure as EPS and
 # PNG, and score all rows of dffeatures with the gbm3 model.
 infl_gbmt <- as.data.frame(relative_influence(gbmt_fit, best_iter_cv, rescale=TRUE))
 colnames(infl_gbmt) <- "relative influence"
 infl_gbmt[,"variable"] <- rownames(infl_gbmt)
 infl_gbmt <- infl_gbmt[infl_gbmt$`relative influence` >0,]
 plimp <- ggplot(data=infl_gbmt) +
   geom_segment(size=5, colour='blue') + 
   aes(x=reorder(variable,`relative influence`),
       xend = variable,
       # every bar starts at 2e-6 so it has a finite origin on the log axis
       y = 2e-6,
       yend=`relative influence`,
       label=`relative influence`) +
   scale_y_log10() + 
   ylab("relative influence") + xlab("") +
   coord_flip() +
   theme(axis.text.y = element_text(colour="black",size=16,angle=0,face="plain"),
         axis.text.x = element_text(colour="black",size=16,angle=0,face="plain"),
         axis.title.x = element_text(colour="black",size=16,angle=0,face="plain"),
         legend.background = element_rect(fill = "transparent"),
         legend.box.background = element_rect(fill = "transparent")
         )
 # NOTE(review): a plot holds one coordinate system, so the trailing
 # coord_flip() presumably replaces the coord_trans() limits -- confirm.
 plimp + coord_trans(limy= c(0.5e-6, 1.05)) + coord_flip()
 # ggsave() is a function call, not a layer: pass the plot explicitly rather
 # than adding ggsave() to it, which saved whatever last_plot() happened to be
 # and is rejected by current ggplot2.
 ggsave("img/xgbt_importances.eps", plot = plimp, device = 'eps', bg = "transparent",
        width = 8, height = 6, dpi = 300, units = "in" )
 ggsave("img/xgbt_importances.png", plot = plimp, device = 'png', bg = "transparent",
        width = 8, height = 6, dpi = 300, units = "in" )
 # Score every row; na.pass keeps rows with missing feature values.
 dffeatures[,"predictions_gbmt"] <- predict(gbmt_fit, newdata = dffeatures,
                                           n.trees = best_iter_cv,
                                           type = "response", na.action = na.pass)
 # GBM-CARET ---------------------------------------------------
 # Tune a caret "gbm" model by 10-fold CV on ROC over a small grid, plot the
 # relative influence of its features, save the fit and score all rows.
 control <- trainControl(
   method = "cv",
   number = 10,
   p = .8,
   savePredictions = TRUE,
   classProbs = TRUE,
   summaryFunction = twoClassSummary
 )
 tuneGrid <- expand.grid(
   n.trees = c(80, 100, 120, 140, 160),
   shrinkage = c(0.025, 0.05, 0.1, 0.2),
   interaction.depth = c(1, 2),
   n.minobsinnode = c(10, 15)
 )
 gbmFit1 <- train(
   label ~ .,
   data = dffeatures.labelled.tr_set,
   method = "gbm",
   na.action = na.pass,
   tuneGrid = tuneGrid,
   metric = "ROC",
   trControl = control,
   # presumably forwarded to gbm() itself -- confirm
   verbose = FALSE
 )
 gbmFit1
 ## Feature Importance Plotting ---------------------------------------------
 gbmsmmry <- summary(gbmFit1, normalize = TRUE, plotit = FALSE)
 gbmsmmry <- gbmsmmry[gbmsmmry$rel.inf > 0, ]
 ggplot(
   data = gbmsmmry,
   mapping = aes(x = reorder(var, rel.inf, sum),
                 xend = var,
                 y = 0.002,
                 yend = (rel.inf),
                 label = rel.inf)
 ) +
   geom_segment(size = 3, colour = 'red') +
   scale_y_log10() +
   ylab("relative influence") + xlab("") +
   coord_flip()
 saveRDS(gbmFit1, "gbm_ntrees80_interactiondepth2_shrinkage0.2_nminobsinnode15_trainset_2018-02-18.rds")
 # Probability of the "special" class for every row.
 gbm_probs <- predict(gbmFit1, newdata = dffeatures, type = "prob", na.action = na.pass)
 dffeatures[, "predictions_gbm"] <- gbm_probs$special
 # RPART -----------------------------------------------------------------
 # Tune the complexity parameter of a caret "rpart" tree on ROC, reusing the
 # existing `control` object, then score the test set and all rows.
 tuneGrid <- expand.grid(cp = c(0.0, 0.0125, 0.025, 0.05, 0.1, 0.2))
 rpartFit1 <- train(
   label ~ .,
   data = dffeatures.labelled.tr_set,
   method = "rpart",
   na.action = na.pass,
   tuneGrid = tuneGrid,
   metric = "ROC",
   trControl = control
 )
 varImp(rpartFit1)
 predictions.ts_set <- predict(rpartFit1,
                               newdata = dffeatures.labelled.ts_set,
                               type = 'prob', na.action = na.pass)
 # Probability of the "special" class for every row.
 rpart_probs <- predict(rpartFit1, newdata = dffeatures, type = "prob", na.action = na.pass)
 dffeatures[, "predictions_rpart"] <- rpart_probs$special
 # XGB ---------------------------------------------------------------------
 # Fit a caret "xgbTree" model with 10-fold CV on accuracy (caret's default
 # tuning grid), save it, score the test set and all rows, then write every
 # prediction column plus the two view columns to a tab-separated file.
 control <- trainControl(method = "cv", number = 10)
 xgbFit <- train(
   label ~ .,
   data = dffeatures.labelled.tr_set,
   method = "xgbTree",
   na.action = na.pass,
   metric = "Accuracy",
   trControl = control
 )
 varImp(xgbFit, scale = TRUE)
 as.data.frame(xgbFit$finalModel$params)
 xgbFit$bestTune
 saveRDS(xgbFit, sprintf("xgbtree_maxdepth1_subsample1_eta0.3_%s.rds", Sys.Date()))
 predictions.ts_set <- predict(xgbFit,
                               newdata = dffeatures.labelled.ts_set,
                               type = 'prob', na.action = na.pass)
 ## Save all predictions  ---------------------------------------------------------
 xgb_probs <- predict(xgbFit, newdata = dffeatures, type = "prob", na.action = na.pass)
 dffeatures[, "predictions_xgb"] <- xgb_probs$special
 # Every column whose name contains 'prediction', followed by the view columns.
 out_cols <- c(grep('prediction', colnames(dffeatures), value = TRUE),
               "ViewModifierCodeMeaning", "ViewCodeValue")
 write.table(dffeatures[, out_cols],
             file = "all_predictions_allmodels_trained_on_train.tab",
             quote = FALSE, sep = '\t')
@@ -0,0 +1,170 @@
 # coding: utf-8
 ############################################################################
 # stratify by BT column: those are 100% sure digital, others can be either
 ############################################################################
 rm(list=ls())
 setwd(dir = "~/repos/mammo/learn_spotmag_from_dicom_headers")
 #cell#
 library(caret)
 library(data.table)
 library(pROC)
 # install.packages(c("pROC"))
 library(ggplot2)
 library(fastmatch)
 # Read a gzip-compressed delimited text file into a plain data.frame.
 #
 # The file is decompressed by shelling out to `zcat`; the resulting stream is
 # parsed by fread() with header = TRUE and fill = TRUE.
 #
 # filename: path to the .gz file. It is tilde-expanded and then shell-quoted,
 #           so paths containing spaces or shell metacharacters reach zcat
 #           verbatim instead of being re-interpreted by the shell.
 # ...:      further arguments forwarded to fread() (e.g. sep, select).
 # Returns:  a data.frame with the parsed contents.
 read.gz <- function(filename, ...){
   as.data.frame(fread(paste("zcat <", shQuote(path.expand(filename))),
                       header=TRUE,  fill = TRUE, ...))
 }
 # Load the id list and the NA-free DICOM-header table, keep one feature row
 # per unique id, and prepare the column types for modelling.
 fn_ids <- "../tables/2017-06-mammo_tables/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl.csv.gz"
 ids <- read.gz(fn_ids, select = "id")$id
 fn_features <- "../tables/mammo_dicom_headers/df_all_mammos_dicom_headers_selected_nona.tab.gz"
 dffeatures <- read.gz(fn_features, sep = "\t")
 print(nrow(dffeatures))
 print(length(ids))
 # Unmatched ids give NA positions, hence all-NA rows, removed right after.
 matched_rows <- fmatch(unique(ids), dffeatures$filename)
 dffeatures <- dffeatures[matched_rows, ]
 dffeatures <- dffeatures[!is.na(dffeatures$filename), ]
 rm(ids, matched_rows)
 collist <- c("BodyPartThickness", "XRayTubeCurrentInuA", "ContentTime",
              "DetectorTemperature", "WindowCenter", "FieldOfViewRotation")
 dffeatures[collist] <- lapply(dffeatures[collist], as.numeric)
 dtypes <- sapply(dffeatures, class)
 rownames(dffeatures) <- dffeatures$filename
 excludeCols <- c("filename",
                  "CollimatorLeftVerticalEdge",
                  "CollimatorLowerHorizontalEdge",
                  "DistanceSourceToEntrance",
                  "ExposuresOnDetectorSinceLastCalibration",
                  "ExposuresOnDetectorSinceManufactured",
                  "ShutterLowerHorizontalEdge",
                  "ShutterRightVerticalEdge",
                  "XRayTubeCurrentInuA")
 # "ManufacturerModelName" is deliberately not excluded.
 is_kept <- !(colnames(dffeatures) %in% excludeCols)
 dffeatures <- dffeatures[, is_kept]
 catcols <- c("ViewModifierCodeMeaning",
              "ViewCodeValue",
              "DetectorActiveDimensionsMissing",
              "FieldOfViewOriginMissing",
              "Grid",
              "Manufacturer",
              "ManufacturerModelName")
 # Each categorical value is prefixed with "=" before becoming a factor level.
 dffeatures[catcols] <- lapply(dffeatures[catcols],
                               function(values) as.factor(paste0("=", values)))
 dffeatures[, "HighBit"] <- as.numeric(dffeatures[, "HighBit"])
 # Number of missing values per column.
 colSums(sapply(dffeatures, is.na))
 # Read labels ---------------------------------
 # Load the labelled train/val/test split, attach the label to the matching
 # feature rows, and build the per-set data frames.
 fn.labelledset <- "../tables/spotmag_predictions/train_test_split-2018-02-15-within7e5.csv"
 df.labelled <- as.data.frame(fread(fn.labelledset))
 rownames(df.labelled) <- df.labelled$id
 vec.labelled <- df.labelled$id
 df.labelled$label <- as.factor(df.labelled$label)
 #cell#
 # Ids (row names) of the labelled rows belonging to one split.
 ids_in_set <- function(set_name) {
   rownames(df.labelled[df.labelled$set == set_name, ])
 }
 vec.labelled.valset <- ids_in_set("val")
 vec.labelled.tr_set <- ids_in_set("train")
 vec.labelled.ts_set <- ids_in_set("test")
 ############################################################
 # Rows are selected in the order of vec.labelled, so the label vector lines
 # up with them position by position.
 dffeatures.labelled <- dffeatures[vec.labelled, ]
 dffeatures.labelled$label <- df.labelled$label
 in_valset <- rownames(dffeatures.labelled) %in% vec.labelled.valset
 dffeatures.labelled.devset <- dffeatures.labelled[!in_valset, ]
 dffeatures.labelled.tr_set <- dffeatures.labelled[vec.labelled.tr_set, ]
 dffeatures.labelled.ts_set <- dffeatures.labelled[vec.labelled.ts_set, ]
 table(dffeatures.labelled.tr_set$label)
 # Fraction of non-missing training rows per column; list columns below 10%.
 n_missing <- colSums(sapply(dffeatures.labelled.tr_set, is.na))
 goodrows <- 1 - n_missing / nrow(dffeatures.labelled.tr_set)
 names(goodrows[goodrows < 0.1])
 # Report factor values that occur in the test set but not in the train set.
 factor_cols <- Filter(function(cc) is.factor(dffeatures.labelled.tr_set[, cc]),
                       colnames(dffeatures.labelled.tr_set))
 for (cc in factor_cols) {
   setdiff_ <- setdiff(dffeatures.labelled.ts_set[, cc], dffeatures.labelled.tr_set[, cc])
   if (length(setdiff_) > 0) {
     print(cc)
     print(setdiff_)
   }
 }
 # GLMNET ---------------------------------------------------------------------
 # Fit regularised logistic regression on the header features: first a direct
 # cv.glmnet() lasso path, then five repeated caret runs over an alpha/lambda
 # grid. The fit from the last run is saved and used to score all rows.
 library(glmnet)
 # Using glmnet to directly perform CV
 set.seed(0)
 x_train <- model.matrix( ~ .-1, dffeatures.labelled.tr_set[,!(colnames(dffeatures.labelled.tr_set) %in% c("label"))])
 dim(x_train)
 cvob1 <- cv.glmnet(x=x_train,
                    y=dffeatures.labelled.tr_set[,"label"],
                    family="binomial",alpha=1, 
                    type.measure="auc", nfolds = 5, lambda = seq(0.001,0.1,by = 0.001),
                    standardize=FALSE)
 plot(cvob1)
 control <- trainControl(method="cv", number=5, returnResamp="all",
                         classProbs=TRUE, summaryFunction=twoClassSummary)
 tuneGrid <- expand.grid(alpha=c(0.00, 0.25, 0.50, 0.75, 0.99, 1.00), lambda = 10^seq(-5,-2,0.5))
 # Repeat the caret tuning to see how stable the selected parameters are;
 # result lists are preallocated, not grown inside the loop.
 n_repeats <- 5
 tune <- vector("list", n_repeats)
 fits <- vector("list", n_repeats)
 rocs <- vector("list", n_repeats)
 for (ii in seq_len(n_repeats)){
   glmnetFit <- train(label ~ ., data = dffeatures.labelled.tr_set, 
                      method = "glmnet",
                      na.action = na.pass,
                      tuneGrid=tuneGrid,
                      metric = "ROC",
                      trControl = control)
   fits[[ii]] <- glmnetFit
   tune[[ii]] <- glmnetFit$bestTune
   rocs[[ii]] <- max(glmnetFit$results$ROC)
 }
 tune
 # From here on glmnetFit is the fit from the last repetition.
 varImp(glmnetFit, scale=TRUE)
 as.data.frame(glmnetFit$bestTune)
 # The old sprintf("glmnet.rds", Sys.Date()) had no format specifier, so the
 # date was never part of the name; the literal keeps the same output file
 # without the unused argument.
 saveRDS(glmnetFit, "glmnet.rds")
 ## Save predictions  ---------------------------------------------------------
 dffeatures[,"predictions_glmnet"] <- predict(glmnetFit, newdata = dffeatures, type = "prob", na.action = na.pass)$special
 write.table(dffeatures[,c("predictions_glmnet"), drop=FALSE],
             file="all_predictions_glmnet.tab", quote=FALSE, sep='\t')
@@ -0,0 +1,763 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Applications/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
      "  return f(*args, **kwds)\n",
      "/Applications/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
      "  return f(*args, **kwds)\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tabledir = \"../tables/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(772423, 1)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fn = f\"{tabledir}/2017-06-mammo_tables/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl.csv.gz\"\n",
    "df_bt = pd.read_csv(fn, usecols=[\"id\", \"BT_case\"])\n",
    "df_bt.set_index(\"id\", inplace=True)\n",
    "df_bt = ~df_bt.isnull()\n",
    "df_bt.columns = [\"digital\"]\n",
    "df_bt.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>set</th>\n",
       "      <th>label</th>\n",
       "      <th>view</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1013372709_1.2.840.113654.2.70.1.175625299786291545159233542096043464711_3_1</th>\n",
       "      <td>test</td>\n",
       "      <td>normal</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1028995243_1.2.840.113654.2.70.1.56947963181878834591544466761404805157_45576_2</th>\n",
       "      <td>test</td>\n",
       "      <td>normal</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1105112884_1.2.840.113654.2.70.1.178729598744204462442695104630823323474_8905_2</th>\n",
       "      <td>test</td>\n",
       "      <td>normal</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1185125156_1.2.840.113654.2.70.1.45840593750642722243371816041014016032_2_4</th>\n",
       "      <td>test</td>\n",
       "      <td>normal</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1496452586_1.2.840.113654.2.70.1.5582568668770891599992528318631583880_1351_4</th>\n",
       "      <td>test</td>\n",
       "      <td>normal</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                     set   label view\n",
       "id                                                                   \n",
       "1013372709_1.2.840.113654.2.70.1.17562529978629...  test  normal    N\n",
       "1028995243_1.2.840.113654.2.70.1.56947963181878...  test  normal    N\n",
       "1105112884_1.2.840.113654.2.70.1.17872959874420...  test  normal    N\n",
       "1185125156_1.2.840.113654.2.70.1.45840593750642...  test  normal    N\n",
       "1496452586_1.2.840.113654.2.70.1.55825686687708...  test  normal    N"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "infile = f\"{tabledir}/spotmag_predictions/train_test_split-2018-02-16-within7e5-label.csv\"\n",
    "dflab = pd.read_csv(infile, index_col='id')\n",
    "dflab[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read header-based predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(772367, 1)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "infile = f\"{tabledir}/spotmag_predictions/all_predictions_glmnet.tab\"\n",
    "dfpred_glmnet = pd.read_table(infile, index_col=0)\n",
    "dfpred_glmnet.columns = [cc.replace(\"predictions\", \"score\") for cc in dfpred_glmnet.columns]\n",
    "dfpred_glmnet.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(772367, 5)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>score_gbm</th>\n",
       "      <th>score_xgb</th>\n",
       "      <th>score_rpart</th>\n",
       "      <th>score_xgbt</th>\n",
       "      <th>ViewModifierCodeMeaning</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149405_2104556</th>\n",
       "      <td>0.009005</td>\n",
       "      <td>0.020207</td>\n",
       "      <td>0.006882</td>\n",
       "      <td>0.059474</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149405_2104557</th>\n",
       "      <td>0.013337</td>\n",
       "      <td>0.016762</td>\n",
       "      <td>0.006882</td>\n",
       "      <td>0.059660</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149484_2141538</th>\n",
       "      <td>0.013337</td>\n",
       "      <td>0.016762</td>\n",
       "      <td>0.006882</td>\n",
       "      <td>0.061051</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2454166001_1.2.840.113654.2.70.1.269947926355209368181920716215505958953_149484_2141537</th>\n",
       "      <td>0.013337</td>\n",
       "      <td>0.016762</td>\n",
       "      <td>0.006882</td>\n",
       "      <td>0.061051</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3337971863_1.2.840.113654.2.70.1.337982194343327746313656933304494759333_1_1</th>\n",
       "      <td>0.031560</td>\n",
       "      <td>0.059142</td>\n",
       "      <td>0.006882</td>\n",
       "      <td>0.157488</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    score_gbm  score_xgb  \\\n",
       "id                                                                         \n",
       "2454166001_1.2.840.113654.2.70.1.26994792635520...   0.009005   0.020207   \n",
       "2454166001_1.2.840.113654.2.70.1.26994792635520...   0.013337   0.016762   \n",
       "2454166001_1.2.840.113654.2.70.1.26994792635520...   0.013337   0.016762   \n",
       "2454166001_1.2.840.113654.2.70.1.26994792635520...   0.013337   0.016762   \n",
       "3337971863_1.2.840.113654.2.70.1.33798219434332...   0.031560   0.059142   \n",
       "\n",
       "                                                    score_rpart  score_xgbt  \\\n",
       "id                                                                            \n",
       "2454166001_1.2.840.113654.2.70.1.26994792635520...     0.006882    0.059474   \n",
       "2454166001_1.2.840.113654.2.70.1.26994792635520...     0.006882    0.059660   \n",
       "2454166001_1.2.840.113654.2.70.1.26994792635520...     0.006882    0.061051   \n",
       "2454166001_1.2.840.113654.2.70.1.26994792635520...     0.006882    0.061051   \n",
       "3337971863_1.2.840.113654.2.70.1.33798219434332...     0.006882    0.157488   \n",
       "\n",
       "                                                   ViewModifierCodeMeaning  \n",
       "id                                                                          \n",
       "2454166001_1.2.840.113654.2.70.1.26994792635520...                     NaN  \n",
       "2454166001_1.2.840.113654.2.70.1.26994792635520...                     NaN  \n",
       "2454166001_1.2.840.113654.2.70.1.26994792635520...                     NaN  \n",
       "2454166001_1.2.840.113654.2.70.1.26994792635520...                     NaN  \n",
       "3337971863_1.2.840.113654.2.70.1.33798219434332...                     NaN  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "infile = f\"{tabledir}/spotmag_predictions/all_predictions_allmodels_trained_on_train.tab\"\n",
    "dfpred = pd.read_table(infile, index_col=0)\n",
    "dfpred.columns = [cc.replace(\"predictions\", \"score\") for cc in dfpred.columns]\n",
    "dfpred.index.name = 'id'\n",
    "print(dfpred.shape)\n",
    "dfpred[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(772367, 8)\n"
     ]
    }
   ],
   "source": [
    "if 'set' not in dfpred.columns:\n",
    "    dfpred = dfpred.merge(dflab,  left_index=True, right_index=True, how='left')\n",
    "    print(dfpred.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "normal          3526\n",
       "magn/spot        572\n",
       "wire loc          57\n",
       "stereotactic      25\n",
       "other              9\n",
       "Name: view, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "colmap = {\"N\":\"normal\", \"M\": \"magn/spot\",\n",
    "          \"T\":\"stereotactic\", \"W\":\"wire loc\", \"X\":\"other\"}\n",
    "view_counts = dfpred[~dfpred.view.isnull()].view.map(lambda x: colmap[x]).value_counts()\n",
    "view_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>set</th>\n",
       "      <th>train</th>\n",
       "      <th>test</th>\n",
       "      <th>val</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>view</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>magn/spot</th>\n",
       "      <td>380</td>\n",
       "      <td>96</td>\n",
       "      <td>96</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>normal</th>\n",
       "      <td>2310</td>\n",
       "      <td>612</td>\n",
       "      <td>604</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>other</th>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>stereotactic</th>\n",
       "      <td>17</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>wire loc</th>\n",
       "      <td>37</td>\n",
       "      <td>11</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "set           train  test  val\n",
       "view                          \n",
       "magn/spot       380    96   96\n",
       "normal         2310   612  604\n",
       "other             4     3    2\n",
       "stereotactic     17     4    4\n",
       "wire loc         37    11    9"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.crosstab(dfpred[~dfpred.view.isnull()].view.map(lambda x: colmap[x]), dfpred.set)[[\"train\", \"test\", \"val\"]]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read image-based predictions (general)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "../tables//spotmag_predictions/predictions_images_4189-epoch55-e5ce2d69b035975cb5336cec0da9a32a.csv\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Index(['score_image', 'score_image_max'], dtype='object')"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tag = \"e5ce2d69b035975cb5336cec0da9a32a\"\n",
    "epoch = 55\n",
    "infile = f\"{tabledir}/spotmag_predictions/predictions_images_4189-epoch{epoch}-{tag}.csv\"\n",
    "# infile = f\"{tabledir}/spotmag_predictions/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl-spotmag_img_prediction-{tag}.csv\"\n",
    "print(infile)\n",
    "dfpred_img = pd.read_csv(infile, index_col=0)\n",
    "dfpred_img = dfpred_img[['score_image', 'score_image_max']]\n",
    "dfpred_img = dfpred_img.groupby(level=0).mean()\n",
    "dfpred_img.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read image-based predictions (wire localization)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "infile = f\"{tabledir}/spotmag_predictions/predictions_wire_combined_e8e71fc090141d7c6fb334359152d295.csv\"\n",
    "\n",
    "dfpred_imgwire = pd.read_csv(infile, index_col=0)\n",
    "dfpred_imgwire[\"score_wire_max\"] = 1-dfpred_imgwire[[\"scores_0_or\",\"scores_0_fl\"]].min(1)\n",
    "dfpred_imgwire = dfpred_imgwire.drop([\"scores_0_or\",\"scores_0_fl\", \"label\"], axis=1)\n",
    "dfpred_imgwire.columns = [cc.replace(\"scores\", \"score_wire\") for cc in dfpred_imgwire.columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(772367, 13)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "if 'score_image' not in dfpred.columns:\n",
    "    dfpred = pd.concat([dfpred, dfpred_img], axis=1)\n",
    "    dfpred.index.name = 'id'\n",
    "    del dfpred_img\n",
    "    \n",
    "if 'score_glmnet' not in dfpred.columns:\n",
    "    dfpred = pd.concat([dfpred, dfpred_glmnet], axis=1)\n",
    "    dfpred.index.name = 'id'\n",
    "    del dfpred_glmnet\n",
    "    \n",
    "if 'score_wire' not in dfpred.columns:\n",
    "    dfpred = pd.concat([dfpred, dfpred_imgwire], axis=1)\n",
    "    dfpred.index.name = 'id'\n",
    "    del dfpred_imgwire\n",
    "\n",
    "dfpred.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "if 'label' not in dfpred.columns:\n",
    "    dfpred = pd.concat([dfpred, dflab], axis=1)\n",
    "if 'digital' not in dfpred.columns:\n",
    "    dfpred = pd.concat([dfpred, df_bt], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>score_image</th>\n",
       "      <th>False</th>\n",
       "      <th>True</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>score_wire</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>False</th>\n",
       "      <td>3584</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>True</th>\n",
       "      <td>605</td>\n",
       "      <td>768234</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "score_image  False   True \n",
       "score_wire                \n",
       "False         3584       0\n",
       "True           605  768234"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.crosstab(dfpred[\"score_wire\"].isnull(), dfpred[\"score_image\"].isnull())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "dfpred.rename(columns={\"score_xgbt\":\"score_gbmt\"}, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Add ensembled (max, avg) scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "dfpred['score_wire'] = dfpred['score_wire'].fillna(0)\n",
    "dfpred['score_wire_max'] = dfpred['score_wire_max'].fillna(0)\n",
    "dfpred['score_image+glmnet'] = (dfpred['score_image'] + dfpred['score_glmnet'])/2\n",
    "dfpred['score_image+gbmt'] = (dfpred['score_image'] + dfpred['score_gbmt'])/2\n",
    "\n",
    "dfpred['score_max(image;gbmt)'] = dfpred[['score_image','score_gbmt']].max(1)\n",
    "\n",
    "dfpred['score_image*glmnet'] = np.sqrt(dfpred['score_image'] * dfpred['score_glmnet'])\n",
    "dfpred['score_image*gbmt'] = np.sqrt(dfpred['score_image'] * dfpred['score_gbmt'])\n",
    "dfpred['score_max_image_wire'] = np.nanmax(dfpred[['score_image','score_wire']].values, axis=1)\n",
    "dfpred['score_max_image_wire_max'] = np.nanmax(dfpred[['score_image','score_wire_max']].values, axis=1)\n",
    "# dfpred['score_wire'].isnull()\n",
    "dfpred['score_max_image_wire+gbmt'] =(dfpred['score_max_image_wire'] + dfpred['score_gbmt'])/2\n",
    "\n",
    "dfpred['score_max_image_wire_max+gbmt'] =(dfpred['score_max_image_wire_max'] + dfpred['score_gbmt'])/2\n",
    "\n",
    "dfpred['score_max(image;wire_max;gbmt)'] = dfpred[['score_wire_max','score_gbmt', 'score_image']].max(1)\n",
    "\n",
    "dfpred['score_max_wire_image+gbmt'] = np.nanmax(dfpred[['score_image+gbmt','score_wire']].values, axis=1)\n",
    "\n",
    "dfpred['score_max_wire_max_image+gbmt'] = np.nanmax(dfpred[['score_image+gbmt','score_wire_max']].values, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "dfpred.rename(columns={\"ViewModifierCodeMeaning\":\"ViewModifier\"}, inplace=True)\n",
    "dfpred.index.name = 'id'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save the combined table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "772423"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dfpred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "dfpred.to_csv(f'{tabledir}/all_predictions_with_images-{tag}.tab', sep='\\t')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
@@ -0,0 +1,98 @@
 # coding: utf-8
 import numpy as np
 import pandas as pd
 import dicom
 from warnings import warn
 def get_tuples(plan, outlist = None, key = ""):
     """Flatten a DICOM dataset into a list of ``(field_name, value)`` tuples.

     Every attribute listed by ``plan.dir()`` is visited except ``PixelData``.
     Sequence attributes are expanded recursively, one call per item, and the
     item position plus the sequence name are folded into the key prefix.

     Parameters
     ----------
     plan : object exposing ``dir()`` and attribute access (a ``dicom`` dataset)
     outlist : list or None
         Accumulator the tuples are appended to. A new list is created only
         when ``None`` is given, so a caller-supplied list (even an empty one)
         is filled in place and returned.
     key : str
         Prefix for the generated field names; an underscore is appended when
         it is non-empty.

     Returns
     -------
     list of ``(str, value)`` tuples (the same object as ``outlist`` if given).
     """
     prefix = key + "_" if len(key) > 0 else key
     # `is None` rather than truthiness: an empty list passed by the caller
     # must be used as the accumulator, not silently replaced.
     if outlist is None:
         outlist = []
     for aa in plan.dir():
         if not hasattr(plan, aa) or aa == 'PixelData':
             continue
         value = getattr(plan, aa)
         if type(value) is dicom.sequence.Sequence:
             for nn, ss in enumerate(list(value)):
                 # NOTE: `prefix` already ends with "_", so nested keys carry a
                 # double underscore ("parent__0_Seq"). Kept as is because the
                 # generated names are matched against a stored field list.
                 if len(prefix):
                     newkey = "_".join([prefix, ("%d" % nn), aa])
                 else:
                     newkey = "_".join([("%d" % nn), aa])
                 outlist.extend(get_tuples(ss, outlist = None, key = newkey))
         else:
             # Convert dicom value wrappers into plain Python values.
             if type(value) is dicom.valuerep.DSfloat:
                 value = float(value)
             elif type(value) is dicom.valuerep.IS:
                 value = str(value)
             elif type(value) is dicom.valuerep.MultiValue:
                 value = tuple(value)
             elif type(value) is dicom.UID.UID:
                 value = str(value)
             outlist.append((prefix + aa, value))
     return outlist
 def filter_row_common_field(row, common_fields):
     """Drop, in place, every key of ``row`` that is not in ``common_fields``.

     The keys are snapshotted first so the mapping can be modified while it is
     being filtered. The same (mutated) ``row`` object is returned.
     """
     unwanted = [name for name in list(row.keys()) if name not in common_fields]
     for name in unwanted:
         row.pop(name)
     return row
 """
 fn_allheaders = '/home/dlituiev/data_dlituiev/manuallabeller/filelist/dicom_headers_all_fields_filelist_nonscreening_4000_seed42.csv'
 df_allheaders = pd.read_csv(fn_allheaders, index_col=0)
 "at least 5% of rows are there"
 thr = 0.05
 valid_fields = (~df_allheaders.isnull()).mean() > thr
 valid_fields = valid_fields[valid_fields].index.tolist()
 print(len(valid_fields))
 """
 # Names of the DICOM header fields to export, read from a headerless table;
 # they become the output columns after 'filename'.
 # NOTE(review): `squeeze=` is not accepted by newer pandas readers -- confirm
 # the pandas version this script is pinned to.
 valid_fields = pd.read_table("/data/dlituiev/learn_spotmag_from_dicom_headers/LogisticRegression_common_fields_names.tab", 
                             header=None,
                            squeeze=True).values
 #filelist_fn = '/home/dlituiev/data_dlituiev/tables/df_newest_mammos.pickle'
 filelist_fn = "/home/dlituiev/data_dlituiev/tables/2017-06-mammo_tables/df_original_mammos.pickle"
 # Unique values of the "Filename" column of the pickled table: the DICOM files to read.
 filelist = pd.read_pickle(filelist_fn, )["Filename"].unique().tolist()
 # Bare expression: only displays a value in an interactive session, no effect in a script.
 len(filelist)
 # Rows are buffered in memory and written out once per BUFFER_N_LINES input files.
 BUFFER_N_LINES = 100
 SEP = '\t'
 # Output is written next to the input pickle, with a different suffix.
 outpath = filelist_fn.replace('.pickle','') + '_dicom_headers_selected.tab'
 final_columns = ['filename'] + list(valid_fields)
 print("len(final_columns)", len(final_columns) )
 print('saving to %s' % outpath)
 with open(outpath, 'w+') as outfh:
    # The header line is written once by hand; every later to_csv call omits the header.
    outfh.write(SEP.join(final_columns) + '\n')
    headerlist = []
    for nn, ff in enumerate(filelist):
        # At every BUFFER_N_LINES-th file, write the rows collected so far
        # (before the current file is processed) and start a new buffer.
        if nn% BUFFER_N_LINES == (BUFFER_N_LINES-1):
            df_hl = pd.DataFrame( headerlist, columns=final_columns)
            df_hl.to_csv(outfh, sep=SEP, header=None, index=None, mode = 'a')
            outfh.flush()
            del df_hl
            print(nn+1)
            headerlist = []
        try:
            plan = dicom.read_file(ff)
            row = get_tuples(plan)
            row = dict(row)
            # One output row: the file name followed by each requested field,
            # with NaN where the header does not contain that field.
            row = tuple([ff] + [(row[kk] if (kk in row) else np.nan) for kk in valid_fields ])
            # Printed for every successfully parsed file.
            print("len(row)", len(row))
            headerlist.append(row)
        except Exception as ex:
 #             raise ex
            # Best effort: a file that cannot be read or flattened is reported
            # with a warning and gets no row in the output.
            warn('header extraction failed on #\t%s\t%s\t%s' % (nn, ff, ex))
    # in the end, print the rest:
    df_hl = pd.DataFrame( headerlist, columns=final_columns)
    df_hl.to_csv(outfh, sep=SEP, header=None, index=None, mode = 'a')
    outfh.flush()
 print("DONE")
@@ -0,0 +1,798 @@
 # coding: utf-8
 import numpy as np
 import pandas as pd
 import os
 from functools import partial
 from itertools import chain
 def entropy(x):
     """Return ``sum(c * log2(c))`` over the value counts ``c`` of series ``x``.

     Despite the name this is not Shannon entropy: raw counts (not
     probabilities) are used and the sign is not flipped. Missing values are
     left out, because ``value_counts`` skips them by default.
     """
     counts = x.value_counts()
     log_counts = counts.map(np.log2)
     return counts.mul(log_counts).sum()
 def select_text_fields(df_allheaders):
     """Return the names of object-dtype columns worth keeping as text features.

     A column is kept when it has object dtype, is non-null in more than 5 %
     of the rows, and is not rejected by any of the following:

     * ``entropy(column) < 1000`` (see ``entropy`` for what that score is),
     * exactly one unique value,
     * more than ``0.75 * df_allheaders.shape[1]`` unique values.

     The three rejection tests are combined with a logical OR. The previous
     expression ``entr<1000 | (a) | (b)`` was parsed by Python as
     ``entr < (1000 | a | b)``, so the two uniqueness tests were never applied
     on their own.

     NOTE(review): ``shape[1]`` is the number of columns; the number of rows
     (``shape[0]``) may have been intended -- confirm before changing.

     Parameters
     ----------
     df_allheaders : pandas.DataFrame

     Returns
     -------
     list of column names, in column order.
     """
     object_dtype = np.dtype(object)
     is_text = df_allheaders.dtypes.map(lambda x: x == object_dtype)
     candidates = is_text[is_text].index.tolist()

     populated = (~df_allheaders[candidates].isnull()).mean() > 0.05
     candidates = populated[populated].index.tolist()

     max_unique = 0.75 * df_allheaders.shape[1]
     text_fields = []
     for tt in candidates:
         numunique = len(df_allheaders[tt].unique())
         entr = entropy(df_allheaders[tt])
         if (entr < 1000) or (numunique == 1) or (numunique > max_unique):
             continue
         text_fields.append(tt)
     return text_fields
 def get_good_numeric_fields(df_allheaders, thr_stderr = 1e-6):
     """Return numeric columns whose ``std / mean`` exceeds ``thr_stderr``.

     Only numeric columns are considered (``numeric_only=True``); without it,
     current pandas raises on frames that also contain text columns. Columns
     whose ratio is NaN (for example a zero mean) or not above the threshold
     are left out.

     Parameters
     ----------
     df_allheaders : pandas.DataFrame
     thr_stderr : float
         Lower bound (exclusive) on ``std / mean``.

     Returns
     -------
     list of column names.
     """
     spread = df_allheaders.std(numeric_only=True)
     centre = df_allheaders.mean(numeric_only=True)
     relative_spread = spread / centre
     return relative_spread[relative_spread > thr_stderr].index.tolist()
 def get_index_from_int_tuple(x, ind):
     """Return element ``ind`` of a stringified tuple/list as an ``int``.

     ``x`` is expected to be the text form of a tuple or list, e.g.
     ``"('12', '7')"``. The selected element is converted with
     ``int(float(...))``. Anything that is not a string (NaN, numbers, ...)
     is returned unchanged.

     The text is parsed with ``ast.literal_eval`` instead of ``eval``: the
     values come from file headers, and only plain literals should ever be
     accepted from them. Text that is not a literal raises ``ValueError`` or
     ``SyntaxError``.
     """
     import ast

     if not isinstance(x, str):
         return x
     parsed = ast.literal_eval(x)
     return int(float(parsed[ind]))
 def clean_up_field_list(field_list, 
     prefices_remove = ["date", "accession", "number", 
         "Filename",
         "ImageLaterality",
         "GantryID",
         #"0_ViewCodeSequence_CodeMeaning",
         "ViewCodeSequence_CodeMeaning",
         "ViewModifierCodeSequence_CodeValue",
         "EthnicGroup",
         "BodyPartExamined",
         "LossyImageCompression",
         "DeidentificationMethodCodeSequence",
         "UID",
         'EntranceDoseInmGy',
         'ProcedureCodeSequence_CodeMeaning',
         'CommentsOnRadiationDose',
         'DetectorID',
         'SeriesDescription', # potentially informative but too many values
         'SoftwareVersions',
         'PatientAge',
         ],
     fields_remove = [ 'PatientID', 'PatientName', "BitsStored",
         'AcquisitionTime', 
         'AdmittingTime', 
         'ScheduledStudyStartTime',
         'InstanceCreationTime',
         'PerformedProcedureStepStartTime',
         'PregnancyStatus',
         'StudyArrivalTime',
         'StudyCompletionTime',
         'StudyTime',
         'TimeOfLastCalibration',
         'TimeOfLastDetectorCalibration',
         'TimeOfSecondaryCapture',]):
     """Remove unwanted field names from ``field_list`` (in place).

     A field is removed when it is listed in ``fields_remove`` or when its
     lower-cased name contains any entry of ``prefices_remove`` (compared
     case-insensitively; despite the parameter name this is a substring
     match, not a prefix match).

     Parameters
     ----------
     field_list : list of str
         Modified in place and returned.
     prefices_remove : list of str
         Substrings that mark a field for removal.
     fields_remove : list of str
         Exact field names to remove. Neither this list nor the default is
         modified: the function works on a private copy.

     Returns
     -------
     The same ``field_list`` object with the unwanted names removed. For each
     name to remove that is not present in ``field_list`` the ``ValueError``
     is printed, not raised.
     """
     # Work on a copy: the default list is created once at definition time, so
     # appending to it would leak matches from one call into the next (and
     # would also modify a list passed in by the caller).
     fields_remove = list(fields_remove)
     lowered_patterns = [x.lower() for x in prefices_remove]
     for ff in field_list:
         ff_lower = ff.lower()
         if ff in fields_remove:
             continue
         if any(pp in ff_lower for pp in lowered_patterns):
             fields_remove.append(ff)
     for ff in fields_remove:
         try:
             field_list.remove(ff)
         except ValueError as ve:
             print(ff, ve)
     return field_list
 def make_lowercase_text_fields(df_allheaders):
     """Lower-case every object-dtype column except the first one, in place.

     The first column is skipped by position (``columns[1:]``). Columns of any
     other dtype are left untouched. The same frame is returned.
     """
     object_dtype = np.dtype(object)
     for cname in df_allheaders.columns[1:]:
         column = df_allheaders[cname]
         if column.dtype is not object_dtype:
             continue
         df_allheaders[cname] = column.str.lower()
     return df_allheaders
 def format_PixelSpacing(x):
     """Reduce a stringified spacing tuple to a single number.

     A plain ``float`` is returned unchanged. Anything else is treated as text
     such as ``"('0.07', '0.07')"``: parentheses at the ends, quotes and blanks
     are stripped, the comma-separated parts are converted to floats, and the
     smallest distinct value (first element of ``np.unique``) is returned.
     """
     if type(x) is float:
         return x
     cleaned = x.lstrip("(").rstrip(")")
     cleaned = cleaned.replace("'", "").replace(" ", "")
     spacings = tuple(float(part) for part in cleaned.split(","))
     return np.unique(spacings)[0]
 def parse_float(x):
     """Clean the text form of a value so it can later be cast to float.

     ``str(x)`` is taken, then single quotes and every letter ``b`` are
     removed and ``"None"`` is replaced by ``"nan"``. An empty result becomes
     ``np.nan``; otherwise the cleaned *string* is returned (no conversion to
     float happens here).
     """
     text = str(x)
     for old, new in (("'", ""), ("b", ""), ("None", "nan")):
         text = text.replace(old, new)
     return np.nan if text == "" else text
 def parse_float_tuples(x, to_int=False):
     """Extract every number found in ``str(x)`` as a tuple.

     Each character that is neither a digit nor ``'.'`` acts as a separator,
     so brackets, quotes, commas and blanks are all ignored. Note that minus
     signs and exponent markers are separators too: ``"-3"`` yields ``(3.0,)``.

     Parameters
     ----------
     x : any value; its ``str()`` form is parsed.
     to_int : bool
         When true each number is converted with ``int(float(...))``.

     Returns
     -------
     tuple of float (or int); empty when the text contains no digits.
     """
     cleaned = "".join(
         ch if (ch.isdigit() or ch == '.') else ';' for ch in str(x))
     pieces = [piece for piece in cleaned.split(';') if len(piece)]
     if to_int:
         return tuple(int(float(piece)) for piece in pieces)
     return tuple(float(piece) for piece in pieces)
 def parse_float_tuples_prod(x):
     """Return the product of all numbers found in ``x``, or NaN when missing.

     ``None``, a floating-point NaN and empty containers/strings give
     ``np.nan``. Everything else is converted to text, split into numbers by
     ``parse_float_tuples`` and multiplied with ``np.prod``.

     Missing values are detected explicitly. The previous membership test
     ``x not in (None, np.nan)`` only recognised the ``np.nan`` object itself,
     so any other NaN (or any plain float) reached ``len(x)`` and raised
     ``TypeError``.

     NOTE(review): text without any digit (e.g. ``"nan"``) yields ``1.0``,
     the product of an empty tuple; left unchanged.
     """
     if x is None:
         return np.nan
     if isinstance(x, (float, np.floating)) and np.isnan(x):
         return np.nan
     try:
         if len(x) == 0:
             return np.nan
     except TypeError:
         # Scalars such as plain numbers have no length; parse their text form.
         pass
     numbers = parse_float_tuples(str(x))
     return np.prod(numbers)
 def parse_int_tuples_median(x):
     """Return the median of all numbers that ``parse_float_tuples`` finds in ``x``.

     The numbers are parsed as floats (``to_int`` is not used, despite the name).
     """
     return np.median(parse_float_tuples(x))
 """
 def parse_float_tuples(x):
    x = eval(x) if type(x) is str else x
    if type(x) in [tuple, list]:
        x = tuple([float(y) for y in x])
    return x
 """
 def parse_str_tuples(x):
     """Turn the text form of a tuple/list back into a Python object.

     Non-string input is returned unchanged. A string is parsed as a Python
     literal; when that fails it is split on single blanks and returned as a
     tuple of words.

     ``ast.literal_eval`` replaces the former ``eval``. With ``eval`` a string
     that happened to name a builtin or a module-level object (``"len"``,
     ``"os"``) evaluated to that object, and arbitrary expressions from header
     text were executed. The former bare ``except`` is narrowed to the errors
     literal parsing can raise.
     """
     import ast

     if type(x) is not str:
         return x
     try:
         return ast.literal_eval(x)
     except (ValueError, SyntaxError, TypeError):
         return tuple(x.split(" "))
 #############################33
 def extract_list_text_field(df_allheaders, colprefix = "ViewModifierCodeSequence_CodeMeaning"):
     """Replace a family of text columns by one boolean column per distinct value.

     The family consists of every column whose name contains ``colprefix`` but
     is not exactly ``colprefix``. For each distinct non-null value found in
     those columns (``True``/``False`` excluded) a boolean column named
     ``colprefix + "_" + value`` (blanks removed from the value) is created; a
     row is ``True`` when the value occurs as a substring of a string cell in
     any column of the family.

     The family columns are dropped from ``df_allheaders`` in place, and a new
     frame with the boolean columns appended is returned. The order of the new
     columns follows set iteration order.
     """
     allcols = df_allheaders.columns
     in_family = np.asarray(
         allcols.map(lambda name: colprefix in name and name != colprefix),
         dtype=bool)
     cols = allcols[in_family]

     # Collect the distinct values that occur anywhere in the family.
     meanings = set()
     for cc in cols:
         meanings.update(df_allheaders[cc].dropna().unique())
     meanings -= {True, False}

     indicators = {}
     for meaning in meanings:
         present = pd.Series(False, index=df_allheaders.index)
         for cc in cols:
             present |= df_allheaders[cc].map(
                 lambda cell: meaning in cell if type(cell) is str else False)
         indicators[meaning] = present

     expanded = pd.DataFrame(indicators)
     expanded.columns = expanded.columns.map(
         lambda value: colprefix + "_" + value.replace(" ",""))

     for cc in cols:
         df_allheaders.drop(cc, axis=1, inplace=True)
     return pd.concat([df_allheaders, expanded], axis=1)
 #############################33
 def normalize_fields(df_allheaders):
     """Clean up selected DICOM header columns, each only if it is present.

     Steps, in order:

     * ``PatientAge``: strip a trailing ``y`` and convert to int.
     * ``DetectorActiveDimensions``: product of the parsed numbers.
     * ``PixelSpacing`` / ``ImagerPixelSpacing``: reduce to a single number.
     * ``ModalitiesInStudy``: boolean, whether ``"mg"`` occurs in the text.
     * ``HalfValueLayer``: float, after removing ``b`` and quote characters.
     * ``ViewPosition``: boolean, whether the value is ``'cc'`` or ``'mlo'``.
     * ``ViewModifierCodeSequence_CodeMeaning`` columns: expanded into one
       boolean column per value (``extract_list_text_field``).
     * ``BreastImplantPresent`` / ``PartialView``: boolean, ``"yes"`` in text.
     * ``WindowWidth`` / ``WindowCenter``: median of the parsed numbers.
     * ``PatientOrientation`` / ``DetectorElementPhysicalSize``: parsed tuples.
     * ``Grid``: text with brackets, quotes and commas removed and the
       ``parrallel`` misspelling corrected.
     * ``FieldOfViewOrigin``: split into ``_x`` and ``_y`` columns.
     * ``FocalSpots``: missing values filled with the most frequent value.
     * ``PixelSpacing``, ``EstimatedRadiographicMagnificationFactor``,
       ``XRayTubeCurrent``, ``DistanceSourceToPatient``: missing values filled
       with the column median.
     * ``ImageType``: expanded into one boolean ``ImageType_<keyword>`` column
       per keyword, then dropped.

     Most steps modify the passed frame in place; the returned frame is a new
     object (it comes out of a ``pd.concat``), so pass a copy if the input
     must stay untouched and always use the return value.

     Changes from the previous version:

     * the most frequent ``FocalSpots`` value is taken with ``idxmax()``;
       ``argmax()`` returns a position, not the value, in current pandas;
     * a missing empty keyword in ``ImageType`` no longer raises ``KeyError``;
     * ``ImageType_*`` columns are created in sorted keyword order, so the
       column order is reproducible;
     * two statements on ``Grid`` that could never have an effect were removed.
     """
     if "PatientAge" in df_allheaders.columns:
         df_allheaders["PatientAge"] = df_allheaders["PatientAge"].map(lambda x: int(x.lower().rstrip('y')))

     if "DetectorActiveDimensions" in df_allheaders.columns:
         df_allheaders["DetectorActiveDimensions"] = df_allheaders["DetectorActiveDimensions"].map(parse_float_tuples_prod)

     for kk in ["PixelSpacing", "ImagerPixelSpacing"]:
         if kk in df_allheaders.columns:
             df_allheaders[kk] = df_allheaders[kk].map(format_PixelSpacing)

     if "ModalitiesInStudy" in df_allheaders.columns:
         df_allheaders["ModalitiesInStudy"] = df_allheaders["ModalitiesInStudy"].map(lambda x: "mg" in str(x))

     if "HalfValueLayer" in df_allheaders.columns:
         df_allheaders["HalfValueLayer"] = df_allheaders["HalfValueLayer"].map(
             lambda x: x if type(x) is float else float(str(x).replace('b','').replace("'", '')))

     # FieldOfViewDimensions is deliberately left alone: per the original
     # author's note, computing the area and filling gaps with the mode
     # worsened the FNR.

     if "ViewPosition" in df_allheaders.columns:
         df_allheaders["ViewPosition"] = df_allheaders["ViewPosition"].map(lambda x: x in ['cc', 'mlo'])

     df_allheaders = extract_list_text_field(df_allheaders,
         colprefix = "ViewModifierCodeSequence_CodeMeaning")

     if "BreastImplantPresent" in df_allheaders.columns:
         df_allheaders["BreastImplantPresent"] = df_allheaders["BreastImplantPresent"].map(str).map(lambda x: "yes" in x)

     if "PartialView" in df_allheaders:
         df_allheaders["PartialView"] = df_allheaders["PartialView"].map(lambda x : "yes" in x if type(x) is str else False)

     for kk in ["WindowWidth", "WindowCenter"]:
         if kk in df_allheaders.columns:
             df_allheaders[kk] = df_allheaders[kk].map(parse_int_tuples_median)

     if "PatientOrientation" in df_allheaders.columns:
         df_allheaders["PatientOrientation"] = df_allheaders["PatientOrientation"].map(parse_str_tuples)

     if "DetectorElementPhysicalSize" in df_allheaders.columns:
         df_allheaders["DetectorElementPhysicalSize"] = df_allheaders["DetectorElementPhysicalSize"].map(parse_float_tuples)

     if "Grid" in df_allheaders.columns:
         # Missing values become the text "nan" here because of map(str).
         df_allheaders["Grid"] = (df_allheaders["Grid"]
                              .map(str)
                              .map(lambda x: x.replace('(','')
                                              .replace(')','')
                                              .replace("'","")
                                              .replace(',','')
                                              .replace("parrallel", "parallel")))

     if "FieldOfViewOrigin" in df_allheaders.columns:
         df_allheaders["FieldOfViewOrigin_x"] = df_allheaders["FieldOfViewOrigin"].map(lambda x : get_index_from_int_tuple(x, 0))
         df_allheaders["FieldOfViewOrigin_y"] = df_allheaders["FieldOfViewOrigin"].map(lambda x : get_index_from_int_tuple(x, 1))
         df_allheaders.drop("FieldOfViewOrigin", axis=1, inplace=True)

     if "FocalSpots" in df_allheaders.columns:
         # idxmax() returns the most frequent *value* (the index label of the
         # largest count); argmax() would return its integer position.
         most_frequent = df_allheaders["FocalSpots"].value_counts().idxmax()
         df_allheaders.loc[df_allheaders["FocalSpots"].isnull(), "FocalSpots"] = most_frequent

     for kk in ["PixelSpacing", "EstimatedRadiographicMagnificationFactor", "XRayTubeCurrent", "DistanceSourceToPatient"]:
         if kk in df_allheaders.columns:
             df_allheaders.loc[df_allheaders[kk].isnull(), kk] = df_allheaders[kk].median()

     if "ImageType" in df_allheaders.columns:
         keywords = set(chain(*(df_allheaders["ImageType"].map(lambda x: parse_str_tuples(x)).tolist())))
         # discard(): the empty keyword is not guaranteed to be present.
         keywords.discard("")
         for kk in sorted(keywords):
             key = "ImageType"+"_"+kk
             # Substring test against the raw (unparsed) cell text.
             df_allheaders[key] = df_allheaders["ImageType"].map(lambda x: kk in x)
         df_allheaders.drop("ImageType", axis=1, inplace=True)
     return df_allheaders
 def move_digits_back(allcolumns):
     """Move a leading numeric token of each column name to the end.

     A name whose first character is a digit is split on ``"_"`` and its first
     token is appended after the others (``"0_A_B"`` -> ``"A_B_0"``). Other
     names are kept as they are. A new list is returned.
     """
     digits = frozenset('0123456789')

     def _rotate(name):
         if name[0] not in digits:
             return name
         tokens = name.split("_")
         return "_".join(tokens[1:] + tokens[:1])

     return [_rotate(name) for name in allcolumns]
 def get_features(df_allheaders, thr_stderr = 1e-6):
     """Build a numeric feature table from a table of DICOM headers.

     Pipeline: normalise the fields on a copy of the input, pick the text and
     numeric columns to use, lower-case text, one-hot encode the non-numeric
     columns, fill remaining gaps with column medians and drop one-hot columns
     that are almost constant.

     Parameters
     ----------
     df_allheaders : pandas.DataFrame
         Raw header table; it is copied, not modified.
     thr_stderr : float
         Minimum ``std / mean`` for a numeric column to be used
         (see ``get_good_numeric_fields``).

     Returns
     -------
     pandas.DataFrame with one-hot (uint8) and numeric feature columns.

     Changes from the previous version:

     * ``thr_stderr <= 0`` no longer raises ``NameError``; it now means "no
       variance filter", i.e. every numeric column is a candidate.
       NOTE(review): confirm this is the intended meaning.
     * the branch without categorical columns referred to the undefined name
       ``non_categorical``;
     * dummies are requested as ``uint8`` explicitly, because current pandas
       returns booleans by default and the rare-column filter identifies
       one-hot columns by that dtype (the ``dtype`` argument needs
       pandas >= 0.23);
     * ``pd.np`` (removed from pandas) is replaced by ``np``;
     * the most frequent ``FocalSpots`` value is taken with ``idxmax()``
       instead of the positional ``argmax()``.
     """
     df_allheaders = normalize_fields(df_allheaders.copy())
     text_fields = select_text_fields(df_allheaders)

     if thr_stderr > 0:
         field_list = get_good_numeric_fields(df_allheaders, thr_stderr=thr_stderr)
     else:
         field_list = df_allheaders.select_dtypes(include=[np.number]).columns.tolist()
     field_list = list(set(clean_up_field_list(field_list + text_fields)))

     df_allheaders = make_lowercase_text_fields(df_allheaders)

     feature_columns = list(field_list)
     # Columns that are treated as numbers rather than one-hot encoded.
     known_noncategorical = {
         'ContentTime',
         'FieldOfViewOrigin_x',
         'FieldOfViewOrigin_y',
         'HalfValueLayer',
         'WindowWidth',
         'CompressionForce',
         'DetectorActiveDimensions',
         'RelativeXRayExposure',
         'ExposureTime',
         'Exposure',
         'BodyPartThickness',
         'CollimatorLowerHorizontalEdge',
         'WindowCenter',
         'FieldOfViewRotation',
         'KVP',
         'DistanceSourceToDetector',
         'DistanceSourceToEntrance',
         'CollimatorLeftVerticalEdge',
         'DetectorTemperature',
         'HighBit',
     }
     # Columns that are always one-hot encoded when present in the table.
     known_categorical = {
         'Manufacturer',
         'ManufacturerModelName',
         'Grid_htc',
         'ViewModifierCodeSequence_CodeMeaning',
     }
     noncategorical = list(set(feature_columns) & known_noncategorical)
     potentially_categorical = set(feature_columns) - set(noncategorical)
     potentially_categorical |= known_categorical & set(df_allheaders.columns)
     potentially_categorical = list(potentially_categorical)

     print("potentially_categorical", len(potentially_categorical))
     print("non_categorical", len(noncategorical))

     for cc in noncategorical:
         if str(df_allheaders[cc].dtype) == 'object':
             df_allheaders[cc] = df_allheaders[cc].map(parse_float).astype(float)

     if len(potentially_categorical)>0:
         df_allheaders[potentially_categorical] = df_allheaders[potentially_categorical].fillna('unknown')
         features_onehot = pd.get_dummies(df_allheaders[potentially_categorical],
                             drop_first=True, prefix_sep='=', dtype=np.uint8)
         features_onehot = pd.concat([features_onehot, df_allheaders[noncategorical]], axis=1)
     else:
         print("no features to binarise!")
         features_onehot = df_allheaders[noncategorical].copy()

     # Fill gaps with the column median; a column that still contains nulls
     # afterwards (median undefined) is dropped.
     for cc in features_onehot.columns[features_onehot.isnull().any()]:
         print("filling in with median:\t%s" % cc)
         features_onehot.loc[features_onehot[cc].isnull(),cc] = \
                 features_onehot[cc].median()
     features_onehot = features_onehot.loc[:,~features_onehot.isnull().any()]

     # One-hot columns are recognised by their uint8 dtype.
     uint8_dtype = np.dtype("uint8")
     is_onehot = np.asarray(features_onehot.dtypes.map(lambda x: x == uint8_dtype), dtype=bool)
     onehotcols = features_onehot.columns[is_onehot]

     # Drop one-hot columns that are set in fewer than 5 rows, or in less than
     # 1 % / more than 99 % of the rows.
     thr_frac = 0.01
     onehot_sum = features_onehot[onehotcols].sum(0)
     onehot_mean = features_onehot[onehotcols].mean(0)
     is_bad = np.asarray((onehot_sum < 5) |
                         (onehot_mean < thr_frac) |
                         (onehot_mean > (1-thr_frac)), dtype=bool)
     bad_feature_cols = onehotcols[is_bad]
     features_onehot.drop(bad_feature_cols, axis=1, inplace=True)

     if "FocalSpots" in features_onehot:
         missing = features_onehot["FocalSpots"].isnull()
         if missing.any():
             features_onehot.loc[missing, "FocalSpots"] = \
                     features_onehot["FocalSpots"].value_counts().idxmax()
     return features_onehot
 #############################
 if __name__ == '__main__':
    # Exploratory driver (exported from a notebook): one-hot encode the DICOM
    # header table with get_features(), join it with the manual labels, fit a
    # gradient-boosting classifier for "special_view" and plot its metrics.
    PREFIX="allfeatures"
    # !sudo pip3 install dicom
    # # read a table of DICOM headers
    filelist_fn = '/home/dlituiev/data_dlituiev/manuallabeller/filelist/filelist_nonscreening_4000_seed42.csv'
    outpath = os.path.join(os.path.dirname(filelist_fn), "dicom_headers_all_fields_" + os.path.basename(filelist_fn))
    print(outpath)
    df_allheaders = pd.read_csv(outpath, index_col=0)
    features_onehot = get_features(df_allheaders)
    # ## Read labels
    fn_man_labels = "/data/dlituiev/tables/cleaned_manual_labels_valset_4000.txt"
    df = pd.read_table(fn_man_labels, index_col=0)
    # keep only the last path component of every label index entry
    df.index = df.index.map(lambda x : x.split("/")[-1])
    # process labels: a view is "special" exactly when it is not "regular"
    df["special_view"] = df["regular_view"].map(lambda x: not x)
    # NOTE(review): left join on the index; labels without a matching feature
    # row end up with NaN features -- confirm both indices use the same names.
    dfm = pd.merge(df[["special_view"]], features_onehot, how='left', left_index=True, right_index=True)
    import seaborn as sns
    import matplotlib.pyplot as plt
    from statsmodels.graphics.mosaicplot import mosaic
    plt.matplotlib.rcParams["hatch.color"] = [0.7]*3
    dfm.plot(x='special_view', y='XRayTubeCurrent', kind='scatter', alpha=0.05)
    dfm.plot(x='special_view', y='DistanceSourceToPatient', kind='scatter', alpha=0.05)
    target = dfm["special_view"]
    features = dfm.drop("special_view", axis=1)
    from sklearn.utils import shuffle
    # for building and visualizing the decision tree
    from sklearn.naive_bayes import GaussianNB, BernoulliNB
    # from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
    # visualization
    from vis_tree import visualize_tree
    from sklearn.model_selection import train_test_split, cross_val_score
    from sklearn.metrics import (accuracy_score, auc, confusion_matrix, f1_score,
                                 precision_score, roc_curve, precision_recall_curve)
    # 1/6 of the rows become the validation split; the remaining development
    # rows are split again, 1/5 of them forming the test split.
    y_dev,  y_val, X_dev, X_val = train_test_split(target, features, random_state=0, test_size=1/6)
    y_tr,  y_ts, X_tr, X_ts = train_test_split(y_dev, X_dev, random_state=0, test_size=1/5)
    # dtree = DecisionTreeClassifier(max_depth=10, min_samples_leaf=5, criterion="entropy")
    # dtree = RandomForestClassifier(min_samples_split=10, min_samples_leaf=5)
    # dtree = AdaBoostClassifier(base_estimator=dtree, n_estimators=60, learning_rate=0.01)
    # dtree = AdaBoostClassifier(base_estimator=GaussianNB(), n_estimators=50, learning_rate=0.01)
    dtree = GradientBoostingClassifier(max_depth=8, n_estimators=40, learning_rate=0.05, min_samples_leaf=12)
    # bare class name of the estimator, used in output file names
    modelname = type(dtree).__name__
    dtree.fit(X_tr, y_tr)
    pred_y_ts = dtree.predict(X_ts)
    pred_yscore_ts = dtree.predict_proba(X_ts)
    frmt = 'png'

    # --- precision / recall curve on the test split (own figure, own file) ---
    pr_, rec_, thresholds = precision_recall_curve(y_ts.tolist(), pred_yscore_ts[:,1], pos_label=1)
    # auc_pr = auc(pr_, rec_)
    plt.figure()
    plt.plot(pr_, rec_)
    plt.xlabel('Precision')
    plt.ylabel('Recall')
    # plt.title('auPRC = {0:.2f}%'.format(auc_pr))
    plt.xlim([0,1])
    plt.ylim([0,1])
    plt.axis('equal')
    plt.axis('square')
    # saved under its own suffix so the ROC figure below does not overwrite it
    plt.savefig("{}_{}_prc.{}".format(PREFIX, modelname, frmt), dpi=300, format=frmt)

    # --- ROC curve on the test split ---
    fpr_, tpr_, thresholds = roc_curve(y_ts.tolist(), pred_yscore_ts[:,1], pos_label=1)
    auc_ = auc(fpr_, tpr_)
    plt.figure()
    plt.plot(fpr_, tpr_)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # auc_ is a fraction; scale it to match the percent sign in the title
    plt.title('AUC = {0:.2f}%'.format(100*auc_))
    plt.axis('equal')
    plt.axis('square')
    print("%.2f" % (100*auc_))
    plt.savefig("{}_{}_auc.{}".format(PREFIX, modelname, frmt), dpi=300, format=frmt)
    # pd.DataFrame(dict(FPR=fpr_, TPR=tpr_, threshold=thresholds))

    features.plot(x="EstimatedRadiographicMagnificationFactor", y="PixelSpacing", kind='scatter')

    # --- feature importances, largest first, zero-importance features dropped ---
    fig,ax = plt.subplots(1, figsize=(6,14))
    feat_imp = pd.Series(dtree.feature_importances_, index=features.columns)
    feat_imp = feat_imp[feat_imp>0.0].sort_values()[::-1]
    feat_imp[::-1].plot(kind='barh', ax=ax)
    print(feat_imp)
    # plt.xlim([0,0.5])
    # plt.tight_layout()
    plt.savefig("{}_{}_feature_importances.{}".format(PREFIX, modelname, frmt), dpi=300, format=frmt)

    # --- confusion tables for the default decision rule on the test split ---
    # .values replaces Series.as_matrix(), which pandas 1.0 removed
    df_confusion = pd.crosstab(pd.Series(y_ts.values, name="observed"), pd.Series(pred_y_ts, name="predicted"))
    cm = confusion_matrix(y_ts, pred_y_ts)
    def fnr(dtree, X_val, y_val, thr = None):
        """False-negative rate FN / (FN + TP) of a binary classifier.

        The (estimator, X, y) argument order lets the function be passed as a
        scoring callable.

        dtree -- fitted estimator exposing predict() and predict_proba()
        X_val -- feature matrix handed to the estimator
        y_val -- observed binary labels (truthy = positive class)
        thr   -- score threshold applied to predict_proba()[:, 1]; None uses
                 the estimator's own predict(). A threshold of 0.0 is honoured.

        Returns 0.0 when y_val contains no positive samples, mirroring fpr().
        """
        import numpy as np
        if thr is None:
            pred_y_val = dtree.predict(X_val)
        else:
            pred_y_val = dtree.predict_proba(X_val)[:,1] > thr
        observed = np.asarray(y_val).astype(bool)
        predicted = np.asarray(pred_y_val).astype(bool)
        # Count directly instead of indexing a confusion matrix: the matrix
        # collapses to 1x1 when only one class occurs and indexing it fails.
        n_positive = observed.sum()
        if n_positive == 0:
            return 0.0
        return (observed & ~predicted).sum() / n_positive
    def fpr(dtree, X_val, y_val, thr = None):
        """False-positive rate FP / (FP + TN) of a binary classifier.

        The (estimator, X, y) argument order lets the function be passed as a
        scoring callable.

        dtree -- fitted estimator exposing predict() and predict_proba()
        X_val -- feature matrix handed to the estimator
        y_val -- observed binary labels (truthy = positive class)
        thr   -- score threshold applied to predict_proba()[:, 1]; None uses
                 the estimator's own predict(). A threshold of 0.0 is honoured.

        Returns 0.0 when y_val contains no negative samples.
        """
        import numpy as np
        if thr is None:
            pred_y_val = dtree.predict(X_val)
        else:
            pred_y_val = dtree.predict_proba(X_val)[:,1] > thr
        negatives = ~np.asarray(y_val).astype(bool)
        predicted = np.asarray(pred_y_val).astype(bool)
        # Count directly instead of indexing a confusion matrix: the matrix
        # collapses to 1x1 when only one class occurs and indexing it fails.
        n_negative = negatives.sum()
        if n_negative == 0:
            return 0.0
        return (negatives & predicted).sum() / n_negative
    # Confusion table on the X_ts / y_ts split at a fixed score threshold.
    THR = 0.15
    #          True | False
    #     True   TP |  FN
    #     False  FP |  TN
    #
    #
    #     FPR = FP / (FP + TN)
    #
    pred_y_ts = dtree.predict_proba(X_ts)[:,1] > THR
    # .values replaces Series.as_matrix(), which pandas 1.0 removed
    df_confusion = pd.crosstab(pd.Series(y_ts.values, name="observed"), pd.Series(pred_y_ts, name="predicted"))
    print(df_confusion.to_csv(sep='|'))
    # local import so the reports do not rely on a file-level import of partial
    from functools import partial
    # bare class name of the estimator
    modelname = type(dtree).__name__
    # One report per threshold: FNR / FPR on the X_ts / y_ts split next to the
    # mean over a 5-fold cross-validation on the development split.
    for THR in (0.05, 0.5):
        cv_fnr = cross_val_score(dtree, X_dev, y_dev, groups=None, scoring=partial(fnr, thr=THR), cv=5, n_jobs=1, pre_dispatch='2*n_jobs')
        cv_fpr = cross_val_score(dtree, X_dev, y_dev, groups=None, scoring=partial(fpr, thr=THR), cv=5, n_jobs=1, pre_dispatch='2*n_jobs')
        tmpstr = """model: {}
    threshold = {}
    + on the hold-out set:\tFNR = {:.2f}%, FPR = {:.2f}%
    + in 5-fold cross-validation (mean):\tFNR = {:.2f}%, FPR = {:.2f}%""".format(
            modelname, THR,
            100*fnr(dtree, X_ts, y_ts, thr = THR), 100*fpr(dtree, X_ts, y_ts, thr = THR),
            100*cv_fnr.mean(), 100*cv_fpr.mean())
        print(tmpstr)
    # Leftover notebook cells: bare expressions below were inspected
    # interactively and have no effect when the file runs as a script.
    6/72
    # ## fnr
    # 0.1443 -- AdaBoostClassifier(50, lr=0.1) with:
    # 
    # 
    #     DecisionTreeClassifier(max_depth=7, min_samples_leaf=5, criterion="entropy")
    #     GaussianNB()
    # 
    # 0.1134 -- AdaBoostClassifier(50, lr=0.01) with:
    #     GaussianNB()
    # NOTE(review): pred_y_val and pred_yscore_dev are not assigned anywhere in
    # the visible part of this script; presumably they were dtree.predict(X_val)
    # and dtree.predict_proba(X_val) in a deleted cell -- confirm before running.
    accuracy_score(y_true=y_val, y_pred=pred_y_val)
    f1_score(y_true=y_val, y_pred=pred_y_val)
    confusion_matrix(y_true=y_val, y_pred=pred_y_val)
    # NOTE(review): Series.as_matrix() no longer exists in pandas >= 1.0.
    # Rows of the table are observed labels, columns are thresholded predictions.
    df_confusion = pd.crosstab(pd.Series(y_val.as_matrix(), name="observed"), 
                               pd.Series(pred_yscore_dev[:,1]>0.15, name="predicted"))
    df_confusion
    # FN / (FN + TP): column = predicted, row = observed
    df_confusion[False][True] / (df_confusion[False][True] + df_confusion[True][True])
    # FP / (TN + FP)
    df_confusion[True][False] / (df_confusion[False][False] + df_confusion[True][False])
    109/(385+109)
    # ## Misclassified: examples and comments
    # pred_false = (pd.Series(pred_y_val, name="predicted")==False)
    # NOTE(review): (score < 0.15) == False keeps rows with score >= 0.15, i.e.
    # rows predicted positive; combined with the observed positives below this
    # selects true positives rather than false negatives -- confirm the intent.
    pred_false = (pd.Series(pred_yscore_dev[:,1]<0.15, name="predicted")==False)
    false_negatives = (pd.Series(y_val.as_matrix(), name="observed")) & pred_false
    # re-attach the label index so the mask can be used to index X_val
    false_negatives.index=y_val.index
    false_negatives.shape, df.shape
    # y_val[false_negatives.tolist()].shape 
    # Hand-written review notes, one "<png file name> -- <comment>" per line.
    xstr = """1805162996_1.2.840.113654.2.70.1.75424722723272471565664976911416714890_2_37.png -- implant?
    1433463766_1.2.840.113654.2.70.1.243422935316700791950696878743366703411_6_6.png -- male?
    3395322213_1.2.840.113654.2.70.1.161905211577383187509354224390811944382_1161_7.png -- overexposed with scale grid
    1383662805_1.2.840.113654.2.70.1.194667288082835549565211946781626641146_1_88.png -- mag? bars in the image
    5717508670_1.2.840.113654.2.70.1.135196805563780165444562848954663016070_2_6.png -- spot
    1582554801_1.2.840.113654.2.70.1.202883517655342643705007475928329105895_1_1.png -- strange shape; plate
    3248534628_1.2.840.113654.2.70.1.153327658320065917717726871735320153117_14_8.png -- RLMID, implant
    1050998385_1.2.840.113654.2.70.1.294672228525412928579179278566440354700_168_12.png -- RMLO, underexposed, plate
    2431514667_1.2.840.113654.2.70.1.132697486450403983700631264913146412468_1_1.png -- regular CC
    2836025574_1.2.840.113654.2.70.1.94728406891527814842052605970255602447_31728_4.png  -- regular CC, wire?
    2774547752_1.2.840.113654.2.70.1.152335331945150793610356395498084601027_47428_6.png  -- poor exposure?
    6784971236_1.2.840.113654.2.70.1.276140387730485551768768734852859745761_21705_2.png  -- regular CC
    6120027884_1.2.840.113654.2.70.1.202389441802705593488291262945242015864_28128_3.png -- spot
    2127109953_1.2.840.113654.2.70.1.136443797025605972119376095795980286524_5_26.png  -- RML, scar
    5015120217_1.2.840.113654.2.70.1.8576402180164318136049174781190805706_19615_3.png -- regular MLO, underexposure
    2915273528_1.2.840.113654.2.70.1.50904067248781976561131370015339684052_3_51.png -- RLM
    2859796079_1.2.840.113654.2.70.1.248757700026158935826319533755178408586_3_51.png -- LMLO, scar""".split("\n")
    # Split every note into (Filename, comment), strip surrounding whitespace
    # and keep the comments as a Series indexed by file name.
    df_misclassified_comments = pd.DataFrame([x.split(" -- ") for x in xstr], columns=["Filename", "comment"]).applymap(lambda x: x.rstrip().lstrip()).set_index("Filename")["comment"]
    df_misclassified_comments
    # NOTE(review): the two identical lines below index the comments with a
    # mask built from X_val columns 'ViewPosition' / 'ViewModifierCodeSequence';
    # whether those columns exist after one-hot encoding is not visible here.
    df_misclassified_comments[false_negatives & X_val[false_negatives]['ViewPosition'] & ~X_val[false_negatives]['ViewModifierCodeSequence'] ]
    df_misclassified_comments[false_negatives & X_val[false_negatives]['ViewPosition'] & ~X_val[false_negatives]['ViewModifierCodeSequence'] ]
    X_val.columns
    # X_val[false_negatives][['ViewPosition_ccid', 'ViewPosition_lm', 'ViewPosition_lmid',
    #        'ViewPosition_ml', 'ViewPosition_mlo', 'ViewPosition_mloid',
    #        'ViewPosition_xccl', "FieldOfViewDimensions_('145', '105')"]]
    X_val[false_negatives][['ViewPosition', 
                           'ViewModifierCodeSequence']]
@@ -0,0 +1,97 @@
 # coding: utf-8
 #cell#
 import pandas as pd
 import numpy as np
 import sys
 from header_cleaner import get_features, normalize_fields, parse_float_tuples, parse_float
 # Normalize the selected DICOM header fields: every column listed in
 # normalize_fun is mapped through its cleaning function, cast to the dtype
 # listed in dtypes, and the resulting table is written to outfn.
 #cell#
 fn_features = "../tables/df_all_mammos_dicom_headers_selected.tab.gz"
 # NOTE(review): the table is written gzip-compressed although the name has no
 # ".gz" suffix -- confirm that downstream readers expect this.
 outfn = "../tables/df_all_mammos_dicom_headers_selected_norm.tab"
 dffeatures = pd.read_table(fn_features, index_col="filename")
 #cell#
 # ContentTime entries that are neither float nor int are converted by
 # dropping ':' and substituting "30" for '--' before the float conversion.
 mask_nonnumeric = ~dffeatures["ContentTime"].map(lambda x: isinstance(x, (float, int)))
 dffeatures.loc[mask_nonnumeric, "ContentTime"] = dffeatures["ContentTime"][mask_nonnumeric].map(lambda x: float(x.replace(':','').replace('--',"30")))
 #cell#
 print("shape", dffeatures.shape)
 #cell#
 # column name -> function applied to every value of that column
 normalize_fun = {"0_ViewCodeSequence__0_ViewModifierCodeSequence_CodeMeaning":
                lambda x: str(x).lower(),
                "0_ViewCodeSequence_CodeValue": lambda x: str(x),
                "Grid": lambda x: str(x).replace("'","")
                                       .replace("(","").replace(")","")
                                       .replace(",","").replace("/"," ")
                                       .replace('PARRALLEL',"PARALLEL")
                                       .lower(),
                # x*1==x is False only for NaN, so finite floats become integer strings
                "HighBit": lambda x: str(int(x)) if (isinstance(x, float) and x*1==x) else str(x),
                "WindowCenter": lambda x: np.median(parse_float_tuples(x)),
                "FieldOfViewOrigin":parse_float_tuples,
                "EstimatedRadiographicMagnificationFactor": lambda x: x,
                "ContentTime": lambda x: x,
                "FieldOfViewRotation": lambda x: float(parse_float(x)),
                "KVP": lambda x: float(parse_float(x)),
                 "ShutterLowerHorizontalEdge":  lambda x: float(parse_float(x)),
                 "ShutterRightVerticalEdge":   lambda x: float(parse_float(x)),
                 "XRayTubeCurrentInuA": lambda x: float(parse_float(x)),
                 "RelativeXRayExposure": lambda x: float(parse_float(x)),
                 "ManufacturerModelName": lambda x: str(x).lower().replace('"',''),
                 "Manufacturer": lambda x: str(x).lower().replace('"','').replace(',', '').replace(" inc", "").rstrip('.'),
                 "BodyPartThickness":lambda x: float(parse_float(x)),
                 "CollimatorLeftVerticalEdge": lambda x: float(parse_float(x)),
                 "CollimatorLowerHorizontalEdge": lambda x: float(parse_float(x)),
                 "DetectorActiveDimensions" : lambda x: parse_float_tuples(x.replace("\\", ", ") if isinstance(x, str) else x),
                 "ExposureTime": lambda x: x,
                 "ExposuresOnDetectorSinceLastCalibration": lambda x: x,
                 "ExposuresOnDetectorSinceManufactured": lambda x: x,
                 "DistanceSourceToEntrance":  lambda x: x,
                 "DetectorTemperature":lambda x: float(parse_float(x)),
                 "DistanceSourceToDetector":  lambda x: x,
 }
 # column name -> dtype the normalized column is cast to
 # NOTE(review): WindowCenter is cast to int, which fails if any median is
 # NaN -- confirm that parse_float_tuples never yields missing values here.
 dtypes = {"0_ViewCodeSequence__0_ViewModifierCodeSequence_CodeMeaning": str,
                "0_ViewCodeSequence_CodeValue": str,
                "Grid": str,
                "HighBit": str, # int
                "WindowCenter": int,
                "FieldOfViewOrigin": 'O',
                "EstimatedRadiographicMagnificationFactor": float,
                "ContentTime": float, #NaN
                "FieldOfViewRotation": float,
                "KVP": float,
                 "ShutterLowerHorizontalEdge": float,
                 "ShutterRightVerticalEdge": float,
                 "XRayTubeCurrentInuA": float,
                 "RelativeXRayExposure": float,
                 "ManufacturerModelName": str,
                 "Manufacturer": str,
                 "BodyPartThickness": float,
                 "CollimatorLeftVerticalEdge": float,
                 "CollimatorLowerHorizontalEdge": float,
                 "DetectorActiveDimensions" : 'O',
                 "ExposureTime": float,
                 "ExposuresOnDetectorSinceLastCalibration": float, # NaNs
                 "ExposuresOnDetectorSinceManufactured": float, # NaNs
                 "DistanceSourceToEntrance": float,
                 "DetectorTemperature": float,
                 "DistanceSourceToDetector": float,
 }
 #cell#
 # Columns without a normalizer are reported and passed through unchanged
 # instead of aborting the run with a KeyError.
 unhandled = sorted(set(dffeatures.columns) - set(normalize_fun.keys()))
 if unhandled:
    print("columns without a normalizer (left unchanged):", unhandled)
 #cell#
 for kk in list(dffeatures.columns):
    if kk not in normalize_fun:
        continue
    print(kk)
    # assign the whole column so that the new dtype replaces the old one
    dffeatures[kk] = dffeatures[kk].map(normalize_fun[kk]).astype(dtypes[kk])
 dffeatures.to_csv(outfn, sep='\t',  compression='gzip')
@@ -0,0 +1,48 @@
 ReduceLROnPlateau:
  cooldown: 32
  epsilon: 0.001
  factor: 0.5
  min_lr: 1.0e-08
  mode: auto
  monitor: val_loss
  patience: 32
  verbose: 0
 base_trainable: true
 batch_size: 256
 class_mode: binary
 class_weights: null
 classes:
 - normal
 - special
 contrast: null
 data_augmentation: true
 data_train: /data/UCSF_MAMMO/2018-02-png/withx_valset_4000_train
 data_val: /data/UCSF_MAMMO/2018-02-png/withx_valset_4000_test
 dropout: 0.5
 fill_mode: reflect
 final_activation: sigmoid
 height_shift_range: 0.125
 horizontal_flip: true
 init_epoch: 0
 loss_weights: null
 lr: 0.0001
 n_classes: 1
 nb_epoch: 500
 ndense: 0
 oversampling: false
 pretrained: true
 rotation_range: 15
 samplewise_center: false
 seed: 2
 target_side: 99
 target_size:
 - 99
 - 99
 truncate_quantile: null
 vertical_flip: false
 weightfile: null
 width_shift_range: 0.125
 zoom_range:
 - 0.8
 - 1.2
 ztransform: false
@@ -0,0 +1 @@
 ../inception_short.py
@@ -0,0 +1,185 @@
 # coding: utf-8
 import sys
 import pandas as pd
 sys.path.append('../..')
 from inception_short import get_model, get_num_files, get_class_weights
 from keras.optimizers import Adam
 from image import ImageDataGenerator
 # from keras.preprocessing.image import ImageDataGenerator
 from keras.models import load_model
 from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau
 from checkpoint_utils import CSVWallClockLogger, lr_cyclic_schedule
 from shutil import copy2
 from functools import partial
 class AttrDict(dict):
    """Dictionary whose entries are also reachable as attributes."""
    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        # the instance namespace and the mapping are one and the same object,
        # so d.key and d["key"] always read and write the same entry
        self.__dict__ = self
 import os
 import yaml
 import numpy as np
 import keras
 from hashlib import md5
 # Restore the parameters saved next to a checkpoint, load the trained model
 # named by the WFILE environment variable and prepare the generator settings
 # shared by all prediction runs below.
 # NOTE(review): modules from keras are already imported at the top of this
 # script, so KERAS_BACKEND is presumably set too late to take effect -- confirm.
 os.environ["PYTHONHASHSEED"]='0'
 os.environ['KERAS_BACKEND'] = 'tensorflow'
 os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
 # fall back to GPU 1 when the variable is unset as well as when it is empty
 if os.environ.get("CUDA_VISIBLE_DEVICES", '') == '':
    os.environ["CUDA_VISIBLE_DEVICES"] = '1'
 indir = "./"
 with open(os.path.join(indir, "checkpoint.info")) as chkpt_fh:
    # safe_load builds only plain YAML types and, unlike a bare yaml.load(),
    # needs no explicit Loader argument
    prms = AttrDict(yaml.safe_load(chkpt_fh))
    print("\n".join(["%s\t%s" %(kk,vv) for kk,vv in prms.items()]),)
 # the weight file name is mandatory and taken from the environment
 weightfile = os.environ["WFILE"]
 #weightfile = "model.175-0.068012.hdf5"
 prms['weightfile'] = os.path.join(indir, weightfile)
 # In[6]:
 prms["loss"] = '{}_crossentropy'.format( prms.class_mode )
 print("loss:", prms["loss"])
 # CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
 SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
 STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
 print('='*50)
 print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
 print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
 print('='*50)
 #########################################
 if prms.weightfile:
    print("LOADING WEIGHTS FROM:\t%s" % prms.weightfile)
 #     model.load_weights(prms.weightfile)
    model = load_model(prms.weightfile)
 # In[22]:
 # keyword arguments shared by every flow_from_directory() call
 flowfromdir_params = dict(
 #     color_mode = "grayscale",
    target_size=prms.target_size,
    batch_size=prms.batch_size,
    class_mode=prms.class_mode,
    classes=prms.classes,
    seed=prms.seed)
 # NOTE(review): samplewise_std_normalization reuses the samplewise_center
 # setting -- confirm that this is intended and not a copy-paste slip.
 norm_params = dict(
        #rescale=prms.scaleup,
        samplewise_center=prms.samplewise_center,
        samplewise_std_normalization=prms.samplewise_center,
        featurewise_center=False,
        featurewise_std_normalization=False,
        zca_whitening=False,
        )
 # In[23]:
 train_datagen = ImageDataGenerator(**norm_params)
 # reverse the second-to-last axis of every image
 train_datagen.preprocessing_function = lambda x: x[...,::-1,:]#*2**-8
 datagen_train_output = train_datagen.flow_from_directory(
    prms.data_train,
    #stratify = prms.oversampling,
    #sampling_factor=prms.sampling_factor,
    #oversampling=prms.oversampling,
    shuffle=False, **flowfromdir_params)
 # recompute from the generator, rounding up so that no file is dropped
 SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
 STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
 def get_predictions(data_dir, 
                    preprocessing_function = lambda x:x,
                    model=model):
    """Score every image below data_dir and return the results as a table.

    data_dir               -- directory handed to flow_from_directory()
    preprocessing_function -- callable applied by the generator, or one of the
                              names 'fliplr', 'identity', 'orig'
    model                  -- object whose predict_generator() does the scoring

    Returns a DataFrame with one "scores_<k>" column per model output plus the
    generator's "files" and "label" columns. Raises ValueError for an unknown
    preprocessing name.
    """
    if isinstance(preprocessing_function, str):
        named_transforms = {
            'fliplr': lambda x: x[...,::-1,:],
            'identity': lambda x:x,
            'orig': lambda x:x,
        }
        if preprocessing_function not in named_transforms:
            raise ValueError('unknown preprocessing_function:\t%s' 
                             % preprocessing_function)
        preprocessing_function = named_transforms[preprocessing_function]
    datagen = ImageDataGenerator(**norm_params)
    datagen.preprocessing_function = preprocessing_function
    # shuffle=False is passed so the generator does not reorder the files
    batches = datagen.flow_from_directory(
            data_dir,
            shuffle=False, **flowfromdir_params)
    yhat = model.predict_generator(batches,
                          steps=len(batches),
                          verbose=1,)
    # one column per model output, followed by file names and class indices
    columns = {}
    for nn, yy in enumerate(yhat.T):
        columns["scores_%d"%nn] = yy
    columns["files"] = batches.filenames
    columns["label"] = batches.classes
    return pd.DataFrame(columns)
 ##########################################
 #                HOLDOUT 
 ##########################################
 data_holdout = '/data/UCSF_MAMMO/2018-02-png/withx_valset_4000_val'
 # (image directory, csv for the unmodified images, csv for the flipped images)
 # in the order holdout, test, train
 prediction_jobs = [
    (data_holdout, "predictions_val.csv", "predictions_val_fliplr.csv"),
    (prms.data_val, "predictions_test.csv", "predictions_test_fliplr.csv"),
    (prms.data_train, "predictions_train.csv", "predictions_train_fliplr.csv"),
 ]
 for job_dir, csv_plain, csv_flipped in prediction_jobs:
    ##########################################
    # images as stored
    dfres = get_predictions(
                job_dir,
                preprocessing_function = lambda x:x,
                model=model)
    dfres.to_csv(csv_plain, index=False)
    ##########################################
    # images with the second-to-last axis reversed
    preprocessing_function = lambda x: x[...,::-1,:]
    dfres = get_predictions(
                job_dir,
                preprocessing_function = preprocessing_function,
                model=model)
    dfres.to_csv(csv_flipped, index=False)
@@ -0,0 +1,239 @@
 from inception_short import get_model, get_num_files, get_class_weights
 from keras.optimizers import Adam
 from image import ImageDataGenerator
 #from keras.preprocessing.image import ImageDataGenerator
 from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau
 from checkpoint_utils import CSVWallClockLogger, lr_cyclic_schedule
 from shutil import copy2
 from functools import partial
 class AttrDict(dict):
    """dict subclass exposing its keys as attributes (prms.lr == prms["lr"])."""
    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        # point the attribute namespace at the mapping itself, so attribute
        # and item access share one storage
        self.__dict__ = self
 if __name__ == '__main__':
    # Training entry point: everything below runs only when the file is
    # executed as a script.
    import sys
    import os
    import yaml
    import numpy as np
    import keras
    from hashlib import md5
    # NOTE(review): modules from keras are already imported at the top of the
    # file, so KERAS_BACKEND is presumably set too late to matter -- confirm.
    os.environ["PYTHONHASHSEED"]='0'
    os.environ['KERAS_BACKEND'] = 'tensorflow'
    os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
    os.environ["CUDA_VISIBLE_DEVICES"] = '1'
    # Full parameter set of the run. The checkpoint directory name is derived
    # from an md5 of str(prms) right after this call, so changing any value,
    # or the order of the keys, selects a different directory.
    prms = AttrDict(
        dropout=0.5,
        base_trainable=True,
        horizontal_flip = True,
        vertical_flip = False,
        zoom_range = [0.8, 1.2],
        rotation_range = 15,
        fill_mode='reflect',
        ndense=0,
        batch_size = 128*2,
        init_epoch=0,
        nb_epoch = 500,
        data_augmentation = True,
        contrast = None, #0.8,
        truncate_quantile = None,#0.001,
        ztransform = False,
        oversampling = False,
        #sampling_factor = None, [1, 6, 16, 64, 4],
        seed=2,
        width_shift_range = 0.125,
        height_shift_range = 0.125,
        class_mode =  'binary', # 'binary', #
        n_classes = 1,
        final_activation = 'sigmoid',
        lr = 1e-4,
        samplewise_center = False, #True
        # images are resized to target_side x target_side (see target_size below the hash)
        target_side = 99,
        weightfile = None,
        pretrained = True,
        data_train = '/data/UCSF_MAMMO/2018-02-png/withx_valset_4000_train',
        data_val = '/data/UCSF_MAMMO/2018-02-png/withx_valset_4000_test',
        classes = ['normal', 'special'],
        class_weights=None,#[1, 1, 4, 8, 4],
        loss_weights = None,
        # keyword arguments forwarded verbatim to the ReduceLROnPlateau callback
        ReduceLROnPlateau = dict(
            monitor='val_loss',
            factor=1/2,
            patience=32,
            verbose=0,
            mode='auto', epsilon=0.001,
            cooldown=32,
            min_lr=1e-8,
            ),
        # alternative schedule; only consulted when ReduceLROnPlateau is absent or empty
 #        lr_cyclic_schedule = dict(
 #                #lr_init = 1.0e-3,
 #                drop = 2/5,
 #                epochs_drop = 20,
 #                cycle_len = 200.0
 #            )
        )
    # Checkpoints of a run live in a directory named after the md5 of the
    # parameter set (hashed before target_size and loss are added to it).
    paramhash = md5(str(prms).encode()).hexdigest()
    prms["target_size"] = [ prms.target_side ]*2
    CHECKPOINT_DIR = "checkpoints/" + paramhash + "/"
    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
    print("SAVING TO:\t%s" % CHECKPOINT_DIR)
    # copy the script to the checkpoint directory
    copy2(os.path.abspath(__file__), CHECKPOINT_DIR)
    # store the parameters next to the weights so that a run can be restored
    with open(os.path.join(CHECKPOINT_DIR, "checkpoint.info"), "w+") as outfh:
        yaml.dump(dict(prms), outfh, default_flow_style=False)
    prms["loss"] = '{}_crossentropy'.format( prms.class_mode )
    print("loss:", prms["loss"])
    CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
    SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
    STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
    print('='*50)
    print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
    print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
    print('='*50)
    #########################################
    # callbacks: a checkpoint after every epoch plus a CSV progress log
    checkpoint = ModelCheckpoint(CHECKPOINT_PATH, monitor='val_loss', verbose=1,
            save_best_only=False, save_weights_only=False, mode='auto', period=1)
    csv_path = os.path.join(CHECKPOINT_DIR, "progresslog.csv")
    csv_callback = CSVWallClockLogger(csv_path, separator=',', append=False)
    callback_list = [checkpoint, csv_callback]
    # learning-rate policy: plateau-based reduction takes precedence over the
    # cyclic schedule when both are configured
    if ("ReduceLROnPlateau" in prms) and prms["ReduceLROnPlateau"]:
        callback_list.append(ReduceLROnPlateau(**prms["ReduceLROnPlateau"]))
    elif "lr_cyclic_schedule" in prms:
        callback_list.append(
                LearningRateScheduler(
                    partial(lr_cyclic_schedule,
                        lr_init = prms.lr,
                        **prms.lr_cyclic_schedule)
                                )
                            )
    #########################################
    model = get_model(n_classes=prms.n_classes,
                      final_activation=prms.final_activation,
                      ndense=prms.ndense,
                      dropout=prms.dropout,
                      base_trainable=prms.base_trainable,
                      weights = 'imagenet' if prms.pretrained else None,
                      input_shape = prms.target_size + [3])
    #from keras.utils import plot_model
    #plot_model(model, to_file='model.png')
    model.compile(optimizer=Adam(lr=prms.lr), loss=prms.loss,
                  metrics=['accuracy'],
                  )
    #########################################
    if prms.weightfile:
        print("loading weights from:\t%s" % prms.weightfile)
        model.load_weights(prms.weightfile)
    #########################################
    # The augmentation notice is printed further down, only when
    # prms.data_augmentation is set; announcing it here unconditionally was
    # misleading for runs without augmentation.
    # keyword arguments shared by every flow_from_directory() call
    flowfromdir_params = dict(
        #color_mode = "grayscale",
        target_size=prms.target_size,
        batch_size=prms.batch_size,
        class_mode=prms.class_mode,
        classes=prms.classes,
        seed=prms.seed)
    # NOTE(review): samplewise_std_normalization reuses the samplewise_center
    # setting -- confirm that this is intended and not a copy-paste slip.
    norm_params = dict(
            #rescale=prms.scaleup,
            samplewise_center=prms.samplewise_center,
            samplewise_std_normalization=prms.samplewise_center,
            featurewise_center=False,
            featurewise_std_normalization=False,
            zca_whitening=False,
            )
    def _ztransform(x):
        return (x-np.mean(x)) / np.std(x)
    # Optional per-image preprocessing, selected by name in prms.
    # NOTE(review): the resulting callable is not handed to any generator
    # below -- confirm whether it is meant to be used.
    if 'preprocessing_function' in prms:
        if prms.preprocessing_function=='ztransform':
            preprocessing_function = _ztransform
        elif prms.preprocessing_function=='m1p1':
            preprocessing_function = lambda x: x/128.0 - 1
        else:
            raise ValueError("unknown preprocessing_function")
    else:
        preprocessing_function = lambda x: x
    # training generator: augmenting or plain, depending on the parameters
    if prms.data_augmentation:
        print('Using real-time data augmentation.')
        train_datagen = ImageDataGenerator(
            zoom_range=prms.zoom_range,
            fill_mode=prms.fill_mode,
            rotation_range = prms.rotation_range,
            width_shift_range = prms.width_shift_range,
            height_shift_range = prms.height_shift_range,
            horizontal_flip=prms.horizontal_flip,
            vertical_flip=prms.vertical_flip,
            contrast = prms.contrast,
            z_transform = prms.ztransform,
            truncate_quantile = prms.truncate_quantile,
            #histeq_alpha=prms.histeq_alpha,
            **norm_params)
    else:
        train_datagen = ImageDataGenerator(**norm_params)
    val_datagen = ImageDataGenerator(**norm_params)
    datagen_train_output = train_datagen.flow_from_directory(
        prms.data_train, 
        stratify = prms.oversampling,
        # .get(): sampling_factor is optional in prms; attribute access would
        # raise AttributeError when oversampling is on and the key is missing
        sampling_factor=prms.get("sampling_factor") if (prms.oversampling) else None,
        oversampling=prms.oversampling,
        shuffle=True, **flowfromdir_params)
    datagen_val_output = val_datagen.flow_from_directory(
        prms.data_val, shuffle=False, **flowfromdir_params)
    #VALIDATION_STEPS = get_num_files(prms.data_val) // prms.batch_size
    # whole number of batches, rounded up so every validation file is seen once
    VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
    print("validation steps", VALIDATION_STEPS)
    #########################################
    # NOTE(review): 'auto' class weights are derived from the validation
    # generator, not the training one -- confirm that this is intended.
    if prms.class_weights == 'auto':
        class_weights = get_class_weights(datagen_val_output)
    else:
        class_weights = prms.class_weights
    model.fit_generator(datagen_train_output,
                          steps_per_epoch=STEPS_PER_EPOCH,
                          epochs=prms.nb_epoch, verbose=1,
                          validation_data=datagen_val_output,
                          validation_steps=VALIDATION_STEPS,
                          #class_weight='auto',
                          class_weight=class_weights,
                          callbacks=callback_list,
                          initial_epoch=prms.init_epoch)
    # fresh generator for the final evaluation on the validation directory
    datagen_val_output = val_datagen.flow_from_directory(
        prms.data_val, shuffle=False, **flowfromdir_params)
    print("""loss\t%.4f
    accuracy\t%.4f\n""" %
      tuple(model.evaluate_generator(datagen_val_output,
                                     steps=VALIDATION_STEPS,
                                     workers=1,
                                    pickle_safe=True)))
    #model.predict()
@@ -0,0 +1,48 @@
 # Run parameters in block-style YAML (one key per line, alphabetical order).
 # NOTE(review): presumably a checkpoint.info dump written by a training
 # script; weightfile is null and seed is 1 here -- confirm the producing run.
 ReduceLROnPlateau:
  cooldown: 8
  epsilon: 0.001
  factor: 0.5
  min_lr: 1.0e-12
  mode: auto
  monitor: val_loss
  patience: 64
  verbose: 0
 base_trainable: false
 batch_size: 16
 class_mode: categorical
 class_weights:
 - 1
 - 1
 classes:
 - normal
 - wire
 data_augmentation: true
 data_train: /data/UCSF_MAMMO/2018-02-png/each_class_4189_train/
 data_val: /data/UCSF_MAMMO/2018-02-png/each_class_4189_test/
 dropout: 0.5
 fill_mode: reflect
 final_activation: softmax
 height_shift_range: 0.125
 horizontal_flip: true
 init_epoch: 0
 lr: 0.001
 n_classes: 2
 nb_epoch: 500
 ndense: 0
 oversampling: false
 rescale: 1
 rotation_range: 30
 samplewise_center: false
 seed: 1
 target_side: 299
 target_size:
 - 299
 - 299
 truncate_quantile: null
 vertical_flip: false
 weightfile: null
 width_shift_range: 0.125
 zoom_range:
 - 0.8
 - 1.2
 ztransform: true
@@ -0,0 +1,49 @@
 # Run parameters in block-style YAML (one key per line, alphabetical order).
 # The keys and values match the prms dictionary of the prediction script
 # that follows, which dumps dict(prms) to "checkpoint.info" after adding
 # target_size and before adding the loss name (hence no "loss" key here).
 # weightfile names the saved model whose weights that script loads.
 ReduceLROnPlateau:
  cooldown: 8
  epsilon: 0.001
  factor: 0.5
  min_lr: 1.0e-12
  mode: auto
  monitor: val_loss
  patience: 64
  verbose: 0
 base_trainable: false
 batch_size: 16
 class_mode: categorical
 class_weights:
 - 1
 - 1
 classes:
 - normal
 - wire
 data_augmentation: true
 data_holdout: /data/UCSF_MAMMO/2018-02-png/each_class_augm_xw_4189_val/
 data_train: /data/UCSF_MAMMO/2018-02-png/each_class_4189_train/
 data_val: /data/UCSF_MAMMO/2018-02-png/each_class_4189_test/
 dropout: 0.5
 fill_mode: reflect
 final_activation: softmax
 height_shift_range: 0.125
 horizontal_flip: true
 init_epoch: 0
 lr: 0.001
 n_classes: 2
 nb_epoch: 500
 ndense: 0
 oversampling: false
 rescale: 1
 rotation_range: 30
 samplewise_center: false
 seed: 2
 target_side: 299
 target_size:
 - 299
 - 299
 truncate_quantile: null
 vertical_flip: false
 weightfile: model.147-0.000774.hdf5
 width_shift_range: 0.125
 zoom_range:
 - 0.8
 - 1.2
 ztransform: true
@@ -0,0 +1,315 @@
 import os
 import sys
 # Backend / GPU selection has to be in the environment *before* Keras (and
 # the backend it loads) is imported; previously these assignments ran after
 # the keras imports, so KERAS_BACKEND could no longer influence the choice.
 # PYTHONHASHSEED is only read at interpreter start-up: setting it here
 # affects child processes, not this one -- export it in the shell as well
 # if reproducible hashing is required.
 os.environ["PYTHONHASHSEED"]='0'
 os.environ['KERAS_BACKEND'] = 'tensorflow'
 os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
 os.environ["CUDA_VISIBLE_DEVICES"]="0"
 # Project helper modules are resolved relative to the working directory.
 sys.path.append('../..')
 import pandas as pd
 import yaml
 import numpy as np
 import keras
 from hashlib import md5
 from shutil import copy2
 from inception_short import get_model, get_num_files, get_class_weights
 from keras.optimizers import Adam
 from image import ImageDataGenerator
 #from keras.preprocessing.image import ImageDataGenerator
 from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
 from checkpoint_utils import CSVWallClockLogger
 from losses import acc_0, acc_1, acc_2, acc_3, acc_4
 class AttrDict(dict):
    """Dictionary whose items can also be read and written as attributes.

    The instance ``__dict__`` is aliased to the dictionary itself, so
    ``d.key`` and ``d["key"]`` always refer to the same value.
    """
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self
 # Run configuration for this script.
 # NOTE: str(prms) is md5-hashed right below to name the checkpoint
 # directory, so adding, removing, reordering or re-valuing any entry changes
 # that directory name.
 prms = AttrDict(
    # dropout / base_trainable / ndense / n_classes / final_activation are
    # forwarded to get_model().
    dropout=0.5,
    base_trainable=False,
    # Geometric augmentation settings for the training ImageDataGenerator.
    horizontal_flip = True,
    vertical_flip = False,
    zoom_range = [0.8, 1.2],
    rotation_range = 30,
    fill_mode='reflect',
    ndense=0,
    batch_size = 16,
    # init_epoch / nb_epoch / oversampling are not read by the prediction
    # code of this script.
    init_epoch=0,
    nb_epoch = 500,
    data_augmentation = True,
    rescale = 1, #2**-8,
    #contrast = 0.9,
    truncate_quantile = None,#0.001,
    ztransform = True,
    oversampling = False,
    #sampling_factor = [1, 4],
    seed=2,
    width_shift_range = 0.125,
    height_shift_range = 0.125,
    # class_mode also determines the loss name ("<class_mode>_crossentropy").
    class_mode =  'categorical', # 'binary', #
    n_classes = 2,
    final_activation = "softmax", # 'sigmoid',
    lr = 1e-3,
    # samplewise_center is also used for samplewise_std_normalization below.
    samplewise_center = False, #True
    # target_side is expanded to target_size = [side, side] after hashing.
    target_side = 299,
    #weights = None,
    # Weights loaded into the model before predicting.
    weightfile = "model.147-0.000774.hdf5",
    # Image directories for the train / test / holdout prediction passes.
    data_train = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_train/',
    data_val = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_test/',
    data_holdout = "/data/UCSF_MAMMO/2018-02-png/each_class_augm_xw_4189_val/",
    classes = ["normal", "wire"],
    class_weights=[1, 1],
    # Keyword arguments for the keras ReduceLROnPlateau callback.
    ReduceLROnPlateau = dict(
        monitor='val_loss',
        factor=1/2,
        patience=32*2,
        verbose=0,
        mode='auto', epsilon=0.001,
        cooldown=8,
        min_lr=1e-12,
        ),
 )
 # Fingerprint of the configuration; taken before target_size and loss are
 # added, so only the hand-written parameters feed the directory name.
 paramhash = md5(str(prms).encode()).hexdigest()
 prms["target_size"] = [prms.target_side, prms.target_side]
 CHECKPOINT_DIR = "checkpoints/" + paramhash + "/"
 os.makedirs(CHECKPOINT_DIR, exist_ok=True)
 print("SAVING TO:\t%s" % CHECKPOINT_DIR)
 # Archive this script and a YAML dump of the parameters next to the run.
 copy2(os.path.abspath(__file__), CHECKPOINT_DIR)
 _info_path = os.path.join(CHECKPOINT_DIR, "checkpoint.info")
 with open(_info_path, "w+") as info_fh:
    yaml.dump(dict(prms), info_fh, default_flow_style=False)
 # Keras fills in {epoch} and {val_loss} when a checkpoint is written.
 CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
 # Training-set size and the number of full batches it contains.
 SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
 STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
 print('='*50)
 print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
 print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
 print('='*50)
 #########################################
 # Callbacks: best-model checkpointing and a CSV progress log, plus the
 # optional learning-rate reduction when it is configured.
 checkpoint = ModelCheckpoint(
    CHECKPOINT_PATH,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
    period=1)
 csv_path = os.path.join(CHECKPOINT_DIR, "progresslog.csv")
 csv_callback = CSVWallClockLogger(csv_path, separator=',', append=False)
 prms["loss"] = '{}_crossentropy'.format( prms.class_mode )
 callback_list = [checkpoint, csv_callback]
 _reduce_lr_kwargs = prms.get("ReduceLROnPlateau")
 if _reduce_lr_kwargs:
    callback_list.append(ReduceLROnPlateau(**_reduce_lr_kwargs))
 #########################################
 # Network built from the architecture-related parameters.
 model = get_model(
    n_classes=prms.n_classes,
    final_activation=prms.final_activation,
    ndense=prms.ndense,
    #weights = prms.weights,
    dropout=prms.dropout,
    base_trainable=prms.base_trainable)
 #from keras.utils import plot_model
 #plot_model(model, to_file='model.png')
 if __name__ == '__main__':
    # Score the holdout, test and train directories with the saved model,
    # once as-is and once mirrored left-right, writing one CSV per pass.
    model.compile(optimizer=Adam(lr=prms.lr), loss=prms.loss,
                  metrics=['accuracy', acc_0, acc_1,# acc_2, acc_3, acc_4
                      ],
                  )
    #########################################
    if prms.weightfile:
        print("loading weights from:\t%s" % prms.weightfile)
        model.load_weights(prms.weightfile)
    #########################################
    print('Using real-time data augmentation.')
    # Arguments shared by every flow_from_directory call.
    flowfromdir_params = dict(
        #color_mode = "grayscale",
        target_size=prms.target_size,
        batch_size=prms.batch_size,
        class_mode=prms.class_mode,
        classes=prms.classes,
        seed=prms.seed)
    # Normalisation settings shared by all generators.
    norm_params = dict(
            rescale=prms.rescale,
            samplewise_center=prms.samplewise_center,
            # NOTE(review): std-normalisation follows samplewise_center;
            # there is no separate parameter for it.
            samplewise_std_normalization=prms.samplewise_center,
            featurewise_center=False,
            featurewise_std_normalization=False,
            zca_whitening=False,
            z_transform = prms.ztransform,
            )
    # The former preprocessing_function selection (ztransform / m1p1) was
    # removed: its result was never passed to any generator and prms has no
    # 'preprocessing_function' entry.  The unused class_weights computation
    # was dropped for the same reason.
    def _flip_lr(x):
        """Reverse the second-to-last axis of an image array."""
        return x[...,::-1,:]
    def _num_batches(flow):
        """Batches needed to visit every file of `flow` exactly once."""
        return int(np.ceil(len(flow.filenames) / prms['batch_size']))
    def _predict_to_csv(flow, steps, out_csv):
        """Predict `steps` batches from `flow` and save scores, files, labels."""
        yhat = model.predict_generator(flow,
                              steps=steps,
                              verbose=1,)
        # One "scores_<k>" column per model output.
        dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
        dfdict.update({"files": flow.filenames, "label": flow.classes})
        pd.DataFrame(dfdict).to_csv(out_csv, index=False)
    if prms.data_augmentation:
        print('Using real-time data augmentation.')
        train_datagen = ImageDataGenerator(
            zoom_range=prms.zoom_range,
            fill_mode=prms.fill_mode,
            rotation_range = prms.rotation_range,
            width_shift_range = prms.width_shift_range,
            height_shift_range = prms.height_shift_range,
            horizontal_flip=prms.horizontal_flip,
            vertical_flip=prms.vertical_flip,
            contrast = prms.contrast if "contrast" in prms else None,
            truncate_quantile = prms.truncate_quantile,
            #histeq_alpha=prms.histeq_alpha,
            **norm_params)
    else:
        train_datagen = ImageDataGenerator(**norm_params)
    ##########################################
    # HOLDOUT and VAL, each plain and flipped
    ##########################################
    for directory, stem in ((prms.data_holdout, "predictions_holdout"),
                            (prms.data_val, "predictions_test")):
        for suffix, flip in (("", None), ("_fliplr", _flip_lr)):
            val_datagen = ImageDataGenerator(**norm_params)
            if flip is not None:
                val_datagen.preprocessing_function = flip
            datagen_val_output = val_datagen.flow_from_directory(
                directory, shuffle=False, **flowfromdir_params)
            VALIDATION_STEPS = _num_batches(datagen_val_output)
            print("validation steps", VALIDATION_STEPS)
            _predict_to_csv(datagen_val_output, VALIDATION_STEPS,
                            stem + suffix + ".csv")
    ##########################################
    # TRAIN, plain and flipped
    ##########################################
    # NOTE(review): these passes use train_datagen, i.e. the augmenting
    # generator whenever data_augmentation is enabled -- confirm that
    # predictions on augmented training images are intended.
    # The flipped output was previously written to the misspelled
    # "predictions_train_filplr.csv".
    for out_csv, flip in (("predictions_train.csv", None),
                          ("predictions_train_fliplr.csv", _flip_lr)):
        if flip is not None:
            train_datagen.preprocessing_function = flip
        datagen_train_output = train_datagen.flow_from_directory(
            prms.data_train,
            shuffle=False, **flowfromdir_params)
        SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
        STEPS_PER_EPOCH = _num_batches(datagen_train_output)
        print('='*50)
        print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
        print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
        print('='*50)
        _predict_to_csv(datagen_train_output, STEPS_PER_EPOCH, out_csv)
@@ -0,0 +1,50 @@
 # Run parameters in block-style YAML (one key per line, alphabetical order).
 # The keys and values match the prms dictionary of the prediction script
 # that follows, which dumps dict(prms) to "checkpoint.info" after adding
 # target_size and before adding the loss name (hence no "loss" key here).
 # data_everything is the directory that script scores in full.
 ReduceLROnPlateau:
  cooldown: 8
  epsilon: 0.001
  factor: 0.5
  min_lr: 1.0e-12
  mode: auto
  monitor: val_loss
  patience: 64
  verbose: 0
 base_trainable: false
 batch_size: 16
 class_mode: categorical
 class_weights:
 - 1
 - 1
 classes:
 - normal
 - wire
 data_augmentation: true
 data_everything: /media/exx/tron/2017-07-png-jae/
 data_holdout: /data/UCSF_MAMMO/2018-02-png/each_class_augm_xw_4189_val/
 data_train: /data/UCSF_MAMMO/2018-02-png/each_class_4189_train/
 data_val: /data/UCSF_MAMMO/2018-02-png/each_class_4189_test/
 dropout: 0.5
 fill_mode: reflect
 final_activation: softmax
 height_shift_range: 0.125
 horizontal_flip: true
 init_epoch: 0
 lr: 0.001
 n_classes: 2
 nb_epoch: 500
 ndense: 0
 oversampling: false
 rescale: 1
 rotation_range: 30
 samplewise_center: false
 seed: 2
 target_side: 299
 target_size:
 - 299
 - 299
 truncate_quantile: null
 vertical_flip: false
 weightfile: model.147-0.000774.hdf5
 width_shift_range: 0.125
 zoom_range:
 - 0.8
 - 1.2
 ztransform: true
@@ -0,0 +1,398 @@
 import os
 import sys
 # Backend / GPU selection has to be in the environment *before* Keras (and
 # the backend it loads) is imported; previously these assignments ran after
 # the keras imports, so KERAS_BACKEND could no longer influence the choice.
 # PYTHONHASHSEED is only read at interpreter start-up: setting it here
 # affects child processes, not this one -- export it in the shell as well
 # if reproducible hashing is required.
 os.environ["PYTHONHASHSEED"]='0'
 os.environ['KERAS_BACKEND'] = 'tensorflow'
 os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
 os.environ["CUDA_VISIBLE_DEVICES"]="3"
 # Locations of the project helper modules imported below.
 sys.path.append('../..')
 sys.path.append("/data/dlituiev/kerastrainutils/")
 import pandas as pd
 import yaml
 import numpy as np
 import keras
 from hashlib import md5
 from shutil import copy2
 from inception_short import get_model, get_num_files, get_class_weights
 from keras.optimizers import Adam
 from _image import ImageDataGenerator
 #from keras.preprocessing.image import ImageDataGenerator
 from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
 from checkpoint_utils import CSVWallClockLogger
 from losses import acc_0, acc_1, acc_2, acc_3, acc_4
 class AttrDict(dict):
    """Dictionary whose items can also be read and written as attributes.

    The instance ``__dict__`` is aliased to the dictionary itself, so
    ``d.key`` and ``d["key"]`` always refer to the same value.
    """
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self
 # Run configuration for this script.
 # NOTE: str(prms) is md5-hashed right below to name the checkpoint
 # directory, so adding, removing, reordering or re-valuing any entry changes
 # that directory name.
 prms = AttrDict(
    # dropout / base_trainable / ndense / n_classes / final_activation are
    # forwarded to get_model().
    dropout=0.5,
    base_trainable=False,
    # Geometric augmentation settings for the training ImageDataGenerator.
    horizontal_flip = True,
    vertical_flip = False,
    zoom_range = [0.8, 1.2],
    rotation_range = 30,
    fill_mode='reflect',
    ndense=0,
    batch_size = 16,
    init_epoch=0,
    nb_epoch = 500,
    data_augmentation = True,
    rescale = 1, #2**-8,
    #contrast = 0.9,
    truncate_quantile = None,#0.001,
    ztransform = True,
    oversampling = False,
    #sampling_factor = [1, 4],
    seed=2,
    width_shift_range = 0.125,
    height_shift_range = 0.125,
    # class_mode also determines the loss name ("<class_mode>_crossentropy").
    class_mode =  'categorical', # 'binary', #
    n_classes = 2,
    final_activation = "softmax", # 'sigmoid',
    lr = 1e-3,
    # samplewise_center is also used for samplewise_std_normalization below.
    samplewise_center = False, #True
    # target_side is expanded to target_size = [side, side] after hashing.
    target_side = 299,
    #weights = None,
    # Weights loaded into the model before predicting.
    weightfile = "model.147-0.000774.hdf5",
    data_train = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_train/',
    data_val = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_test/',
    data_holdout = "/data/UCSF_MAMMO/2018-02-png/each_class_augm_xw_4189_val/",
    # Directory scored in full; its basename is used as the single "class"
    # and its parent as the flow_from_directory root.
    data_everything = "/media/exx/tron/2017-07-png-jae/",
    classes = ["normal", "wire"],
    class_weights=[1, 1],
    # Keyword arguments for the keras ReduceLROnPlateau callback.
    ReduceLROnPlateau = dict(
        monitor='val_loss',
        factor=1/2,
        patience=32*2,
        verbose=0,
        mode='auto', epsilon=0.001,
        cooldown=8,
        min_lr=1e-12,
        ),
 )
 # Fingerprint of the configuration; taken before target_size and loss are
 # added, so only the hand-written parameters feed the directory name.
 paramhash = md5(str(prms).encode()).hexdigest()
 prms["target_size"] = [prms.target_side, prms.target_side]
 CHECKPOINT_DIR = "checkpoints/" + paramhash + "/"
 os.makedirs(CHECKPOINT_DIR, exist_ok=True)
 print("SAVING TO:\t%s" % CHECKPOINT_DIR)
 # Archive this script and a YAML dump of the parameters next to the run.
 copy2(os.path.abspath(__file__), CHECKPOINT_DIR)
 _info_path = os.path.join(CHECKPOINT_DIR, "checkpoint.info")
 with open(_info_path, "w+") as info_fh:
    yaml.dump(dict(prms), info_fh, default_flow_style=False)
 # Keras fills in {epoch} and {val_loss} when a checkpoint is written.
 CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
 # Training-set size and the number of full batches it contains.
 SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
 STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
 print('='*50)
 print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
 print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
 print('='*50)
 #########################################
 # Callbacks: best-model checkpointing and a CSV progress log, plus the
 # optional learning-rate reduction when it is configured.
 checkpoint = ModelCheckpoint(
    CHECKPOINT_PATH,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
    period=1)
 csv_path = os.path.join(CHECKPOINT_DIR, "progresslog.csv")
 csv_callback = CSVWallClockLogger(csv_path, separator=',', append=False)
 prms["loss"] = '{}_crossentropy'.format( prms.class_mode )
 callback_list = [checkpoint, csv_callback]
 _reduce_lr_kwargs = prms.get("ReduceLROnPlateau")
 if _reduce_lr_kwargs:
    callback_list.append(ReduceLROnPlateau(**_reduce_lr_kwargs))
 #########################################
 # Network built from the architecture-related parameters.
 model = get_model(
    n_classes=prms.n_classes,
    final_activation=prms.final_activation,
    ndense=prms.ndense,
    #weights = prms.weights,
    dropout=prms.dropout,
    base_trainable=prms.base_trainable)
 #from keras.utils import plot_model
 #plot_model(model, to_file='model.png')
 if __name__ == '__main__':
    # Score every image under prms.data_everything with the saved model in
    # four orientations (as-is, left-right, up-down, both) and write one CSV
    # of per-file scores for each orientation.
    model.compile(optimizer=Adam(lr=prms.lr), loss=prms.loss,
                  metrics=['accuracy', acc_0, acc_1,# acc_2, acc_3, acc_4
                      ],
                  )
    #########################################
    if prms.weightfile:
        print("loading weights from:\t%s" % prms.weightfile)
        model.load_weights(prms.weightfile)
    #########################################
    print('Using real-time data augmentation.')
    # Arguments shared by every flow_from_directory call.
    flowfromdir_params = dict(
        #color_mode = "grayscale",
        target_size=prms.target_size,
        batch_size=prms.batch_size,
        class_mode=prms.class_mode,
        classes=prms.classes,
        seed=prms.seed)
    # Normalisation settings for the generator.
    norm_params = dict(
            rescale=prms.rescale,
            samplewise_center=prms.samplewise_center,
            # NOTE(review): std-normalisation follows samplewise_center;
            # there is no separate parameter for it.
            samplewise_std_normalization=prms.samplewise_center,
            featurewise_center=False,
            featurewise_std_normalization=False,
            zca_whitening=False,
            z_transform = prms.ztransform,
            )
    # Removed as dead code: the preprocessing_function selection (never
    # passed to a generator), the unused augmenting train_datagen, and the
    # holdout / test / train passes that used to follow sys.exit().  Those
    # passes were unreachable, and would have run with
    # flowfromdir_params['classes'] already overwritten below.
    ##########################################
    # Everything
    ##########################################
    val_datagen = ImageDataGenerator(**norm_params)
    # Present the directory as a single "class" folder inside its parent so
    # that flow_from_directory enumerates all of its images.
    everything_dir = prms.data_everything.rstrip('/')
    flowfromdir_params['classes'] = [os.path.basename(everything_dir)]
    # (output file, preprocessing function); None keeps the generator default.
    passes = (
        ("predictions_everything.csv", None),
        ("predictions_everything_fliplr.csv", lambda x: x[...,::-1,:]),
        ("predictions_everything_flipud.csv", lambda x: x[...,::-1,:,:]),
        ("predictions_everything_fliplrud.csv", lambda x: x[...,::-1,::-1,:]),
    )
    for pred_fn, flip in passes:
        if flip is not None:
            val_datagen.preprocessing_function = flip
        datagen_val_output = val_datagen.flow_from_directory(
            os.path.dirname(everything_dir),
            shuffle=False, **flowfromdir_params)
        VALIDATION_STEPS = len(datagen_val_output)
        with open(pred_fn, 'w+') as fh:
            # One score column per model output.
            print("files", *["scores_%d"%ii for ii in range(prms.n_classes)],
                  sep=',', file=fh)
            # Pull exactly VALIDATION_STEPS batches.  The old
            # "if ii > VALIDATION_STEPS: break" guard fetched two batches
            # past the end and ran the model on one of them, although none
            # of those results were ever written.
            for ii in range(VALIDATION_STEPS):
                batch = next(datagen_val_output)
                yhat =  model.predict_on_batch(batch[0])
                # shuffle=False, so batch ii holds this slice of filenames.
                filenames = datagen_val_output.filenames[ii*prms.batch_size:(ii+1)*prms.batch_size]
                for fnimg, yhat_ in zip(filenames, yhat):
                    print(fnimg, *yhat_, sep=',', file = fh)
    ##########################################
    # DONE
    ##########################################
    # Exit status 0: the run finished normally (previously sys.exit(1),
    # which reports failure to the calling shell).
    sys.exit(0)
@@ -0,0 +1 @@
 ../inception_short.py
@@ -0,0 +1,398 @@
 import sys
 import pandas as pd
 sys.path.append('../..')
 sys.path.append("/data/dlituiev/kerastrainutils/")
 from inception_short import get_model, get_num_files, get_class_weights
 from keras.optimizers import Adam
 from _image import ImageDataGenerator
 #from keras.preprocessing.image import ImageDataGenerator
 from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
 from checkpoint_utils import CSVWallClockLogger
 from shutil import copy2
 from losses import acc_0, acc_1, acc_2, acc_3, acc_4
 class AttrDict(dict):
    """Dictionary whose entries are also reachable as attributes (d.key <-> d["key"])."""

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        # Point the instance namespace at the mapping itself, so attribute
        # reads/writes and item reads/writes share one storage.
        self.__dict__ = self
 import sys
 import os
 import yaml
 import numpy as np
 import keras
 from hashlib import md5
 os.environ["PYTHONHASHSEED"]='0'
 os.environ['KERAS_BACKEND'] = 'tensorflow'
 os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
 os.environ["CUDA_VISIBLE_DEVICES"]="3"
 prms = AttrDict(
    dropout=0.5,
    base_trainable=False,
    horizontal_flip = True,
    vertical_flip = False,
    zoom_range = [0.8, 1.2],
    rotation_range = 30,
    fill_mode='reflect',
    ndense=0,
    batch_size = 16,
    init_epoch=0,
    nb_epoch = 500,
    data_augmentation = True,
    rescale = 1, #2**-8,
    #contrast = 0.9,
    truncate_quantile = None,#0.001,
    ztransform = True,
    oversampling = False,
    #sampling_factor = [1, 4],
    seed=2,
    width_shift_range = 0.125,
    height_shift_range = 0.125,
    class_mode =  'categorical', # 'binary', #
    n_classes = 2,
    final_activation = "softmax", # 'sigmoid',
    lr = 1e-3,
    samplewise_center = False, #True
    target_side = 299,
    #weights = None,
    weightfile = "model.147-0.000774.hdf5",
    data_train = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_train/',
    data_val = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_test/',
    data_holdout = "/data/UCSF_MAMMO/2018-02-png/each_class_augm_xw_4189_val/",
    data_everything = "/media/exx/tron/2017-07-png-jae/",
    classes = ["normal", "wire"],
    class_weights=[1, 1],
    ReduceLROnPlateau = dict(
        monitor='val_loss',
        factor=1/2,
        patience=32*2,
        verbose=0,
        mode='auto', epsilon=0.001,
        cooldown=8,
        min_lr=1e-12,
        ),
 )
 paramhash = md5(str(prms).encode()).hexdigest()
 prms["target_size"] = [ prms.target_side ]*2
 CHECKPOINT_DIR = "checkpoints/" + paramhash + "/"
 os.makedirs(CHECKPOINT_DIR, exist_ok=True)
 print("SAVING TO:\t%s" % CHECKPOINT_DIR)
 # copy the script to the checkpoint directory
 copy2(os.path.abspath(__file__), CHECKPOINT_DIR)
 with open(os.path.join(CHECKPOINT_DIR, "checkpoint.info"), "w+") as outfh:
    yaml.dump(dict(prms), outfh, default_flow_style=False)
 # w_categorical_crossentropy
 CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
 SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
 STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
 print('='*50)
 print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
 print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
 print('='*50)
 #########################################
 checkpoint = ModelCheckpoint(CHECKPOINT_PATH, monitor='val_loss', verbose=1,
        save_best_only=True, save_weights_only=False, mode='auto', period=1)
 csv_path = os.path.join(CHECKPOINT_DIR, "progresslog.csv")
 csv_callback = CSVWallClockLogger(csv_path, separator=',', append=False)
 prms["loss"] = '{}_crossentropy'.format( prms.class_mode )
 callback_list = [checkpoint, csv_callback]
 if ("ReduceLROnPlateau" in prms) and prms["ReduceLROnPlateau"]:
            callback_list.append(ReduceLROnPlateau(**prms["ReduceLROnPlateau"]))
 #########################################
 model = get_model(n_classes=prms.n_classes,
                  final_activation=prms.final_activation,
                  ndense=prms.ndense,
                  #weights = prms.weights,
                  dropout=prms.dropout,
                  base_trainable=prms.base_trainable)
 #from keras.utils import plot_model
 #plot_model(model, to_file='model.png')
 if __name__ == '__main__':
    model.compile(optimizer=Adam(lr=prms.lr), loss=prms.loss,
                  metrics=['accuracy', acc_0, acc_1,# acc_2, acc_3, acc_4
                      ],
                  )
    #########################################
    if prms.weightfile:
        print("loading weights from:\t%s" % prms.weightfile)
        model.load_weights(prms.weightfile)
    #########################################
    print('Using real-time data augmentation.')
    flowfromdir_params = dict(
        #color_mode = "grayscale",
        target_size=prms.target_size,
        batch_size=prms.batch_size,
        class_mode=prms.class_mode,
        classes=prms.classes,
        seed=prms.seed)
    norm_params = dict(
            rescale=prms.rescale,
            samplewise_center=prms.samplewise_center,
            samplewise_std_normalization=prms.samplewise_center,
            featurewise_center=False,
            featurewise_std_normalization=False,
            zca_whitening=False,
            z_transform = prms.ztransform,
            )
    def _ztransform(x):
        return (x-np.mean(x)) / np.std(x)
    if 'preprocessing_function' in prms:
        if prms.preprocessing_function=='ztransform':
            preprocessing_function = _ztransform
        elif prms.preprocessing_function=='m1p1':
            preprocessing_function = lambda x: x/128.0 - 1
        else:
            raise ValueError("unknown preprocessing_function")
    else:
        preprocessing_function = lambda x: x
    if prms.data_augmentation:
        print('Using real-time data augmentation.')
        train_datagen = ImageDataGenerator(
            zoom_range=prms.zoom_range,
            fill_mode=prms.fill_mode,
            rotation_range = prms.rotation_range,
            width_shift_range = prms.width_shift_range,
            height_shift_range = prms.height_shift_range,
            horizontal_flip=prms.horizontal_flip,
            vertical_flip=prms.vertical_flip,
            #contrast = prms.contrast if "contrast" in prms else None,
            #truncate_quantile = prms.truncate_quantile,
            #histeq_alpha=prms.histeq_alpha,
            **norm_params)
    else:
        train_datagen = ImageDataGenerator(**norm_params)
    ##########################################
    # Everything
    ##########################################
    # Expose the whole `data_everything` folder to flow_from_directory as a
    # single pseudo-class: its parent directory is scanned and its basename is
    # the only entry in `classes`.
    everything_root = prms.data_everything.rstrip('/')
    flowfromdir_params['classes'] = [os.path.basename(everything_root)]

    def _predict_everything(datagen, pred_fn):
        """Score every listed image once and write `file,score_0,...` rows to `pred_fn`."""
        datagen_output = datagen.flow_from_directory(
            os.path.dirname(everything_root),
            shuffle=False, **flowfromdir_params)
        all_filenames = datagen_output.filenames
        # Number of batches needed to visit each file exactly once. The
        # generator is only pulled this many times, so no wrapped-around
        # extra batch is loaded or predicted.
        n_batches = (len(all_filenames) + prms.batch_size - 1) // prms.batch_size
        with open(pred_fn, 'w+') as fh:
            print("files", *["scores_%d"%ii for ii in range(prms.n_classes)], sep=',', file=fh)
            # zip() stops as soon as range() is exhausted, before asking the
            # generator for another batch.
            for ii, batch in zip(range(n_batches), datagen_output):
                yhat = model.predict_on_batch(batch[0])
                filenames = all_filenames[ii*prms.batch_size:(ii+1)*prms.batch_size]
                for fnimg, yhat_ in zip(filenames, yhat):
                    print(fnimg, *yhat_, sep=',', file=fh)

    val_datagen = ImageDataGenerator(**norm_params)
    # (output file, flip installed as preprocessing_function). None keeps the
    # generator exactly as constructed. The lambdas reverse the second-to-last
    # axis, the third-to-last axis, or both -- presumably width / height of a
    # (height, width, channels) image, matching the lr / ud file suffixes.
    flip_variants = [
        ("predictions_everything.csv", None),
        ("predictions_everything_fliplr.csv", lambda x: x[...,::-1,:]),
        ("predictions_everything_flipud.csv", lambda x: x[...,::-1,:,:]),
        ("predictions_everything_fliplrud.csv", lambda x: x[...,::-1,::-1,:]),
    ]
    for pred_fn, flip in flip_variants:
        if flip is not None:
            val_datagen.preprocessing_function = flip
        _predict_everything(val_datagen, pred_fn)
    ##########################################
    # DONE
    ##########################################
    # Stop here on purpose: the train / holdout / val sections that follow are
    # never reached. Exit status 0 so that callers see a successful run.
    sys.exit(0)
    datagen_train_output = train_datagen.flow_from_directory(
        prms.data_train, 
        shuffle=False, **flowfromdir_params)
    #VALIDATION_STEPS = get_num_files(prms.data_val) // prms.batch_size
    ##########################################
    # HOLDOUT
    ##########################################
    val_datagen = ImageDataGenerator(**norm_params)
    datagen_val_output = val_datagen.flow_from_directory(
        prms.data_holdout, shuffle=False, **flowfromdir_params)
    VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
    print("validation steps", VALIDATION_STEPS)
    yhat = model.predict_generator(datagen_val_output,
                          steps=VALIDATION_STEPS,
                          verbose=1,)
    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
    dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
    dfres = pd.DataFrame(dfdict)
    dfres.to_csv("predictions_holdout.csv", index=False)
    ##########################################
    # HOLDOUT FLIPPED
    ##########################################
    val_datagen = ImageDataGenerator(**norm_params, )
    val_datagen.preprocessing_function = lambda x: x[...,::-1,:]
    datagen_val_output = val_datagen.flow_from_directory(
        prms.data_holdout, shuffle=False, **flowfromdir_params)
    VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
    print("validation steps", VALIDATION_STEPS)
    yhat = model.predict_generator(datagen_val_output,
                          steps=VALIDATION_STEPS,
                          verbose=1,)
    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
    dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
    dfres = pd.DataFrame(dfdict)
    dfres.to_csv("predictions_holdout_fliplr.csv", index=False)
    #########################################
    # VAL
    ##########################################
    val_datagen = ImageDataGenerator(**norm_params, )
    datagen_val_output = val_datagen.flow_from_directory(
        prms.data_val, shuffle=False, **flowfromdir_params)
    VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
    print("validation steps", VALIDATION_STEPS)
    yhat = model.predict_generator(datagen_val_output,
                          steps=VALIDATION_STEPS,
                          verbose=1,)
    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
    dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
    dfres = pd.DataFrame(dfdict)
    dfres.to_csv("predictions_test.csv", index=False)
    #########################################
    # VAL FLIPPED
    ##########################################
    val_datagen = ImageDataGenerator(**norm_params, )
    val_datagen.preprocessing_function = lambda x: x[...,::-1,:]
    datagen_val_output = val_datagen.flow_from_directory(
        prms.data_val, shuffle=False, **flowfromdir_params)
    VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
    print("validation steps", VALIDATION_STEPS)
    yhat = model.predict_generator(datagen_val_output,
                          steps=VALIDATION_STEPS,
                          verbose=1,)
    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
    dfdict.update({ "files":datagen_val_output.filenames, "label": datagen_val_output.classes})
    dfres = pd.DataFrame(dfdict)
    dfres.to_csv("predictions_test_fliplr.csv", index=False)
    #########################################
    SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
    STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
    print('='*50)
    print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
    print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
    print('='*50)
    if prms.class_weights == 'auto':
        class_weights = get_class_weights(datagen_val_output)
    else:
        class_weights = prms.class_weights
    yhat = model.predict_generator(datagen_train_output,
                          steps=STEPS_PER_EPOCH,
                          verbose=1,)
    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
    dfdict.update({"files":datagen_train_output.filenames, "label": datagen_train_output.classes})
    ##ipdb.set_trace()
    dfres = pd.DataFrame(dfdict)
    dfres.to_csv("predictions_train.csv", index=False)
    #########################################
    SAMPLES_PER_EPOCH = len(datagen_train_output.filenames)
    STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / prms.batch_size))
    print('='*50)
    print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
    print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
    print('='*50)
    if prms.class_weights == 'auto':
        class_weights = get_class_weights(datagen_val_output)
    else:
        class_weights = prms.class_weights
    train_datagen.preprocessing_function = lambda x: x[...,::-1,:]
    datagen_train_output = train_datagen.flow_from_directory(
        prms.data_train, 
        shuffle=False, **flowfromdir_params)
    yhat = model.predict_generator(datagen_train_output,
                          steps=STEPS_PER_EPOCH,
                          verbose=1,)
    dfdict = {"scores_%d"%nn : yy for nn, yy in enumerate(yhat.T)}
    dfdict.update({"files":datagen_train_output.filenames, "label": datagen_train_output.classes})
    ##ipdb.set_trace()
    dfres = pd.DataFrame(dfdict)
    dfres.to_csv("predictions_train_fliplr.csv", index=False)
@@ -0,0 +1,224 @@
 from inception_short import get_model, get_num_files, get_class_weights
 from keras.optimizers import Adam
 from image import ImageDataGenerator
 #from keras.preprocessing.image import ImageDataGenerator
 from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
 from checkpoint_utils import CSVWallClockLogger
 from shutil import copy2
 class AttrDict(dict):
    """Dictionary whose entries are also reachable as attributes (d.key <-> d["key"])."""

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        # Point the instance namespace at the mapping itself, so attribute
        # reads/writes and item reads/writes share one storage.
        self.__dict__ = self
 import sys
 import os
 import yaml
 import numpy as np
 import keras
 from hashlib import md5
 os.environ["PYTHONHASHSEED"]='0'
 os.environ['KERAS_BACKEND'] = 'tensorflow'
 os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
 os.environ["CUDA_VISIBLE_DEVICES"]="1"
 prms = AttrDict(
    dropout=0.5,
    base_trainable=False,
    horizontal_flip = True,
    vertical_flip = False,
    zoom_range = [0.8, 1.2],
    rotation_range = 30,
    fill_mode='reflect',
    ndense=0,
    batch_size = 16,
    init_epoch=0,
    nb_epoch = 500,
    data_augmentation = True,
    rescale = 1, #2**-8,
    #contrast = 0.9,
    truncate_quantile = None,#0.001,
    ztransform = True,
    oversampling = False,
    #sampling_factor = [1, 4],
    seed=1,
    width_shift_range = 0.125,
    height_shift_range = 0.125,
    class_mode =  'categorical', # 'binary', #
    n_classes = 2,
    final_activation = "softmax", # 'sigmoid',
    lr = 1e-3,
    samplewise_center = False, #True
    target_side = 299,
    #weights = None,
    weightfile = None, #"checkpoints/6a1a17e4bcaabe458c145fd64dec0322/model.31-1.290145.hdf5",
    #"checkpoints/6a1a17e4bcaabe458c145fd64dec0322/model.59-1.676424.hdf5",
    data_train = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_train/',
    data_val = '/data/UCSF_MAMMO/2018-02-png/each_class_4189_test/',
    classes = ["normal", "wire"],
    class_weights=[1, 1],
    ReduceLROnPlateau = dict(
        monitor='val_loss',
        factor=1/2,
        patience=32*2,
        verbose=0,
        mode='auto', epsilon=0.001,
        cooldown=8,
        min_lr=1e-12,
        ),
 )
 paramhash = md5(str(prms).encode()).hexdigest()
 prms["target_size"] = [ prms.target_side ]*2
 CHECKPOINT_DIR = "checkpoints/" + paramhash + "/"
 os.makedirs(CHECKPOINT_DIR, exist_ok=True)
 print("SAVING TO:\t%s" % CHECKPOINT_DIR)
 # copy the script to the checkpoint directory
 copy2(os.path.abspath(__file__), CHECKPOINT_DIR)
 with open(os.path.join(CHECKPOINT_DIR, "checkpoint.info"), "w+") as outfh:
    yaml.dump(dict(prms), outfh, default_flow_style=False)
 prms["loss"] = '{}_crossentropy'.format( prms.class_mode )
 CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'model.{epoch:02d}-{val_loss:2f}.hdf5')
 SAMPLES_PER_EPOCH = get_num_files(prms.data_train)
 STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // prms.batch_size
 print('='*50)
 print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
 print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
 print('='*50)
 #########################################
 checkpoint = ModelCheckpoint(CHECKPOINT_PATH, monitor='val_loss', verbose=1,
        save_best_only=True, save_weights_only=False, mode='auto', period=1)
 csv_path = os.path.join(CHECKPOINT_DIR, "progresslog.csv")
 csv_callback = CSVWallClockLogger(csv_path, separator=',', append=False)
 callback_list = [checkpoint, csv_callback]
 if ("ReduceLROnPlateau" in prms) and prms["ReduceLROnPlateau"]:
            callback_list.append(ReduceLROnPlateau(**prms["ReduceLROnPlateau"]))
 #########################################
 model = get_model(n_classes=prms.n_classes,
                  final_activation=prms.final_activation,
                  ndense=prms.ndense,
                  #weights = prms.weights,
                  dropout=prms.dropout,
                  base_trainable=prms.base_trainable)
 #from keras.utils import plot_model
 #plot_model(model, to_file='model.png')
 if __name__ == '__main__':
    model.compile(optimizer=Adam(lr=prms.lr), loss=prms.loss,
                  metrics=['accuracy', #acc_0, acc_1,# acc_2, acc_3, acc_4
                      ],
                  )
    #########################################
    if prms.weightfile:
        print("loading weights from:\t%s" % prms.weightfile)
        model.load_weights(prms.weightfile)
    #########################################
    print('Using real-time data augmentation.')
    flowfromdir_params = dict(
        #color_mode = "grayscale",
        target_size=prms.target_size,
        batch_size=prms.batch_size,
        class_mode=prms.class_mode,
        classes=prms.classes,
        seed=prms.seed)
    norm_params = dict(
            rescale=prms.rescale,
            samplewise_center=prms.samplewise_center,
            samplewise_std_normalization=prms.samplewise_center,
            featurewise_center=False,
            featurewise_std_normalization=False,
            zca_whitening=False,
            z_transform = prms.ztransform,
            )
    def _ztransform(x):
        return (x-np.mean(x)) / np.std(x)
    if 'preprocessing_function' in prms:
        if prms.preprocessing_function=='ztransform':
            preprocessing_function = _ztransform
        elif prms.preprocessing_function=='m1p1':
            preprocessing_function = lambda x: x/128.0 - 1
        else:
            raise ValueError("unknown preprocessing_function")
    else:
        preprocessing_function = lambda x: x
    if prms.data_augmentation:
        print('Using real-time data augmentation.')
        train_datagen = ImageDataGenerator(
            zoom_range=prms.zoom_range,
            fill_mode=prms.fill_mode,
            rotation_range = prms.rotation_range,
            width_shift_range = prms.width_shift_range,
            height_shift_range = prms.height_shift_range,
            horizontal_flip=prms.horizontal_flip,
            vertical_flip=prms.vertical_flip,
            contrast = prms.contrast if "contrast" in prms else None,
            truncate_quantile = prms.truncate_quantile,
            #histeq_alpha=prms.histeq_alpha,
            **norm_params)
    else:
        train_datagen = ImageDataGenerator(**norm_params)
    val_datagen = ImageDataGenerator(**norm_params)
    datagen_train_output = train_datagen.flow_from_directory(
        prms.data_train, 
        stratify = prms.oversampling,
        sampling_factor=prms.sampling_factor if prms.oversampling else None,
        oversampling=prms.oversampling,
        shuffle=True, **flowfromdir_params)
    # Validation data is read in a fixed order (shuffle=False).
    datagen_val_output = val_datagen.flow_from_directory(
        prms.data_val, shuffle=False, **flowfromdir_params)
    #VALIDATION_STEPS = get_num_files(prms.data_val) // prms.batch_size
    # Round up so the last, partially filled batch is included, and convert
    # to a plain int: np.ceil returns a float, while a step count is an
    # integer quantity.
    VALIDATION_STEPS = int(np.ceil(len(datagen_val_output.filenames)/prms['batch_size']))
    print("validation steps", VALIDATION_STEPS)
    #########################################
    if prms.class_weights == 'auto':
        class_weights = get_class_weights(datagen_val_output)
    else:
        class_weights = prms.class_weights
    model.fit_generator(datagen_train_output,
                          steps_per_epoch=STEPS_PER_EPOCH,
                          epochs=prms.nb_epoch, verbose=1,
                          validation_data=datagen_val_output,
                          validation_steps=VALIDATION_STEPS,
                          #class_weight='auto',
                          class_weight=class_weights,
                          callbacks=callback_list,
                          initial_epoch=prms.init_epoch)
    datagen_val_output = val_datagen.flow_from_directory(
        prms.data_val, shuffle=False, **flowfromdir_params)
    print("""loss\t%.4f
    accuracy\t%.4f\n""" %
      tuple(model.evaluate_generator(datagen_val_output,
                                     steps=VALIDATION_STEPS,
                                     workers=1,
                                    pickle_safe=True)))
    #model.predict()
@@ -0,0 +1,245 @@
 #!/usr/bin/env python2
 # -*- coding: utf-8 -*-
 """
 Created on Fri Jun  9 11:00:55 2017
@author: dlituiev
 """
 import os
 from collections import Counter
 from functools import partial
 from itertools import product
 import keras
 from keras.applications.inception_v3 import InceptionV3
 from keras.preprocessing import image
 from keras.models import Model
 from keras.layers import Dense, GlobalAveragePooling2D, GaussianNoise, Input
 from keras import backend as K
 from keras.preprocessing.image import ImageDataGenerator
 from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint, EarlyStopping
 from keras.layers import Dense, Dropout, Activation, Flatten, Lambda, BatchNormalization, Input
 from keras.optimizers import Adam
 #########################################
 def get_num_files(parentdir):
    """Count the entries found directly inside each sub-directory of `parentdir`.

    Entries that sit directly in `parentdir` and are not directories are
    ignored. Inside a sub-directory every entry counts, whatever its type.
    Works for absolute and relative `parentdir` alike.

    Returns the total as an int (0 when there are no sub-directories).
    """
    numfiles = 0
    for name in os.listdir(parentdir):
        # Join the bare entry name: joining a scandir entry (which already
        # carries the parent prefix) doubles the prefix for relative paths.
        subdir = os.path.join(parentdir, name)
        if os.path.isdir(subdir):
            numfiles += len(os.listdir(subdir))
    return numfiles
 #########################################
 #########################################
 #          SET UP THE NETWORK
 #########################################
 def get_model(n_classes, final_activation,
               ndense=512, dropout=0.5,
               weights='imagenet',
               input_shape = [None, None, 3],
               gaussian_noise_sigma = None,
               input_tensor = None,
               base_trainable=False):
    """Build a classifier on top of a truncated InceptionV3 network.

    The InceptionV3 base (without its top) is cut right after its fourth
    `Concatenate` layer. Global average pooling, dropout, an optional ReLU
    dense layer and a final dense layer with `n_classes` units are stacked
    on that output.

    Parameters
    ----------
    n_classes : number of units of the output layer.
    final_activation : activation of the output layer.
    ndense : width of the optional hidden dense layer; skipped unless > 0.
    dropout : rate handed to the `Dropout` layer.
    weights : forwarded to `InceptionV3(weights=...)`.
    input_shape : when truthy, a new `Input(shape=input_shape)` is created
        and replaces whatever was passed as `input_tensor`.
    gaussian_noise_sigma : when not None, `GaussianNoise` with this value is
        applied to the input tensor.
    input_tensor : only used when `input_shape` is falsy.
    base_trainable : when False, all base layers are frozen first. In either
        case the layers from the second-to-last `Concatenate` of the final
        model onwards are set trainable afterwards.

    Returns
    -------
    The assembled, not yet compiled `Model`.

    Raises
    ------
    ValueError : if the base network has fewer than four `Concatenate` layers.
    """
    if input_shape:
        input_tensor = Input(shape = input_shape)

    if gaussian_noise_sigma is not None:
        input_tensor = GaussianNoise(gaussian_noise_sigma)(input_tensor)
    # create the base pre-trained model
    base_model = InceptionV3(weights=weights, include_top=False,
                             input_tensor = input_tensor,
                            )
    # find the fourth Concatenate layer (zero-based index 3) and crop the
    # network right after it:
    cc=0
    for nn, la in enumerate(base_model.layers):
        if type(la) is keras.layers.Concatenate:
            if cc==3:
                x = la.output
                break
            cc+=1
    else:
        # without this guard the code below would fail on an unbound `x`
        raise ValueError(
            "expected at least 4 Concatenate layers in the base model, found %d" % cc)
    # NOTE(review): reassigning `layers` depends on how the installed Keras
    # version stores a model's layers -- confirm before upgrading Keras.
    base_model.layers = base_model.layers[:nn+1]
    #x = [la.output for la in base_model.layers if type(la) is keras.layers.Concatenate][3]

    x = GlobalAveragePooling2D()(x)
    x = Dropout(dropout)(x)
    # optional fully-connected layer
    if ndense>0:
        x = Dense(ndense, activation='relu')(x)
    # output layer: one unit per class
    predictions = Dense(n_classes, activation=final_activation)(x)

    # this is the model we will train
    model = Model(inputs=base_model.input, outputs=predictions)

    # first: freeze the (cropped) convolutional base unless asked otherwise
    if not base_trainable:
        for layer in base_model.layers:
            layer.trainable = False

    # then re-enable training from the second-to-last Concatenate onwards,
    # which includes the newly added head
    last_module_index = [nn for nn,la  in enumerate(model.layers) if type(la) is keras.layers.Concatenate][-2]
    for layer in model.layers[last_module_index:]:
        layer.trainable = True
    return model
 def get_class_weights(datagen_val_output):
    """Derive per-class weights from the label frequencies of a data iterator.

    Reads `datagen_val_output.classes` (one label per sample) and
    `datagen_val_output.directory` (only used in the printed report).
    Every class count is incremented by one, then each class gets the weight
    `largest incremented count / its incremented count`, so the most frequent
    class has weight 1.0.

    Returns a dict mapping class id -> float weight.
    """
    label_counts = Counter(datagen_val_output.classes)
    print("distribution of labels in {}:\n{}".format(datagen_val_output.directory, str(label_counts)))
    # add-one smoothing of the raw counts
    smoothed = {label: count + 1 for label, count in label_counts.items()}
    largest = float(max(smoothed.values()))
    return {label: largest / count for label, count in smoothed.items()}
 def w_categorical_crossentropy(weights):
    """Create a categorical cross-entropy loss scaled by a per-confusion weight.

    `weights` is indexed as `weights[true_class, predicted_class]`, so it
    has to support 2-D indexing (presumably a square NumPy array -- confirm
    against callers). For each sample the plain cross-entropy is multiplied
    by the weight selected by its true class and by the class(es) holding
    the maximum predicted score.

    Returns a callable `(y_true, y_pred)` named 'w_categorical_crossentropy'.
    """
    def _w_categorical_crossentropy(y_true, y_pred, weights):
        nb_cl = len(weights)
        final_mask = K.zeros_like(y_pred[:, 0])
        # boolean matrix marking, per row, the position(s) of the top score
        y_pred_max = K.max(y_pred, axis=1)
        y_pred_max = K.expand_dims(y_pred_max, 1)
        y_pred_max_mat = K.equal(y_pred, y_pred_max)
        for c_p, c_t in product(range(nb_cl), range(nb_cl)):
            final_mask += (K.cast(weights[c_t, c_p],K.floatx()) *
                           K.cast(y_pred_max_mat[:, c_p] ,K.floatx()) *
                           K.cast(y_true[:, c_t],K.floatx())
                          )
        # Keyword arguments on purpose: the positional order of the backend
        # function differs between Keras releases, and passing
        # (y_pred, y_true) positionally swaps target and output on releases
        # that expect (target, output).
        return K.categorical_crossentropy(target=y_true, output=y_pred) * final_mask
    ncce = partial(_w_categorical_crossentropy, weights=weights)
    # partial objects have no __name__; set one explicitly
    ncce.__name__ ='w_categorical_crossentropy'
    return ncce
 if __name__ == '__main__':
    import numpy as np
    import keras
    #csv_path = CHECKPOINTS_BASE + ".log.csv"
    #csv_callback = keras.callbacks.CSVLogger(csv_path, separator=',', append=False)
    os.environ['KERAS_BACKEND'] = 'tensorflow'
    os.environ['CUDA_HOME'] = '/usr/local/cuda-8.0'
    os.environ["CUDA_VISIBLE_DEVICES"] = '2'
    NDENSE=256 #512
    BATCH_SIZE = 128
    NB_EPOCH = 20
    DATA_AUGMENTATION = True
    SEED=0
    CLASS_MODE = 'binary' # 'categorical'
    LOSS = '{}_crossentropy'.format(CLASS_MODE)
    N_CLASSES = 1
    FINAL_ACTIVATION = 'sigmoid'
    LR = 0.0001
    SAMPLEWISE_CENTER = False #True
    TARGET_SIDE = 99
    TARGET_SIZE = [TARGET_SIDE]*2
    BASE_TRAINABLE=False
    CHECKPOINT_DIR = "./modelstate_withx_negloglr{:d}_ndense{:d}_imsize{:d}{}/" .format(
                    int(-np.log10(LR)),
                    NDENSE,
                    TARGET_SIDE,
                    "" if not BASE_TRAINABLE else "_base_trainable"
                    )
    CHECKPOINT_PATH = CHECKPOINT_DIR + 'model.{epoch:02d}-{val_loss:2f}.hdf5'
    WEIGHTFILE = None # "./modelstate_withx_negloglr4_ndense256/model.39-0.060567.hdf5" # None # "./modelstate_withx/model.03-0.067136.hdf5"
    # "modelstate_laplace_inv_weights_2/model.10-0.014968.hdf5" #CHECKPOINT_DIR + "model.10-0.019602.hdf5"
    INIT_EPOCH=0
    # indir = "/data/dlituiev/learn_spotmag_from_images/modelstate/"
    # find_min_loss_checkpoint(indir)
    DATA_TRAIN = '/data/UCSF_MAMMO/2017-07-png/withx_valset_4000_train/'
    DATA_VAL = '/data/UCSF_MAMMO/2017-07-png/withx_valset_4000_test/'
    SAMPLES_PER_EPOCH = get_num_files(DATA_TRAIN)
    STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // BATCH_SIZE
    CLASSES = ["normal", "special"]
    VALIDATION_STEPS = get_num_files(DATA_VAL) // BATCH_SIZE
    print('='*50)
    print("validation steps", VALIDATION_STEPS)
    print("samples per epoch in the train set: %d" % SAMPLES_PER_EPOCH)
    print("steps per epoch in the train set: %d" % STEPS_PER_EPOCH)
    print('='*50)
    #########################################
    os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)
    checkpoint = ModelCheckpoint(CHECKPOINT_PATH, monitor='val_loss', verbose=1,
            save_best_only=False, save_weights_only=False, mode='auto', period=1)
    callbacks_list =[checkpoint]
    #########################################
    model = get_model(n_classes=N_CLASSES,
                      final_activation=FINAL_ACTIVATION,
                      ndense=NDENSE,
                      dropout=0.5,
                      base_trainable=BASE_TRAINABLE)
    #from keras.utils import plot_model
    #plot_model(model, to_file='model.png')
    # No `callbacks` here: `csv_callback` is never defined (its creation is
    # commented out at the top of this block), so referencing it raised
    # NameError. Callbacks are handed to fit_generator via `callbacks_list`.
    model.compile(optimizer=Adam(lr=LR), loss=LOSS, metrics=['accuracy'])
    #########################################
    if WEIGHTFILE:
        print("loading weights from:\t%s" % WEIGHTFILE)
        model.load_weights(WEIGHTFILE)
    print('Using real-time data augmentation.')
    flowfromdir_params = dict(
        #color_mode = "grayscale",
        target_size=TARGET_SIZE,
        batch_size=BATCH_SIZE,
        class_mode=CLASS_MODE,
        classes=CLASSES,
        seed=SEED)
    train_datagen = ImageDataGenerator(
        samplewise_center=SAMPLEWISE_CENTER,
        samplewise_std_normalization=SAMPLEWISE_CENTER,
        featurewise_center=False,
        featurewise_std_normalization=False,
        zca_whitening=False,
        rotation_range=10,
        width_shift_range=0.125,
        height_shift_range=0.125,
        horizontal_flip=True,
        vertical_flip=False)
    val_datagen = ImageDataGenerator()
    datagen_train_output = train_datagen.flow_from_directory(
        DATA_TRAIN, shuffle=True, **flowfromdir_params)
    datagen_val_output = val_datagen.flow_from_directory(
        DATA_VAL, shuffle=False, **flowfromdir_params)
    class_weights = get_class_weights(datagen_val_output)
    model.fit_generator(datagen_train_output,
                          steps_per_epoch=STEPS_PER_EPOCH,
                          epochs=NB_EPOCH, verbose=1,
                          validation_data=datagen_val_output,
                          validation_steps=VALIDATION_STEPS,
                          #class_weight='auto',
                          class_weight=class_weights,
                          callbacks=callbacks_list,
                          initial_epoch=INIT_EPOCH)
    #model.predict()
@@ -0,0 +1,23 @@
 Cython==0.27.3
 h5py==2.7.0
 imgaug==0.2.5
 Keras==2.0.8
 -e git+https://github.com/raghakot/keras-vis@40b27dfa3ecb84cdde5ec6b44251923c3266cc40#egg=keras_vis
 lime==0.1.1.29
 matplotlib==2.0.2
 mudicom==0.1.2
 numpy==1.14.0
 opencv-python==3.3.0.10
 pandas==0.20.2
 Pillow==4.1.1
 pyaml==17.7.2
 -e git+https://github.com/cocodataset/cocoapi/@727b546dd9fa4e4bb113213c98a3925829fac0bf#egg=pycocotools&subdirectory=PythonAPI
 pydicom==0.9.9
 PyYAML==3.12
 scikit-image==0.13.0
 scikit-learn==0.18.1
 scipy==0.19.1
 seaborn==0.7.1
 sklearn==0.0
 tensorflow-gpu==1.4.1
 tensorflow-tensorboard==0.4.0rc3