Files
mammoviews/caret_on_headers.R
T
Dmytro S Lituiev 0fede818d7 initial
2018-10-12 18:02:34 -07:00

285 lines
9.9 KiB
R

# coding: utf-8
rm(list=ls())
library(caret)
library(gbm3)
library(data.table)
library(ggplot2)
library(fastmatch)
read.gz <- function(filename, ...){
as.data.frame(fread(paste("zcat < ",filename),
header=TRUE, fill = TRUE, ...))
}
TABLEDIR = "../tables/"
fn_ids = paste(TABLEDIR,
"2017-06-mammo_tables/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl.csv.gz", sep='/')
ids = read.gz(fn_ids, select="id")$id
fn_features = paste(TABLEDIR, "mammo_dicom_headers/df_all_mammos_dicom_headers_selected_expanded.tab.gz", sep='/')
dffeatures = read.gz(fn_features, sep='\t')
print(nrow(dffeatures))
print(length(ids))
dffeatures <- dffeatures[fmatch(unique(ids), dffeatures$filename),]
dffeatures <- dffeatures[!is.na(dffeatures$filename),]
rm(ids)
# Data formatting -----------------------------------------
collist = c("BodyPartThickness", "XRayTubeCurrentInuA", "ContentTime",
"DetectorTemperature", "WindowCenter", "FieldOfViewRotation")
for (cc in collist){
dffeatures[,cc] <- as.numeric(dffeatures[,cc])
}
dtypes = sapply(dffeatures, class)
names(dtypes[dtypes == 'character'])
row.names(dffeatures) = dffeatures$filename
excludeCols <- c("filename",
"CollimatorLeftVerticalEdge",
"CollimatorLowerHorizontalEdge",
"DistanceSourceToEntrance",
"ExposuresOnDetectorSinceLastCalibration",
"ExposuresOnDetectorSinceManufactured",
"ShutterLowerHorizontalEdge",
"ShutterRightVerticalEdge",
"XRayTubeCurrentInuA"
# "ManufacturerModelName"
)
dffeatures <- (dffeatures[, !(colnames(dffeatures) %in% excludeCols)])
catcols <- c('ViewModifierCodeMeaning',
'ViewCodeValue',
'DetectorActiveDimensionsMissing',
'FieldOfViewOriginMissing',
'Grid',
'Manufacturer',
'ManufacturerModelName')
for (cc in catcols){
dffeatures[,cc] = as.factor(dffeatures[,cc])
}
#cell#
colSums(sapply(dffeatures, is.na))
# Read labels --------------------------------
fn.labelledset = paste(TABLEDIR, "spotmag_predictions/train_test_split-2018-02-15-within7e5.csv", sep='/')
# filelist.labelled = read.table(fn.labelledset, )
df.labelled = as.data.frame(fread(fn.labelledset))
rownames(df.labelled) <- df.labelled$id
vec.labelled = df.labelled$id
df.labelled$label <- as.factor(df.labelled$label)
#cell#
vec.labelled.valset = rownames(df.labelled[df.labelled$set == 'val',])
vec.labelled.tr_set = rownames(df.labelled[df.labelled$set == 'train',])
vec.labelled.ts_set = rownames(df.labelled[df.labelled$set == 'test',])
############################################################
dffeatures.labelled <- dffeatures[vec.labelled,]
dffeatures.labelled$label <- df.labelled$label
#cell#
dffeatures.labelled.devset <- dffeatures.labelled[!(rownames(dffeatures.labelled) %in% vec.labelled.valset),]
dffeatures.labelled.tr_set <- dffeatures.labelled[vec.labelled.tr_set,]
dffeatures.labelled.ts_set <- dffeatures.labelled[vec.labelled.ts_set,]
colnames(dffeatures.labelled.tr_set)
for (cc in colnames(dffeatures.labelled.tr_set)){
if (is.factor(dffeatures.labelled.tr_set[,cc]) ){
setdiff_ = setdiff(dffeatures.labelled.ts_set[,cc], dffeatures.labelled.tr_set[,cc])
if (length(setdiff_)>0){
print(cc)
print(setdiff_)
}
}
}
# GBM3 ----------------------------------------
par_detail <- gbmParallel(num_threads = 4) # Pass to par_details in gbmt
gbmt_fit <- gbmt(label ~ .,
data = dffeatures.labelled.tr_set,
cv_folds = 10,
# training_params = training_params(num_trees = 100,
# interaction_depth = 1,
# min_num_obs_in_node = 10,
# shrinkage = 0.005,
# bag_fraction = 0.5,
# num_features = 2),
keep_gbm_data = TRUE,
par_detail=par_detail)
best_iter_cv <- gbmt_performance(gbmt_fit, method='cv')
plot(best_iter_cv)
best.iter.oob <- gbmt_performance(gbmt_fit,method="OOB") # returns out-of-bag estimated best number of trees
plot(best.iter.oob)
saveRDS(gbmt_fit, sprintf("gbm3_ntrees_%d_%s.rds", best_iter_cv, Sys.Date()))
## Feature Importance Plotting ----------------
infl_gbmt <- (as.data.frame(relative_influence(gbmt_fit, best_iter_cv, rescale=T)))
colnames(infl_gbmt) <- "relative influence"
infl_gbmt[,"variable"] <- rownames(infl_gbmt)
infl_gbmt = infl_gbmt[infl_gbmt$`relative influence` >0,]
plimp <- ggplot(data=infl_gbmt) +
geom_segment(size=5, colour='blue') +
aes(x=reorder(variable,`relative influence`),
xend = variable,
y = 2e-6,
yend=`relative influence`,
label=`relative influence`) +
scale_y_log10() +
# coord_cartesian(ylim= c(0.8e-6, 1.05)) +
ylab("relative influence") + xlab("") +
coord_flip() +
theme(axis.text.y = element_text(colour="black",size=16,angle=0,face="plain"),
axis.text.x = element_text(colour="black",size=16,angle=0,face="plain"),
axis.title.x = element_text(colour="black",size=16,angle=0,face="plain"),
# panel.background = element_rect(fill = "transparent"), # bg of the panel
#plot.background = element_rect(fill = "transparent"), # bg of the plot
# panel.grid.major = element_blank(), # get rid of major grid
# , panel.grid.minor = element_blank() # get rid of minor grid
, legend.background = element_rect(fill = "transparent") # get rid of legend bg
, legend.box.background = element_rect(fill = "transparent") # get rid of legend panel bg
)
plimp + coord_trans(limy= c(0.5e-6, 1.05)) + coord_flip()
plimp + ggsave("img/xgbt_importances.eps", device = 'eps', bg = "transparent",
width = 8, height = 6, dpi = 300, units = "in" )
plimp + ggsave("img/xgbt_importances.png", device = 'png', bg = "transparent",
width = 8, height = 6, dpi = 300, units = "in" )
dffeatures[,"predictions_gbmt"] = predict(gbmt_fit, newdata = dffeatures,
n.trees = best_iter_cv,
type = "response", na.action = na.pass)
# GBM-CARET ---------------------------------------------------
control <- trainControl(method = "cv",
number = 10,
p =.8,
savePredictions = TRUE,
classProbs = TRUE,
summaryFunction = twoClassSummary)
tuneGrid <- expand.grid(n.trees = c(80,100,120,140,160),
shrinkage=c(0.025, 0.05, 0.1, 0.2),
interaction.depth = c(1,2),
n.minobsinnode = c(10, 15))
gbmFit1 <- train(label ~ .,
data = dffeatures.labelled.tr_set,
method = "gbm",
na.action = na.pass,
tuneGrid=tuneGrid,
## This last option is actually one
## for gbm() that passes through
metric = "ROC",
trControl = control,
# importance = TRUE,
verbose = FALSE)
gbmFit1
## Feature Importance Plotting ---------------------------------------------
gbmsmmry <- summary(gbmFit1, normalize=T, plotit=F)
gbmsmmry <- gbmsmmry[gbmsmmry$rel.inf>0,]
ggplot(data=gbmsmmry) +
geom_segment(size=3, colour='red') +
aes(x=reorder(var,rel.inf, sum),
xend = var,
y = 0.002,
yend=(rel.inf),
label=rel.inf) +
scale_y_log10() +
ylab("relative influence") + xlab("") +
coord_flip()
saveRDS(gbmFit1, "gbm_ntrees80_interactiondepth2_shrinkage0.2_nminobsinnode15_trainset_2018-02-18.rds")
dffeatures[,"predictions_gbm"] = predict(gbmFit1, newdata = dffeatures, type = "prob", na.action = na.pass)$special
# RPART -----------------------------------------------------------------
tuneGrid <- expand.grid(cp=c(0.0, 0.0125, 0.025, 0.05, 0.1, 0.2))
rpartFit1 <- train(label ~ ., data = dffeatures.labelled.tr_set,
method = "rpart",
na.action = na.pass,
tuneGrid=tuneGrid,
## This last option is actually one
## for gbm() that passes through
metric = "ROC",
trControl = control
)
varImp(rpartFit1)
predictions.ts_set = predict(rpartFit1,
newdata = dffeatures.labelled.ts_set,
type='prob', na.action = na.pass)
dffeatures[,"predictions_rpart"] = predict(rpartFit1, newdata = dffeatures, type = "prob", na.action = na.pass)$special
# XGB ---------------------------------------------------------------------
control <- trainControl(method="cv", number=10)
#classProbs = TRUE
#tuneGrid <- expand.grid(cp=c(0.0, 0.0125, 0.025, 0.05, 0.1, 0.2))
xgbFit <- train(label ~ ., data = dffeatures.labelled.tr_set,
method = "xgbTree",
na.action = na.pass,
#tuneGrid=tuneGrid,
metric = "Accuracy",
trControl = control)
varImp(xgbFit, scale=T)
as.data.frame(xgbFit$finalModel$params)
xgbFit$bestTune
saveRDS(xgbFit, sprintf("xgbtree_maxdepth1_subsample1_eta0.3_%s.rds", Sys.Date()))
predictions.ts_set = predict(xgbFit,
newdata = dffeatures.labelled.ts_set,
type='prob', na.action = na.pass)
## Save all predictions ---------------------------------------------------------
dffeatures[,"predictions_xgb"] = predict(xgbFit, newdata = dffeatures, type = "prob", na.action = na.pass)$special
write.table(dffeatures[, c(grep('prediction',colnames(dffeatures), value=T),
"ViewModifierCodeMeaning", "ViewCodeValue")],
file = "all_predictions_allmodels_trained_on_train.tab", quote=F, sep='\t')