Files
mammoviews/caret_on_headers_nona.R
Dmytro S Lituiev 0fede818d7 initial
2018-10-12 18:02:34 -07:00

171 lines
5.5 KiB
R

# coding: utf-8
############################################################################
# stratify by BT column: those are 100% sure digital, others can be either
############################################################################
rm(list=ls())
setwd(dir = "~/repos/mammo/learn_spotmag_from_dicom_headers")
#cell#
library(caret)
library(data.table)
library(pROC)
# install.packages(c("pROC"))
library(ggplot2)
library(fastmatch)
read.gz <- function(filename, ...){
as.data.frame(fread(paste("zcat < ",filename),
header=TRUE, fill = TRUE, ...))
}
fn_ids = "../tables/2017-06-mammo_tables/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl.csv.gz"
ids = read.gz(fn_ids, select="id")$id
fn_features = "../tables/mammo_dicom_headers/df_all_mammos_dicom_headers_selected_nona.tab.gz"
dffeatures = read.gz(fn_features, sep='\t')
# rownames(dffeatures) <- dffeatures$filename
print(nrow(dffeatures))
print(length(ids))
dffeatures <- dffeatures[fmatch(unique(ids), dffeatures$filename),]
dffeatures <- dffeatures[!is.na(dffeatures$filename),]
rm(ids)
collist = c("BodyPartThickness", "XRayTubeCurrentInuA", "ContentTime",
"DetectorTemperature", "WindowCenter", "FieldOfViewRotation")
for (cc in collist){
dffeatures[,cc] <- as.numeric(dffeatures[,cc])
}
# (head(as.numeric(dffeatures$BodyPartThickness)))
dtypes = sapply(dffeatures, class)
row.names(dffeatures) = dffeatures$filename
excludeCols <- c("filename",
"CollimatorLeftVerticalEdge",
"CollimatorLowerHorizontalEdge",
"DistanceSourceToEntrance",
"ExposuresOnDetectorSinceLastCalibration",
"ExposuresOnDetectorSinceManufactured",
"ShutterLowerHorizontalEdge",
"ShutterRightVerticalEdge",
"XRayTubeCurrentInuA"
# "ManufacturerModelName"
)
dffeatures <- (dffeatures[, !(colnames(dffeatures) %in% excludeCols)])
catcols <- c('ViewModifierCodeMeaning',
'ViewCodeValue',
'DetectorActiveDimensionsMissing',
'FieldOfViewOriginMissing',
'Grid',
'Manufacturer',
'ManufacturerModelName')
for (cc in catcols){
dffeatures[,cc] = paste0("=", dffeatures[,cc])
dffeatures[,cc] = as.factor(dffeatures[,cc])
}
dffeatures[,"HighBit"] <- as.numeric(dffeatures[,"HighBit"])
colSums(sapply(dffeatures, is.na))
# Read labels ---------------------------------
fn.labelledset = "../tables/spotmag_predictions/train_test_split-2018-02-15-within7e5.csv"
# filelist.labelled = read.table(fn.labelledset, )
df.labelled = as.data.frame(fread(fn.labelledset))
rownames(df.labelled) <- df.labelled$id
vec.labelled = df.labelled$id
df.labelled$label <- as.factor(df.labelled$label)
#cell#
vec.labelled.valset = rownames(df.labelled[df.labelled$set == 'val',])
vec.labelled.tr_set = rownames(df.labelled[df.labelled$set == 'train',])
vec.labelled.ts_set = rownames(df.labelled[df.labelled$set == 'test',])
############################################################
dffeatures.labelled <- dffeatures[vec.labelled,]
dffeatures.labelled$label <- df.labelled$label
dffeatures.labelled.devset <- dffeatures.labelled[!(rownames(dffeatures.labelled) %in% vec.labelled.valset),]
dffeatures.labelled.tr_set <- dffeatures.labelled[vec.labelled.tr_set,]
dffeatures.labelled.ts_set <- dffeatures.labelled[vec.labelled.ts_set,]
table(dffeatures.labelled.tr_set$label)
goodrows <- 1 - colSums(sapply(dffeatures.labelled.tr_set, is.na)) / nrow(dffeatures.labelled.tr_set)
names(goodrows[goodrows<0.1])
for (cc in colnames(dffeatures.labelled.tr_set)){
if (is.factor(dffeatures.labelled.tr_set[,cc]) ){
setdiff_ = setdiff(dffeatures.labelled.ts_set[,cc], dffeatures.labelled.tr_set[,cc])
if (length(setdiff_)>0){
print(cc)
print(setdiff_)
}
}
}
# GLMNET ---------------------------------------------------------------------
library(glmnet)
# Using glmnet to directly perform CV
set.seed(0)
x_train <- model.matrix( ~ .-1, dffeatures.labelled.tr_set[,!(colnames(dffeatures.labelled.tr_set) %in% c("label"))])
dim(x_train)
cvob1=cv.glmnet(x=x_train,
y=dffeatures.labelled.tr_set[,"label"],
family="binomial",alpha=1,
type.measure="auc", nfolds = 5, lambda = seq(0.001,0.1,by = 0.001),
standardize=FALSE)
plot(cvob1)
control <- trainControl(method="cv", number=5, returnResamp="all",
classProbs=TRUE, summaryFunction=twoClassSummary)
#classProbs = TRUE
tuneGrid <- expand.grid(alpha=c(0.00, 0.25, 0.50, 0.75, 0.99, 1.00), lambda = 10^seq(-5,-2,0.5))
tune = list()
fits = list()
rocs = list()
for (ii in 1:5){
glmnetFit <- train(label ~ ., data = dffeatures.labelled.tr_set,
method = "glmnet",
na.action = na.pass,
tuneGrid=tuneGrid,
metric = "ROC",
trControl = control)
fits[[ii]] <- glmnetFit
tune[[ii]] <- glmnetFit$bestTune
rocs[[ii]] <- max(glmnetFit$results$ROC)
}
tune
varImp(glmnetFit, scale=T)
as.data.frame(glmnetFit$bestTune)
saveRDS(glmnetFit, sprintf("glmnet.rds", Sys.Date()))
## Save predictions ---------------------------------------------------------
dffeatures[,"predictions_glmnet"] = predict(glmnetFit, newdata = dffeatures, type = "prob", na.action = na.pass)$special
write.table(dffeatures[,c("predictions_glmnet"), drop=F],
file="all_predictions_glmnet.tab", quote=F, sep='\t')