mirror of
https://github.com/wassname/mammoviews.git
synced 2026-06-27 16:10:25 +08:00
171 lines
5.5 KiB
R
171 lines
5.5 KiB
R
# coding: utf-8
|
|
############################################################################
|
|
# stratify by BT column: those are 100% sure digital, others can be either
|
|
############################################################################
|
|
rm(list=ls())
|
|
setwd(dir = "~/repos/mammo/learn_spotmag_from_dicom_headers")
|
|
#cell#
|
|
library(caret)
|
|
library(data.table)
|
|
|
|
library(pROC)
|
|
# install.packages(c("pROC"))
|
|
library(ggplot2)
|
|
library(fastmatch)
|
|
|
|
read.gz <- function(filename, ...){
|
|
as.data.frame(fread(paste("zcat < ",filename),
|
|
header=TRUE, fill = TRUE, ...))
|
|
}
|
|
|
|
|
|
fn_ids = "../tables/2017-06-mammo_tables/df_dcm_reports_birads_path_indic_dens_birad_wi_year_noreport_nodupl.csv.gz"
|
|
ids = read.gz(fn_ids, select="id")$id
|
|
|
|
fn_features = "../tables/mammo_dicom_headers/df_all_mammos_dicom_headers_selected_nona.tab.gz"
|
|
dffeatures = read.gz(fn_features, sep='\t')
|
|
|
|
# rownames(dffeatures) <- dffeatures$filename
|
|
print(nrow(dffeatures))
|
|
print(length(ids))
|
|
|
|
dffeatures <- dffeatures[fmatch(unique(ids), dffeatures$filename),]
|
|
dffeatures <- dffeatures[!is.na(dffeatures$filename),]
|
|
|
|
rm(ids)
|
|
|
|
collist = c("BodyPartThickness", "XRayTubeCurrentInuA", "ContentTime",
|
|
"DetectorTemperature", "WindowCenter", "FieldOfViewRotation")
|
|
for (cc in collist){
|
|
dffeatures[,cc] <- as.numeric(dffeatures[,cc])
|
|
}
|
|
|
|
|
|
|
|
# (head(as.numeric(dffeatures$BodyPartThickness)))
|
|
dtypes = sapply(dffeatures, class)
|
|
|
|
row.names(dffeatures) = dffeatures$filename
|
|
excludeCols <- c("filename",
|
|
"CollimatorLeftVerticalEdge",
|
|
"CollimatorLowerHorizontalEdge",
|
|
"DistanceSourceToEntrance",
|
|
"ExposuresOnDetectorSinceLastCalibration",
|
|
"ExposuresOnDetectorSinceManufactured",
|
|
"ShutterLowerHorizontalEdge",
|
|
"ShutterRightVerticalEdge",
|
|
"XRayTubeCurrentInuA"
|
|
# "ManufacturerModelName"
|
|
)
|
|
dffeatures <- (dffeatures[, !(colnames(dffeatures) %in% excludeCols)])
|
|
|
|
|
|
catcols <- c('ViewModifierCodeMeaning',
|
|
'ViewCodeValue',
|
|
'DetectorActiveDimensionsMissing',
|
|
'FieldOfViewOriginMissing',
|
|
'Grid',
|
|
'Manufacturer',
|
|
'ManufacturerModelName')
|
|
|
|
for (cc in catcols){
|
|
dffeatures[,cc] = paste0("=", dffeatures[,cc])
|
|
dffeatures[,cc] = as.factor(dffeatures[,cc])
|
|
}
|
|
|
|
dffeatures[,"HighBit"] <- as.numeric(dffeatures[,"HighBit"])
|
|
|
|
colSums(sapply(dffeatures, is.na))
|
|
|
|
# Read labels ---------------------------------
|
|
|
|
fn.labelledset = "../tables/spotmag_predictions/train_test_split-2018-02-15-within7e5.csv"
|
|
# filelist.labelled = read.table(fn.labelledset, )
|
|
df.labelled = as.data.frame(fread(fn.labelledset))
|
|
rownames(df.labelled) <- df.labelled$id
|
|
vec.labelled = df.labelled$id
|
|
df.labelled$label <- as.factor(df.labelled$label)
|
|
|
|
#cell#
|
|
|
|
vec.labelled.valset = rownames(df.labelled[df.labelled$set == 'val',])
|
|
vec.labelled.tr_set = rownames(df.labelled[df.labelled$set == 'train',])
|
|
vec.labelled.ts_set = rownames(df.labelled[df.labelled$set == 'test',])
|
|
############################################################
|
|
dffeatures.labelled <- dffeatures[vec.labelled,]
|
|
dffeatures.labelled$label <- df.labelled$label
|
|
|
|
dffeatures.labelled.devset <- dffeatures.labelled[!(rownames(dffeatures.labelled) %in% vec.labelled.valset),]
|
|
dffeatures.labelled.tr_set <- dffeatures.labelled[vec.labelled.tr_set,]
|
|
dffeatures.labelled.ts_set <- dffeatures.labelled[vec.labelled.ts_set,]
|
|
|
|
table(dffeatures.labelled.tr_set$label)
|
|
|
|
|
|
goodrows <- 1 - colSums(sapply(dffeatures.labelled.tr_set, is.na)) / nrow(dffeatures.labelled.tr_set)
|
|
|
|
names(goodrows[goodrows<0.1])
|
|
|
|
|
|
for (cc in colnames(dffeatures.labelled.tr_set)){
|
|
if (is.factor(dffeatures.labelled.tr_set[,cc]) ){
|
|
setdiff_ = setdiff(dffeatures.labelled.ts_set[,cc], dffeatures.labelled.tr_set[,cc])
|
|
if (length(setdiff_)>0){
|
|
print(cc)
|
|
print(setdiff_)
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
# GLMNET ---------------------------------------------------------------------
|
|
|
|
library(glmnet)
|
|
# Using glmnet to directly perform CV
|
|
set.seed(0)
|
|
|
|
x_train <- model.matrix( ~ .-1, dffeatures.labelled.tr_set[,!(colnames(dffeatures.labelled.tr_set) %in% c("label"))])
|
|
dim(x_train)
|
|
|
|
cvob1=cv.glmnet(x=x_train,
|
|
y=dffeatures.labelled.tr_set[,"label"],
|
|
family="binomial",alpha=1,
|
|
type.measure="auc", nfolds = 5, lambda = seq(0.001,0.1,by = 0.001),
|
|
standardize=FALSE)
|
|
plot(cvob1)
|
|
|
|
control <- trainControl(method="cv", number=5, returnResamp="all",
|
|
classProbs=TRUE, summaryFunction=twoClassSummary)
|
|
#classProbs = TRUE
|
|
|
|
tuneGrid <- expand.grid(alpha=c(0.00, 0.25, 0.50, 0.75, 0.99, 1.00), lambda = 10^seq(-5,-2,0.5))
|
|
tune = list()
|
|
fits = list()
|
|
rocs = list()
|
|
for (ii in 1:5){
|
|
glmnetFit <- train(label ~ ., data = dffeatures.labelled.tr_set,
|
|
method = "glmnet",
|
|
na.action = na.pass,
|
|
tuneGrid=tuneGrid,
|
|
metric = "ROC",
|
|
trControl = control)
|
|
fits[[ii]] <- glmnetFit
|
|
tune[[ii]] <- glmnetFit$bestTune
|
|
rocs[[ii]] <- max(glmnetFit$results$ROC)
|
|
}
|
|
|
|
tune
|
|
|
|
varImp(glmnetFit, scale=T)
|
|
as.data.frame(glmnetFit$bestTune)
|
|
|
|
saveRDS(glmnetFit, sprintf("glmnet.rds", Sys.Date()))
|
|
|
|
## Save predictions ---------------------------------------------------------
|
|
|
|
dffeatures[,"predictions_glmnet"] = predict(glmnetFit, newdata = dffeatures, type = "prob", na.action = na.pass)$special
|
|
|
|
write.table(dffeatures[,c("predictions_glmnet"), drop=F],
|
|
file="all_predictions_glmnet.tab", quote=F, sep='\t')
|
|
|