summaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
authorMike Vink <mike1994vink@gmail.com>2021-05-02 17:33:26 +0200
committerMike Vink <mike1994vink@gmail.com>2021-05-02 17:33:26 +0200
commitbf1adece8aeb48e136085233d2f5ff2f9600eaf5 (patch)
tree6a46b0c7e7fbea6a85c0e44714e0076251e82cac /scripts
parentde4565fe9290ec1f1031eed6f7d067794df53166 (diff)
update
Diffstat (limited to 'scripts')
-rw-r--r--scripts/data_prep.R128
-rw-r--r--scripts/modelling.R39
-rw-r--r--scripts/modelling_eval.R175
-rw-r--r--scripts/modelling_exploration.R443
-rw-r--r--scripts/modelling_results.RDatabin0 -> 19268467 bytes
-rw-r--r--scripts/modelling_results_withrrlda.RDatabin0 -> 1876197 bytes
-rw-r--r--scripts/sam.R49
7 files changed, 821 insertions, 13 deletions
diff --git a/scripts/data_prep.R b/scripts/data_prep.R
index acd9977..30c4d85 100644
--- a/scripts/data_prep.R
+++ b/scripts/data_prep.R
@@ -1,31 +1,133 @@
library(tidyverse)
library(ggpubr)
library(knitr)
+library(mulset)
+library(caret)
data_simon <- read_csv("../csv/simon_data_extra.csv", na = "NULL")
data_mike <- read_csv("../csv/mike_repeat_visit.csv", na = "NULL")
-simon_wide <- data_simon %>%
+simon_agg <- data_simon %>%
+ group_by(donor_id, data_name) %>%
+ summarise(
+ data = mean(data)
+ )
+
+simon_outcome <- data_simon %>%
+ select(donor_id, outcome) %>%
group_by(donor_id) %>%
summarise(
- dup = duplicated(data_name)
- ) %>%
- filter(dup)
-simon_wide
+ outcome = unique(outcome)
+ ) %>%
+ ungroup() %>%
+ arrange(donor_id)
+simon_outcome
+
+
+simon_wide <- simon_agg %>%
+ pivot_wider(
+ id_cols = donor_id,
+ names_from = data_name,
+ values_from = data,
+ ) %>%
+ ungroup() %>%
+ arrange(donor_id) %>%
+ mutate(outcome = as.numeric(simon_outcome$outcome)) %>%
+ select(donor_id, outcome, everything())
+
+simon_nas <- sum(is.na(simon_wide))
+simon_cells <- (dim(simon_wide)[1] * dim(simon_wide)[2])
+simon_sparseness <- sum(is.na(simon_wide)) / (dim(simon_wide)[1] * dim(simon_wide)[2])
dview <- data_simon %>%
group_by(donor_id) %>%
mutate(dup = duplicated(data_name)) %>%
filter(dup) %>%
arrange(data_name) %>%
- filter(donor_id == 285 & data_name == "CD4_pos_T_cells") %>%
- kable(format = "latex", booktabs = TRUE)
+ filter(donor_id == 285) # & data_name == "CD4_pos_T_cells")
+
+mike_agg <- data_mike %>%
+ group_by(donor_id, name_formatted) %>%
+ summarise(
+ data = mean(data)
+ )
-%>%
- rowid_to_column() %>%
+mike_wide <- mike_agg %>%
pivot_wider(
- id_cols = donor_id,
- names_from = data_name,
- values_from = data,
+ id_cols = donor_id,
+ names_from = name_formatted,
+ values_from = data,
)
-simon_wide
+
+mike_nas <- sum(is.na(mike_wide))
+mike_cells <- (dim(mike_wide)[1] * dim(mike_wide)[2])
+mike_sparseness <- sum(is.na(mike_wide)) / (dim(mike_wide)[1] * dim(mike_wide)[2])
+
+simon_mulset <- mulset(simon_wide)
+
+# set1 <- simon_mulset$`1`
+# set1$features_hash
+# set1$feature_count
+# set1$features
+# set1$samples
+# set1$samples_count
+# set1$datapoints
+
+sets <- list()
+count <- 1
+for (set in simon_mulset) {
+ if (set$feature_count >= 5 & set$samples_count >= 15) {
+ sets[[count]] <- simon_wide[set$features] %>%
+ drop_na() %>%
+ select(donor_id, outcome, everything())
+ print(paste("rows:", nrow(sets[[count]]), ", samples by mulset:", set$samples_count))
+ count <- count + 1
+ }
+}
+
+
+sets_partitions <- list()
+set.seed(13121994)
+count <- 1
+for (set in sets) {
+ set$outcome <- as.factor(set$outcome)
+ partitions <- createDataPartition(set$outcome, p = 0.75)
+ training_rows <- partitions$Resample1
+ train <- set[training_rows, ]
+ test <- set[-training_rows, ]
+ if (nrow(test) >= 10) {
+ sets_partitions[[count]] <- list()
+ sets_partitions[[count]][["donors"]] <- set[['donor_id']]
+ sets_partitions[[count]][["train"]] <- train
+ sets_partitions[[count]][["test"]] <- test
+ sets_partitions[[count]][["totalOutcomes"]] <- table(set$outcome)
+ sets_partitions[[count]][["trainingOutcomes"]] <- table(train$outcome)
+ sets_partitions[[count]][["trainingRows"]] <- nrow(train)
+ sets_partitions[[count]][["testOutcomes"]] <- table(test$outcome)
+ sets_partitions[[count]][["testRows"]] <- nrow(test)
+ count <- count + 1
+ }
+}
+
+
+tbl <- tibble(dataset = c(), `Rows x Cols` = c(), `total (low / high)` = c(), `train (low / high)` = c(), `test (low / high)` = c())
+count = 1
+for (set in sets_partitions) {
+ rows <- nrow(set[["train"]]) + nrow(set[["test"]])
+ cols <- ncol(set[["train"]])
+ rowsxcols <- paste(rows, "x", cols)
+ tot <- paste(set[["totalOutcomes"]][1], "/", set[["totalOutcomes"]][2], "(", round((set[["totalOutcomes"]][1]) / rows, 2) , ")")
+ train <- paste(set[["trainingOutcomes"]][1], "/", set[["trainingOutcomes"]][2])
+ test <- paste(set[["testOutcomes"]][1], "/", set[["testOutcomes"]][2])
+ tbl <- tbl %>% add_row(
+ dataset = count,
+ `Rows x Cols` = rowsxcols,
+ `total (low / high)` = tot,
+ `train (low / high)` = train,
+ `test (low / high)` = test
+ )
+ count = count + 1
+}
+tbl %>%
+kable(format = "latex", booktabs = TRUE)
+
diff --git a/scripts/modelling.R b/scripts/modelling.R
new file mode 100644
index 0000000..b1fc746
--- /dev/null
+++ b/scripts/modelling.R
@@ -0,0 +1,39 @@
+library(caret)
+library(tidyverse)
+library(MLeval)
+
+source("./data_prep.R")
+
+data_list <- sets_partitions
+results <- list()
+models <- c("rrlda", "naive_bayes", "rf", "regLogistic")
+fitControl <- trainControl( ## 10-fold CV
+ method = "repeatedcv",
+ number = 10,
+ classProbs = TRUE,
+ savePredictions = TRUE,
+ repeats = 2
+)
+for (model in models) {
+ dataset = 1
+ # for (data in data_list) {
+ for (data in data_list[c(14, 16, 19)]) {
+ print(paste("Training", model, "on dataset", dataset))
+ train <- data[["train"]]
+ X_train <- as.data.frame(train[-c(1, 2)])
+ Y_train <- train[c(2)][[1]]
+ levels(Y_train) <- c("Low", "High")
+ set.seed(13121994)
+ model_trained <- train(
+ X_train,
+ y = Y_train,
+ method = model,
+ trControl = fitControl
+ )
+ results[[model]][[dataset]] <- model_trained
+ dataset = dataset + 1
+ }
+}
+save(results, file="./modelling_results_withrrlda.RData")
+# save(results, file="./modelling_results.RData")
+
diff --git a/scripts/modelling_eval.R b/scripts/modelling_eval.R
new file mode 100644
index 0000000..1c99db7
--- /dev/null
+++ b/scripts/modelling_eval.R
@@ -0,0 +1,175 @@
+library(tidyverse)
+library(caret)
+library(MLeval)
+library(pROC)
+
+
+source("./data_prep.R")
+# load("./modelling_results.RData")
+load("modelling_results_withrrlda.RData")
+
+second_visit_donors <- mike_wide[["donor_id"]]
+second_visit_donors
+
+second_visits <- c()
+results <- results
+data_list <- sets_partitions
+donor_list <- list()
+feature_list <- list()
+count = 1
+for (set in data_list) {
+ donors <- c(set[["train"]][["donor_id"]], set[['test']][['donor_id']])
+ features <- names(set[["train"]])
+ donor_list[[count]] <- donors
+ feature_list[[count]] <- features
+ second_visits <- c(second_visits, length(intersect(donors, second_visit_donors)))
+ count = count + 1
+}
+second_visits
+
+donor_list <- donor_list[c(14, 16, 19)]
+feature_list <- feature_list[c(14, 16, 19)]
+
+donor_intersect <- intersection(donor_list[[1]], donor_list[[2]], donor_list[[3]])
+n_donor_intersect <- length(donor_intersect)
+
+feature_intersect1 <- intersection(feature_list[[1]], feature_list[[2]])
+n_feature_intersect1 <- length(feature_intersect1)
+
+# which(second_visits > 15)
+# which.max(second_visits)
+
+# dataset14_rrlda_eval <- evalm(results[["rrlda"]][[14]])
+# dataset14_nb_eval <- evalm(results[["naive_bayes"]][[14]])
+# dataset14_rf_eval <- evalm(results[["rf"]][[14]])
+# dataset14_reglog_eval <- evalm(results[["regLogistic"]][[14]])
+#
+# dataset16_rrlda_eval <- evalm(results[["rrlda"]][[16]])
+# dataset16_nb_eval <- evalm(results[["naive_bayes"]][[16]])
+# dataset16_rf_eval <- evalm(results[["rf"]][[16]])
+# dataset16_reglog_eval <- evalm(results[["regLogistic"]][[16]])
+#
+# dataset19_rrlda_eval <- evalm(results[["rrlda"]][[19]])
+# dataset19_nb_eval <- evalm(results[["naive_bayes"]][[19]])
+# dataset19_rf_eval <- evalm(results[["rf"]][[19]])
+# dataset19_reglog_eval <- evalm(results[["regLogistic"]][[19]])
+
+dataset1_rrlda_eval <- evalm(results[["rrlda"]][[1]])
+dataset1_nb_eval <- evalm(results[["naive_bayes"]][[1]])
+dataset1_rf_eval <- evalm(results[["rf"]][[1]])
+dataset1_reglog_eval <- evalm(results[["regLogistic"]][[1]])
+
+dataset2_rrlda_eval <- evalm(results[["rrlda"]][[2]])
+dataset2_nb_eval <- evalm(results[["naive_bayes"]][[2]])
+dataset2_rf_eval <- evalm(results[["rf"]][[2]])
+dataset2_reglog_eval <- evalm(results[["regLogistic"]][[2]])
+
+dataset3_rrlda_eval <- evalm(results[["rrlda"]][[3]])
+dataset3_nb_eval <- evalm(results[["naive_bayes"]][[3]])
+dataset3_rf_eval <- evalm(results[["rf"]][[3]])
+dataset3_reglog_eval <- evalm(results[["regLogistic"]][[3]])
+
+
+get_test_auc <- function(dataset, model) {
+ used_datasets <- c(14,16,19)
+ test_data <- data_list[[used_datasets[dataset]]][["test"]]
+ print(test_data)
+ model_res <- results[[model]][[dataset]]
+ preds <- predict(model_res, newdata = test_data[-c(1, 2)])
+ preds <- as.numeric(preds)
+ auc_score <- auc(roc(
+ response = test_data[["outcome"]],
+ predictor = preds,
+ ret = c("roc")
+ ))
+ round(as.numeric(auc_score), 2)
+}
+
+
+
+d1_rrlda_met <- as_tibble(dataset1_rrlda_eval$stdres$`Group 1`, rownames = "Metric") %>%
+ mutate(model = "rrlda")
+d1_nb_met <- as_tibble(dataset1_nb_eval$stdres$`Group 1`, rownames = "Metric") %>%
+ mutate(model = "nb")
+d1_rf_met <- as_tibble(dataset1_rf_eval$stdres$`Group 1`, rownames = "Metric") %>%
+ mutate(model = "rf")
+d1_reglog_met <- as_tibble(dataset1_reglog_eval$stdres$`Group 1`, rownames = "Metric") %>%
+ mutate(model = "reglog")
+d1_met <- bind_rows(list(d1_rrlda_met, d1_nb_met, d1_rf_met, d1_reglog_met)) %>%
+ mutate(
+ `Score (CI)` = paste(Score, "(", CI, ")")
+ ) %>%
+ pivot_wider(
+ id_cols = model,
+ values_from = Score,
+ names_from = Metric
+ ) %>%
+ mutate(
+ `test_AUC` = c(
+ get_test_auc(1, "rrlda"),
+ get_test_auc(1, "naive_bayes"),
+ get_test_auc(1, "rf"),
+ get_test_auc(1, "regLogistic")
+ )
+ ) %>%
+ select(-`AUC-PRG`, -`AUC-PR`, -Informedness) %>%
+ kable(format = "latex", booktabs = TRUE)
+d1_met
+
+d2_rrlda_met <- as_tibble(dataset2_rrlda_eval$stdres$`Group 1`, rownames = "Metric") %>%
+ mutate(model = "rrlda")
+d2_nb_met <- as_tibble(dataset2_nb_eval$stdres$`Group 1`, rownames = "Metric") %>%
+ mutate(model = "nb")
+d2_rf_met <- as_tibble(dataset2_rf_eval$stdres$`Group 1`, rownames = "Metric") %>%
+ mutate(model = "rf")
+d2_reglog_met <- as_tibble(dataset2_reglog_eval$stdres$`Group 1`, rownames = "Metric") %>%
+ mutate(model = "reglog")
+d2_met <- bind_rows(list(d2_rrlda_met, d2_nb_met, d2_rf_met, d2_reglog_met)) %>%
+ mutate(
+ `Score (CI)` = paste(Score, "(", CI, ")")
+ ) %>%
+ pivot_wider(
+ id_cols = model,
+ values_from = Score,
+ names_from = Metric
+ ) %>%
+ mutate(
+ `test_AUC` = c(
+ get_test_auc(2, "rrlda"),
+ get_test_auc(2, "naive_bayes"),
+ get_test_auc(2, "rf"),
+ get_test_auc(2, "regLogistic")
+ )
+ ) %>%
+ select(-`AUC-PRG`, -`AUC-PR`, -Informedness) %>%
+ kable(format = "latex", booktabs = TRUE)
+d2_met
+
+d3_rrlda_met <- as_tibble(dataset3_rrlda_eval$stdres$`Group 1`, rownames = "Metric") %>%
+ mutate(model = "rrlda")
+d3_nb_met <- as_tibble(dataset3_nb_eval$stdres$`Group 1`, rownames = "Metric") %>%
+ mutate(model = "nb")
+d3_rf_met <- as_tibble(dataset3_rf_eval$stdres$`Group 1`, rownames = "Metric") %>%
+ mutate(model = "rf")
+d3_reglog_met <- as_tibble(dataset3_reglog_eval$stdres$`Group 1`, rownames = "Metric") %>%
+ mutate(model = "reglog")
+d3_met <- bind_rows(list(d3_rrlda_met, d3_nb_met, d3_rf_met, d3_reglog_met)) %>%
+ mutate(
+ `Score (CI)` = paste(Score, "(", CI, ")")
+ ) %>%
+ pivot_wider(
+ id_cols = model,
+ values_from = Score,
+ names_from = Metric
+ ) %>%
+ mutate(
+ `test_AUC` = c(
+ get_test_auc(3, "rrlda"),
+ get_test_auc(3, "naive_bayes"),
+ get_test_auc(3, "rf"),
+ get_test_auc(3, "regLogistic")
+ )
+ ) %>%
+ select(-`AUC-PRG`, -`AUC-PR`, -Informedness) %>%
+ kable(format = "latex", booktabs = TRUE)
+d3_met
diff --git a/scripts/modelling_exploration.R b/scripts/modelling_exploration.R
new file mode 100644
index 0000000..762497f
--- /dev/null
+++ b/scripts/modelling_exploration.R
@@ -0,0 +1,443 @@
+library(tidyverse)
+library(caret)
+library(ggpubr)
+library(corrplot)
+library(psych)
+
+load("modelling_results_withrrlda.RData")
+source("data_prep.R")
+source("data_prep_desc.R")
+
+results <- results
+important_features <- list()
+for (model in results) {
+ for (dataset in c(1,2,3)) {
+ model_name <- model[[dataset]]$method
+ # important_features[[model_name]] <- list()
+ important_features[[model_name]][[dataset]] <- varImp(model[[dataset]])
+ }
+}
+
+important_features$naive_bayes[[1]]
+important_features$naive_bayes[[2]]
+
+nb_importance1 <- important_features$naive_bayes[[1]]$importance %>%
+ as_tibble(rownames = 'Feature') %>%
+ mutate(Score = High) %>%
+ select(Feature, Score) %>%
+ filter(Score >= 50) %>%
+ arrange(Score) %>%
+ mutate(Feature = factor(Feature, Feature))
+nb_importance1
+
+nb_importance2 <- important_features$naive_bayes[[2]]$importance %>%
+ as_tibble(rownames = 'Feature') %>%
+ mutate(Score = High) %>%
+ select(Feature, Score) %>%
+ filter(Score >= 50) %>%
+ arrange(Score) %>%
+ mutate(Feature = factor(Feature, Feature))
+nb_importance2
+
+importance_plt1 <- nb_importance1 %>%
+ ggplot(aes(Score, Feature)) +
+ geom_bar(stat="identity", fill = "green4") +
+ theme_pubclean()
+importance_plt1
+
+importance_plt2 <- nb_importance2 %>%
+ ggplot(aes(Score, Feature)) +
+ geom_bar(stat="identity", fill = "green4") +
+ theme_pubclean()
+importance_plt2
+
+
+top3_features1 <- nb_importance1 %>%
+ arrange(desc(Score)) %>%
+ slice(1:3,) %>%
+ select(Feature)
+
+top3_features1 <- as.character(top3_features1[['Feature']])
+
+top3_features2 <- nb_importance2 %>%
+ arrange(desc(Score)) %>%
+ slice(1:3,) %>%
+ select(Feature)
+
+top3_features2 <- as.character(top3_features2[['Feature']])
+
+data1 <- bind_rows(sets_partitions[[14]][['test']], sets_partitions[[14]][['train']]) %>%
+ select(donor_id, outcome, top3_features1) %>%
+ mutate(outcome_dummy = factor(outcome, labels = c("L", "H"))) %>%
+ pivot_longer(
+ cols = top3_features1,
+ names_to = "Feature"
+ ) %>%
+ group_by(outcome, Feature) %>%
+ summarise(
+ Feature = Feature,
+ outcome_dummy = outcome_dummy,
+ donor_id = donor_id,
+ value = value
+ )
+xlabels_1 <- data1 %>%
+ group_map(
+ ~ {
+ rep(paste(.$outcome_dummy[1], "\nn=(", length(unique(.$donor_id)), ")"), nrow(.))
+ }
+ )
+data1 <- data1 %>%
+ ungroup() %>%
+ mutate(label = unlist(xlabels_1))
+
+
+
+f <- function(x, height = 1) {
+ ans <- median(x)
+ data.frame(ymin = ans - height / 2, ymax = ans + height / 2, y = ans)
+}
+
+# 90th percentile values
+plt1_top3 <- data1 %>%
+ ggplot(aes(label, value, fill=outcome)) +
+ geom_violin(aes(color = outcome), alpha = 0.4, show.legend = F) +
+ facet_wrap(~ Feature, scales="free") +
+ geom_boxplot(width = 0.1, show.legend = F) +
+ geom_point(aes(fill = outcome), color = "black", shape = 23, show.legend = F) +
+ stat_summary(
+ fun.data = f, geom = "crossbar",
+ colour = NA, fill = "black", width = 0.3, alpha = 1.0
+ ) +
+ theme_pubclean() +
+ labs(x = "", y = "PBMC 90th percentile value")
+
+data2 <- bind_rows(sets_partitions[[16]][['test']], sets_partitions[[16]][['train']]) %>%
+ select(donor_id, outcome, top3_features2) %>%
+ mutate(outcome_dummy = factor(outcome, labels = c("L", "H"))) %>%
+ pivot_longer(
+ cols = top3_features2,
+ names_to = "Feature"
+ ) %>%
+ group_by(outcome, Feature) %>%
+ summarise(
+ Feature = Feature,
+ outcome_dummy = outcome_dummy,
+ donor_id = donor_id,
+ value = value
+ )
+xlabels_2 <- data2 %>%
+ group_map(
+ ~ {
+ rep(paste(.$outcome_dummy[1], "\nn=(", length(unique(.$donor_id)), ")"), nrow(.))
+ }
+ )
+data2 <- data2 %>%
+ ungroup() %>%
+ mutate(label = unlist(xlabels_2))
+
+
+
+# 90th percentile values
+plt2_top3 <- data2 %>%
+ ggplot(aes(label, value, fill=outcome)) +
+ geom_violin(aes(color = outcome), alpha = 0.4, show.legend = F) +
+ facet_wrap(~ Feature, scales="free") +
+ geom_boxplot(width = 0.1, show.legend = F) +
+ geom_point(aes(fill = outcome), color = "black", shape = 23, show.legend = F) +
+ stat_summary(
+ fun.data = f, geom = "crossbar",
+ colour = NA, fill = "black", width = 0.3, alpha = 1.0
+ ) +
+ theme_pubclean() +
+ labs(x = "", y = "PBMC 90th percentile value")
+plt2_top3
+
+
+############# Second visit data here ##################
+
+## DATASET 2
+
+data2_second <- mike_second_visit %>%
+ ungroup() %>%
+ filter(donor_id %in% data2$donor_id) %>%
+ mutate(
+ Feature = name_formatted,
+ value = data,
+ visit = rep("Second", nrow(.)),
+ outcome = factor(outcome)
+ ) %>%
+ select(donor_id, outcome, Feature, value, visit) %>%
+ bind_rows(data2 %>%
+ mutate(visit = rep("First", nrow(.))) %>%
+ select(donor_id, outcome, Feature, value, visit) %>%
+ filter(donor_id %in% intersection(mike_second_visit$donor_id, .$donor_id))
+ )
+data2_second
+
+data2_second_top3 <- data2_second %>%
+ filter(Feature %in% top3_features2) %>%
+ mutate(outcome_dummy = factor(outcome, labels = c("L", "H")),
+ visit_dummy = visit) %>%
+ group_by(donor_id) %>%
+ arrange(donor_id) %>%
+ filter(1 < length(unique(visit))) %>%
+ ungroup() %>%
+ group_by(visit, outcome)
+data2_second_top3
+
+xlabels_2_second_top3 <- data2_second_top3 %>%
+ group_map(
+ ~ {
+ paste(.$visit_dummy[1], .$outcome_dummy[1], "\nn=(", length(unique(.$donor_id)), ")")
+ }
+ )
+xlabels_2_second_top3
+
+data2_second_top3 <- data2_second_top3 %>%
+ ungroup() %>%
+ mutate(
+ label = if_else(outcome == 1,
+ xlabels_2_second_top3[[2]],
+ if_else(visit == "First",
+ xlabels_2_second_top3[[1]],
+ xlabels_2_second_top3[[3]]
+ )
+ )
+ ) %>%
+ filter(value < 1000 & 0 < value) %>%
+ group_by(donor_id) %>%
+ mutate(changed = 1 < length(unique(outcome)))
+data2_second_top3
+# 6 outliers
+
+# 90th percentile values
+plt2_second_top3 <- data2_second_top3 %>%
+ ggplot(aes(label, value)) +
+ geom_violin(aes(color = outcome, fill = outcome), alpha = 0.4, show.legend = F) +
+ facet_wrap(~ Feature, scales="free") +
+ geom_boxplot(width = 0.1, show.legend = F) +
+ geom_point(aes(fill = outcome, size = changed), color = "black", shape = 23, show.legend = F) +
+ stat_summary(
+ fun.data = f, geom = "crossbar",
+ colour = NA, fill = "black", width = 0.3, alpha = 1.0
+ ) +
+ theme_pubclean() +
+ labs(x = "", y = "PBMC 90th percentile value")
+plt2_second_top3
+
+######################################## dataset 1
+data1_second <- mike_second_visit %>%
+ ungroup() %>%
+ filter(donor_id %in% data1$donor_id) %>%
+ mutate(
+ Feature = name_formatted,
+ value = data,
+ visit = rep("Second", nrow(.)),
+ outcome = factor(outcome)
+ ) %>%
+ select(donor_id, outcome, Feature, value, visit) %>%
+ bind_rows(data1 %>%
+ mutate(visit = rep("First", nrow(.))) %>%
+ select(donor_id, outcome, Feature, value, visit) %>%
+ filter(donor_id %in% intersection(mike_second_visit$donor_id, .$donor_id))
+ )
+data1_second
+
+data1_second_top3 <- data1_second %>%
+ filter(Feature %in% top3_features1) %>%
+ mutate(outcome_dummy = factor(outcome, labels = c("L", "H")),
+ visit_dummy = visit) %>%
+ group_by(donor_id) %>%
+ filter(1 < length(unique(visit))) %>%
+ arrange(donor_id) %>%
+ ungroup() %>%
+ group_by(visit, outcome)
+data1_second_top3
+xlabels_1_second_top3 <- data1_second_top3 %>%
+ group_map(
+ ~ {
+ paste(.$visit_dummy[1], .$outcome_dummy[1], "\nn=(", length(unique(.$donor_id)), ")")
+ }
+ )
+xlabels_1_second_top3
+data1_second_top3 <- data1_second_top3 %>%
+ ungroup() %>%
+ mutate(
+ label = if_else(outcome == 1,
+ xlabels_1_second_top3[[2]],
+ if_else(visit == "First",
+ xlabels_1_second_top3[[1]],
+ xlabels_1_second_top3[[3]]
+ )
+ )
+ ) %>%
+ filter(value < 1000 & 0 < value) %>%
+ group_by(donor_id) %>%
+ mutate(changed = 1 < length(unique(outcome)))
+data1_second_top3
+
+# 90th percentile values
+plt1_second_top3 <- data1_second_top3 %>%
+ ggplot(aes(label, value)) +
+ geom_violin(aes(color = outcome, fill = outcome), alpha = 0.4, show.legend = F) +
+ facet_wrap(~ Feature, scales="free") +
+ geom_boxplot(width = 0.1, show.legend = F) +
+ geom_point(aes(fill = outcome, size = changed), color = "black", shape = 23, show.legend = F) +
+ stat_summary(
+ fun.data = f, geom = "crossbar",
+ colour = NA, fill = "black", width = 0.3, alpha = 1.0
+ ) +
+ theme_pubclean() +
+ labs(x = "", y = "PBMC 90th percentile value")
+plt1_second_top3
+
+
+### Arranging the plot
+
+
+data1_top <- ggarrange(
+ importance_plt1,
+ plt1_top3,
+ widths = c(1,2),
+ labels = c("A", "B")
+)
+data1_bot <- ggarrange(
+ plt1_second_top3,
+ labels = "C"
+ )
+whole_data1 <- ggarrange(
+ data1_top,
+ data1_bot,
+ nrow = 2
+ )
+whole_data1
+ggsave("../images/dataset1_nb_feature_exploration.png", whole_data1, width = 2 * 15, height = 19, dpi=300, units = "cm")
+
+
+data2_top <- ggarrange(
+ importance_plt2,
+ plt2_top3,
+ widths = c(1,2),
+ labels = c("A", "B")
+ )
+data2_bot <- ggarrange(
+ plt2_second_top3,
+ labels = c("C")
+ )
+whole_data2 <- ggarrange(
+ data2_top,
+ data2_bot,
+ nrow = 2
+ )
+whole_data2
+ggsave("../images/dataset2_nb_feature_exploration.png", whole_data2, width = 2 * 15, height = 19, dpi=300, units = "cm")
+
+
+#################### correlations data 1
+
+data1_wide <- bind_rows(sets_partitions[[14]][['test']], sets_partitions[[14]][['train']]) %>%
+ select(-donor_id, -outcome)
+data1_wide
+
+M <- cor(data1_wide)
+# pmat <- cor.mtest(data1_wide)
+pmat <- psych::corr.test(M, method="pearson", adjust="BH")$p
+
+png(file = "../images/cor_dataset1.png",
+ height = 1.5 * 19,
+ width = 2 * 15,
+ res = 300,
+ units = "cm"
+ )
+corrplot(M, order="hclust", p.mat=pmat, sig.level=0.0001, insig="blank")
+dev.off()
+
+#################### correlations data 2
+
+data2_wide <- bind_rows(sets_partitions[[16]][['test']], sets_partitions[[16]][['train']]) %>%
+ select(-donor_id, -outcome)
+data2_wide
+
+M <- cor(data2_wide)
+# pmat <- cor.mtest(data2_wide)
+pmat <- psych::corr.test(M, method="pearson", adjust="none")$p
+
+png(file = "../images/cor_dataset2.png",
+ height = 1.5 * 19,
+ width = 2 * 15,
+ res = 300,
+ units = "cm"
+ )
+corrplot(M, order="hclust", p.mat=pmat, sig.level=0.05, insig="blank")
+dev.off()
+
+#################### difference heatmap data 1
+
+after <- mike_second_visit %>%
+ ungroup() %>%
+ filter(donor_id %in% data1$donor_id) %>%
+ mutate(
+ Feature = name_formatted,
+ value = data,
+ visit = rep("Second", nrow(.)),
+ outcome = factor(outcome)
+ ) %>%
+ select(donor_id, outcome, Feature, value) %>%
+ filter(Feature %in% names(data1_wide)) %>%
+ arrange(donor_id, Feature)
+before <- bind_rows(sets_partitions[[14]][['test']], sets_partitions[[14]][['train']]) %>%
+ pivot_longer(
+ cols = c(3:last_col()),
+ names_to = "Feature"
+ ) %>%
+ select(donor_id, Feature, value) %>%
+ filter(donor_id %in% after$donor_id) %>%
+ arrange(donor_id, Feature)
+diff1 <- after %>%
+ mutate(before_value = before$value) %>%
+ filter(0 < value) %>%
+ mutate(top3 = if_else(Feature %in% top3_features1, 1, if_else(Feature %in% top3_features2, 2, 0))) %>%
+ mutate(value = log2(value / before_value)) %>%
+ mutate(top3 =factor(top3, labels=c("Not in top3", "Top3 dataset 14", "Top3 dataset 16"))) %>%
+ ggplot(aes(reorder(factor(Feature), value, mean), value, fill=top3)) +
+ geom_violin() +
+ theme_pubclean() +
+ theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
+ labs(x = "PBMC features", y = "log2 change", fill = "")
+diff1
+ggsave("../images/second_visit_change1.png", diff1, width = 2 * 15, height = 19, dpi=300, units = "cm")
+
+#################### difference heatmap data 2
+
+after <- mike_second_visit %>%
+ ungroup() %>%
+ filter(donor_id %in% data2$donor_id) %>%
+ mutate(
+ Feature = name_formatted,
+ value = data,
+ visit = rep("Second", nrow(.)),
+ outcome = factor(outcome)
+ ) %>%
+ select(donor_id, outcome, Feature, value) %>%
+ filter(Feature %in% names(data2_wide)) %>%
+ arrange(donor_id, Feature)
+before <- bind_rows(sets_partitions[[16]][['test']], sets_partitions[[16]][['train']]) %>%
+ pivot_longer(
+ cols = c(3:last_col()),
+ names_to = "Feature"
+ ) %>%
+ select(donor_id, Feature, value) %>%
+ filter(donor_id %in% after$donor_id) %>%
+ arrange(donor_id, Feature)
+diff2 <- after %>%
+ mutate(before_value = before$value) %>%
+ filter(0 < value) %>%
+ mutate(top3 = Feature %in% top3_features1) %>%
+ mutate(value = log2(value / before_value)) %>%
+ ggplot(aes(reorder(factor(Feature), value, mean), value, fill=top3)) +
+ geom_violin(show.legend=F) +
+ theme_pubclean() +
+ theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
+ labs(x = "PBMC features", y = "Unsigned log2 change")
+diff2
+ggsave("../images/second_visit_change2.png", diff2, width = 2 * 15, height = 19, dpi=300, units = "cm")
diff --git a/scripts/modelling_results.RData b/scripts/modelling_results.RData
new file mode 100644
index 0000000..0b666c5
--- /dev/null
+++ b/scripts/modelling_results.RData
Binary files differ
diff --git a/scripts/modelling_results_withrrlda.RData b/scripts/modelling_results_withrrlda.RData
new file mode 100644
index 0000000..bd5e6e3
--- /dev/null
+++ b/scripts/modelling_results_withrrlda.RData
Binary files differ
diff --git a/scripts/sam.R b/scripts/sam.R
new file mode 100644
index 0000000..d94794d
--- /dev/null
+++ b/scripts/sam.R
@@ -0,0 +1,49 @@
+library(samr)
+library(tidyverse)
+
+# assumes modelling exploration is loaded in the R session
+data1_sam <- data1 %>%
+ select(donor_id, outcome, Feature, value) %>%
+ mutate(outcome = factor(outcome, labels = c(1, 2))) %>%
+ pivot_wider(
+ names_from = Feature,
+ values_from = value
+ ) %>%
+ select(-donor_id)
+
+data1_sam_y <- data1_sam[['outcome']]
+data1_sam_x <- t(as.data.frame(data1_sam[-1]))
+
+samobj1 <- samr::SAM(
+ data1_sam_x,
+ data1_sam_y,
+ resp.type="Two class unpaired",
+ fdr.output = 0.5,
+ nperms = 1000,
+ genenames = rownames(data1_sam_x)
+)
+samobj1
+
+
+############# DATASET 2
+data2_sam <- data2 %>%
+ select(donor_id, outcome, Feature, value) %>%
+ mutate(outcome = factor(outcome, labels = c(1, 2))) %>%
+ pivot_wider(
+ names_from = Feature,
+ values_from = value
+ ) %>%
+ select(-donor_id)
+
+data2_sam_y <- data2_sam[['outcome']]
+data2_sam_x <- t(as.data.frame(data2_sam[-1]))
+
+samobj2 <- samr::SAM(
+ data2_sam_x,
+ data2_sam_y,
+ resp.type="Two class unpaired",
+ fdr.output = 0.01,
+ nperms = 1000,
+ genenames = rownames(data2_sam_x)
+)
+samobj2