summaryrefslogtreecommitdiff
path: root/scripts/modelling_exploration.R
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/modelling_exploration.R')
-rw-r--r--scripts/modelling_exploration.R443
1 files changed, 443 insertions, 0 deletions
diff --git a/scripts/modelling_exploration.R b/scripts/modelling_exploration.R
new file mode 100644
index 0000000..762497f
--- /dev/null
+++ b/scripts/modelling_exploration.R
@@ -0,0 +1,443 @@
+library(tidyverse)
+library(caret)
+library(ggpubr)
+library(corrplot)
+library(psych)
+
+load("modelling_results_withrrlda.RData")
+source("data_prep.R")
+source("data_prep_desc.R")
+
+results <- results
+important_features <- list()
+for (model in results) {
+ for (dataset in c(1,2,3)) {
+ model_name <- model[[dataset]]$method
+ # important_features[[model_name]] <- list()
+ important_features[[model_name]][[dataset]] <- varImp(model[[dataset]])
+ }
+}
+
+important_features$naive_bayes[[1]]
+important_features$naive_bayes[[2]]
+
+nb_importance1 <- important_features$naive_bayes[[1]]$importance %>%
+ as_tibble(rownames = 'Feature') %>%
+ mutate(Score = High) %>%
+ select(Feature, Score) %>%
+ filter(Score >= 50) %>%
+ arrange(Score) %>%
+ mutate(Feature = factor(Feature, Feature))
+nb_importance1
+
+nb_importance2 <- important_features$naive_bayes[[2]]$importance %>%
+ as_tibble(rownames = 'Feature') %>%
+ mutate(Score = High) %>%
+ select(Feature, Score) %>%
+ filter(Score >= 50) %>%
+ arrange(Score) %>%
+ mutate(Feature = factor(Feature, Feature))
+nb_importance2
+
+importance_plt1 <- nb_importance1 %>%
+ ggplot(aes(Score, Feature)) +
+ geom_bar(stat="identity", fill = "green4") +
+ theme_pubclean()
+importance_plt1
+
+importance_plt2 <- nb_importance2 %>%
+ ggplot(aes(Score, Feature)) +
+ geom_bar(stat="identity", fill = "green4") +
+ theme_pubclean()
+importance_plt2
+
+
+top3_features1 <- nb_importance1 %>%
+ arrange(desc(Score)) %>%
+ slice(1:3,) %>%
+ select(Feature)
+
+top3_features1 <- as.character(top3_features1[['Feature']])
+
+top3_features2 <- nb_importance2 %>%
+ arrange(desc(Score)) %>%
+ slice(1:3,) %>%
+ select(Feature)
+
+top3_features2 <- as.character(top3_features2[['Feature']])
+
+data1 <- bind_rows(sets_partitions[[14]][['test']], sets_partitions[[14]][['train']]) %>%
+ select(donor_id, outcome, top3_features1) %>%
+ mutate(outcome_dummy = factor(outcome, labels = c("L", "H"))) %>%
+ pivot_longer(
+ cols = top3_features1,
+ names_to = "Feature"
+ ) %>%
+ group_by(outcome, Feature) %>%
+ summarise(
+ Feature = Feature,
+ outcome_dummy = outcome_dummy,
+ donor_id = donor_id,
+ value = value
+ )
+xlabels_1 <- data1 %>%
+ group_map(
+ ~ {
+ rep(paste(.$outcome_dummy[1], "\nn=(", length(unique(.$donor_id)), ")"), nrow(.))
+ }
+ )
+data1 <- data1 %>%
+ ungroup() %>%
+ mutate(label = unlist(xlabels_1))
+
+
+
+f <- function(x, height = 1) {
+ ans <- median(x)
+ data.frame(ymin = ans - height / 2, ymax = ans + height / 2, y = ans)
+}
+
+# 90th percentile values
+plt1_top3 <- data1 %>%
+ ggplot(aes(label, value, fill=outcome)) +
+ geom_violin(aes(color = outcome), alpha = 0.4, show.legend = F) +
+ facet_wrap(~ Feature, scales="free") +
+ geom_boxplot(width = 0.1, show.legend = F) +
+ geom_point(aes(fill = outcome), color = "black", shape = 23, show.legend = F) +
+ stat_summary(
+ fun.data = f, geom = "crossbar",
+ colour = NA, fill = "black", width = 0.3, alpha = 1.0
+ ) +
+ theme_pubclean() +
+ labs(x = "", y = "PBMC 90th percentile value")
+
+data2 <- bind_rows(sets_partitions[[16]][['test']], sets_partitions[[16]][['train']]) %>%
+ select(donor_id, outcome, top3_features2) %>%
+ mutate(outcome_dummy = factor(outcome, labels = c("L", "H"))) %>%
+ pivot_longer(
+ cols = top3_features2,
+ names_to = "Feature"
+ ) %>%
+ group_by(outcome, Feature) %>%
+ summarise(
+ Feature = Feature,
+ outcome_dummy = outcome_dummy,
+ donor_id = donor_id,
+ value = value
+ )
+xlabels_2 <- data2 %>%
+ group_map(
+ ~ {
+ rep(paste(.$outcome_dummy[1], "\nn=(", length(unique(.$donor_id)), ")"), nrow(.))
+ }
+ )
+data2 <- data2 %>%
+ ungroup() %>%
+ mutate(label = unlist(xlabels_2))
+
+
+
+# 90th percentile values
+plt2_top3 <- data2 %>%
+ ggplot(aes(label, value, fill=outcome)) +
+ geom_violin(aes(color = outcome), alpha = 0.4, show.legend = F) +
+ facet_wrap(~ Feature, scales="free") +
+ geom_boxplot(width = 0.1, show.legend = F) +
+ geom_point(aes(fill = outcome), color = "black", shape = 23, show.legend = F) +
+ stat_summary(
+ fun.data = f, geom = "crossbar",
+ colour = NA, fill = "black", width = 0.3, alpha = 1.0
+ ) +
+ theme_pubclean() +
+ labs(x = "", y = "PBMC 90th percentile value")
+plt2_top3
+
+
+############# Second visit data here ##################
+
+## DATASET 2
+
+data2_second <- mike_second_visit %>%
+ ungroup() %>%
+ filter(donor_id %in% data2$donor_id) %>%
+ mutate(
+ Feature = name_formatted,
+ value = data,
+ visit = rep("Second", nrow(.)),
+ outcome = factor(outcome)
+ ) %>%
+ select(donor_id, outcome, Feature, value, visit) %>%
+ bind_rows(data2 %>%
+ mutate(visit = rep("First", nrow(.))) %>%
+ select(donor_id, outcome, Feature, value, visit) %>%
+ filter(donor_id %in% intersection(mike_second_visit$donor_id, .$donor_id))
+ )
+data2_second
+
+data2_second_top3 <- data2_second %>%
+ filter(Feature %in% top3_features2) %>%
+ mutate(outcome_dummy = factor(outcome, labels = c("L", "H")),
+ visit_dummy = visit) %>%
+ group_by(donor_id) %>%
+ arrange(donor_id) %>%
+ filter(1 < length(unique(visit))) %>%
+ ungroup() %>%
+ group_by(visit, outcome)
+data2_second_top3
+
+xlabels_2_second_top3 <- data2_second_top3 %>%
+ group_map(
+ ~ {
+ paste(.$visit_dummy[1], .$outcome_dummy[1], "\nn=(", length(unique(.$donor_id)), ")")
+ }
+ )
+xlabels_2_second_top3
+
+data2_second_top3 <- data2_second_top3 %>%
+ ungroup() %>%
+ mutate(
+ label = if_else(outcome == 1,
+ xlabels_2_second_top3[[2]],
+ if_else(visit == "First",
+ xlabels_2_second_top3[[1]],
+ xlabels_2_second_top3[[3]]
+ )
+ )
+ ) %>%
+ filter(value < 1000 & 0 < value) %>%
+ group_by(donor_id) %>%
+ mutate(changed = 1 < length(unique(outcome)))
+data2_second_top3
+# 6 outliers
+
+# 90th percentile values
+plt2_second_top3 <- data2_second_top3 %>%
+ ggplot(aes(label, value)) +
+ geom_violin(aes(color = outcome, fill = outcome), alpha = 0.4, show.legend = F) +
+ facet_wrap(~ Feature, scales="free") +
+ geom_boxplot(width = 0.1, show.legend = F) +
+ geom_point(aes(fill = outcome, size = changed), color = "black", shape = 23, show.legend = F) +
+ stat_summary(
+ fun.data = f, geom = "crossbar",
+ colour = NA, fill = "black", width = 0.3, alpha = 1.0
+ ) +
+ theme_pubclean() +
+ labs(x = "", y = "PBMC 90th percentile value")
+plt2_second_top3
+
+######################################## dataset 1
+data1_second <- mike_second_visit %>%
+ ungroup() %>%
+ filter(donor_id %in% data1$donor_id) %>%
+ mutate(
+ Feature = name_formatted,
+ value = data,
+ visit = rep("Second", nrow(.)),
+ outcome = factor(outcome)
+ ) %>%
+ select(donor_id, outcome, Feature, value, visit) %>%
+ bind_rows(data1 %>%
+ mutate(visit = rep("First", nrow(.))) %>%
+ select(donor_id, outcome, Feature, value, visit) %>%
+ filter(donor_id %in% intersection(mike_second_visit$donor_id, .$donor_id))
+ )
+data1_second
+
+data1_second_top3 <- data1_second %>%
+ filter(Feature %in% top3_features1) %>%
+ mutate(outcome_dummy = factor(outcome, labels = c("L", "H")),
+ visit_dummy = visit) %>%
+ group_by(donor_id) %>%
+ filter(1 < length(unique(visit))) %>%
+ arrange(donor_id) %>%
+ ungroup() %>%
+ group_by(visit, outcome)
+data1_second_top3
+xlabels_1_second_top3 <- data1_second_top3 %>%
+ group_map(
+ ~ {
+ paste(.$visit_dummy[1], .$outcome_dummy[1], "\nn=(", length(unique(.$donor_id)), ")")
+ }
+ )
+xlabels_1_second_top3
+data1_second_top3 <- data1_second_top3 %>%
+ ungroup() %>%
+ mutate(
+ label = if_else(outcome == 1,
+ xlabels_1_second_top3[[2]],
+ if_else(visit == "First",
+ xlabels_1_second_top3[[1]],
+ xlabels_1_second_top3[[3]]
+ )
+ )
+ ) %>%
+ filter(value < 1000 & 0 < value) %>%
+ group_by(donor_id) %>%
+ mutate(changed = 1 < length(unique(outcome)))
+data1_second_top3
+
+# 90th percentile values
+plt1_second_top3 <- data1_second_top3 %>%
+ ggplot(aes(label, value)) +
+ geom_violin(aes(color = outcome, fill = outcome), alpha = 0.4, show.legend = F) +
+ facet_wrap(~ Feature, scales="free") +
+ geom_boxplot(width = 0.1, show.legend = F) +
+ geom_point(aes(fill = outcome, size = changed), color = "black", shape = 23, show.legend = F) +
+ stat_summary(
+ fun.data = f, geom = "crossbar",
+ colour = NA, fill = "black", width = 0.3, alpha = 1.0
+ ) +
+ theme_pubclean() +
+ labs(x = "", y = "PBMC 90th percentile value")
+plt1_second_top3
+
+
+### Arranging the plot
+
+
+data1_top <- ggarrange(
+ importance_plt1,
+ plt1_top3,
+ widths = c(1,2),
+ labels = c("A", "B")
+)
+data1_bot <- ggarrange(
+ plt1_second_top3,
+ labels = "C"
+ )
+whole_data1 <- ggarrange(
+ data1_top,
+ data1_bot,
+ nrow = 2
+ )
+whole_data1
+ggsave("../images/dataset1_nb_feature_exploration.png", whole_data1, width = 2 * 15, height = 19, dpi=300, units = "cm")
+
+
+data2_top <- ggarrange(
+ importance_plt2,
+ plt2_top3,
+ widths = c(1,2),
+ labels = c("A", "B")
+ )
+data2_bot <- ggarrange(
+ plt2_second_top3,
+ labels = c("C")
+ )
+whole_data2 <- ggarrange(
+ data2_top,
+ data2_bot,
+ nrow = 2
+ )
+whole_data2
+ggsave("../images/dataset2_nb_feature_exploration.png", whole_data2, width = 2 * 15, height = 19, dpi=300, units = "cm")
+
+
+#################### correlations data 1
+
+data1_wide <- bind_rows(sets_partitions[[14]][['test']], sets_partitions[[14]][['train']]) %>%
+ select(-donor_id, -outcome)
+data1_wide
+
+M <- cor(data1_wide)
+# pmat <- cor.mtest(data1_wide)
+pmat <- psych::corr.test(M, method="pearson", adjust="BH")$p
+
+png(file = "../images/cor_dataset1.png",
+ height = 1.5 * 19,
+ width = 2 * 15,
+ res = 300,
+ units = "cm"
+ )
+corrplot(M, order="hclust", p.mat=pmat, sig.level=0.0001, insig="blank")
+dev.off()
+
+#################### correlations data 2
+
+data2_wide <- bind_rows(sets_partitions[[16]][['test']], sets_partitions[[16]][['train']]) %>%
+ select(-donor_id, -outcome)
+data2_wide
+
+M <- cor(data2_wide)
+# pmat <- cor.mtest(data2_wide)
+pmat <- psych::corr.test(M, method="pearson", adjust="none")$p
+
+png(file = "../images/cor_dataset2.png",
+ height = 1.5 * 19,
+ width = 2 * 15,
+ res = 300,
+ units = "cm"
+ )
+corrplot(M, order="hclust", p.mat=pmat, sig.level=0.05, insig="blank")
+dev.off()
+
+#################### difference heatmap data 1
+
+after <- mike_second_visit %>%
+ ungroup() %>%
+ filter(donor_id %in% data1$donor_id) %>%
+ mutate(
+ Feature = name_formatted,
+ value = data,
+ visit = rep("Second", nrow(.)),
+ outcome = factor(outcome)
+ ) %>%
+ select(donor_id, outcome, Feature, value) %>%
+ filter(Feature %in% names(data1_wide)) %>%
+ arrange(donor_id, Feature)
+before <- bind_rows(sets_partitions[[14]][['test']], sets_partitions[[14]][['train']]) %>%
+ pivot_longer(
+ cols = c(3:last_col()),
+ names_to = "Feature"
+ ) %>%
+ select(donor_id, Feature, value) %>%
+ filter(donor_id %in% after$donor_id) %>%
+ arrange(donor_id, Feature)
+diff1 <- after %>%
+ mutate(before_value = before$value) %>%
+ filter(0 < value) %>%
+ mutate(top3 = if_else(Feature %in% top3_features1, 1, if_else(Feature %in% top3_features2, 2, 0))) %>%
+ mutate(value = log2(value / before_value)) %>%
+ mutate(top3 =factor(top3, labels=c("Not in top3", "Top3 dataset 14", "Top3 dataset 16"))) %>%
+ ggplot(aes(reorder(factor(Feature), value, mean), value, fill=top3)) +
+ geom_violin() +
+ theme_pubclean() +
+ theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
+ labs(x = "PBMC features", y = "log2 change", fill = "")
+diff1
+ggsave("../images/second_visit_change1.png", diff1, width = 2 * 15, height = 19, dpi=300, units = "cm")
+
+#################### difference heatmap data 2
+
+after <- mike_second_visit %>%
+ ungroup() %>%
+ filter(donor_id %in% data2$donor_id) %>%
+ mutate(
+ Feature = name_formatted,
+ value = data,
+ visit = rep("Second", nrow(.)),
+ outcome = factor(outcome)
+ ) %>%
+ select(donor_id, outcome, Feature, value) %>%
+ filter(Feature %in% names(data2_wide)) %>%
+ arrange(donor_id, Feature)
+before <- bind_rows(sets_partitions[[16]][['test']], sets_partitions[[16]][['train']]) %>%
+ pivot_longer(
+ cols = c(3:last_col()),
+ names_to = "Feature"
+ ) %>%
+ select(donor_id, Feature, value) %>%
+ filter(donor_id %in% after$donor_id) %>%
+ arrange(donor_id, Feature)
+diff2 <- after %>%
+ mutate(before_value = before$value) %>%
+ filter(0 < value) %>%
+ mutate(top3 = Feature %in% top3_features1) %>%
+ mutate(value = log2(value / before_value)) %>%
+ ggplot(aes(reorder(factor(Feature), value, mean), value, fill=top3)) +
+ geom_violin(show.legend=F) +
+ theme_pubclean() +
+ theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
+ labs(x = "PBMC features", y = "Unsigned log2 change")
+diff2
+ggsave("../images/second_visit_change2.png", diff2, width = 2 * 15, height = 19, dpi=300, units = "cm")