diff options
| author | Mike Vink <mike1994vink@gmail.com> | 2021-05-02 17:33:26 +0200 |
|---|---|---|
| committer | Mike Vink <mike1994vink@gmail.com> | 2021-05-02 17:33:26 +0200 |
| commit | bf1adece8aeb48e136085233d2f5ff2f9600eaf5 (patch) | |
| tree | 6a46b0c7e7fbea6a85c0e44714e0076251e82cac /scripts/modelling_exploration.R | |
| parent | de4565fe9290ec1f1031eed6f7d067794df53166 (diff) | |
update
Diffstat (limited to 'scripts/modelling_exploration.R')
| -rw-r--r-- | scripts/modelling_exploration.R | 443 |
1 files changed, 443 insertions, 0 deletions
diff --git a/scripts/modelling_exploration.R b/scripts/modelling_exploration.R new file mode 100644 index 0000000..762497f --- /dev/null +++ b/scripts/modelling_exploration.R @@ -0,0 +1,443 @@ +library(tidyverse) +library(caret) +library(ggpubr) +library(corrplot) +library(psych) + +load("modelling_results_withrrlda.RData") +source("data_prep.R") +source("data_prep_desc.R") + +results <- results +important_features <- list() +for (model in results) { + for (dataset in c(1,2,3)) { + model_name <- model[[dataset]]$method + # important_features[[model_name]] <- list() + important_features[[model_name]][[dataset]] <- varImp(model[[dataset]]) + } +} + +important_features$naive_bayes[[1]] +important_features$naive_bayes[[2]] + +nb_importance1 <- important_features$naive_bayes[[1]]$importance %>% + as_tibble(rownames = 'Feature') %>% + mutate(Score = High) %>% + select(Feature, Score) %>% + filter(Score >= 50) %>% + arrange(Score) %>% + mutate(Feature = factor(Feature, Feature)) +nb_importance1 + +nb_importance2 <- important_features$naive_bayes[[2]]$importance %>% + as_tibble(rownames = 'Feature') %>% + mutate(Score = High) %>% + select(Feature, Score) %>% + filter(Score >= 50) %>% + arrange(Score) %>% + mutate(Feature = factor(Feature, Feature)) +nb_importance2 + +importance_plt1 <- nb_importance1 %>% + ggplot(aes(Score, Feature)) + + geom_bar(stat="identity", fill = "green4") + + theme_pubclean() +importance_plt1 + +importance_plt2 <- nb_importance2 %>% + ggplot(aes(Score, Feature)) + + geom_bar(stat="identity", fill = "green4") + + theme_pubclean() +importance_plt2 + + +top3_features1 <- nb_importance1 %>% + arrange(desc(Score)) %>% + slice(1:3,) %>% + select(Feature) + +top3_features1 <- as.character(top3_features1[['Feature']]) + +top3_features2 <- nb_importance2 %>% + arrange(desc(Score)) %>% + slice(1:3,) %>% + select(Feature) + +top3_features2 <- as.character(top3_features2[['Feature']]) + +data1 <- bind_rows(sets_partitions[[14]][['test']], sets_partitions[[14]][['train']]) %>% + select(donor_id, outcome, top3_features1) %>% + mutate(outcome_dummy = factor(outcome, labels = c("L", "H"))) %>% + pivot_longer( + cols = top3_features1, + names_to = "Feature" + ) %>% + group_by(outcome, Feature) %>% + summarise( + Feature = Feature, + outcome_dummy = outcome_dummy, + donor_id = donor_id, + value = value + ) +xlabels_1 <- data1 %>% + group_map( + ~ { + rep(paste(.$outcome_dummy[1], "\nn=(", length(unique(.$donor_id)), ")"), nrow(.)) + } + ) +data1 <- data1 %>% + ungroup() %>% + mutate(label = unlist(xlabels_1)) + + + +f <- function(x, height = 1) { + ans <- median(x) + data.frame(ymin = ans - height / 2, ymax = ans + height / 2, y = ans) +} + +# 90th percentile values +plt1_top3 <- data1 %>% + ggplot(aes(label, value, fill=outcome)) + + geom_violin(aes(color = outcome), alpha = 0.4, show.legend = F) + + facet_wrap(~ Feature, scales="free") + + geom_boxplot(width = 0.1, show.legend = F) + + geom_point(aes(fill = outcome), color = "black", shape = 23, show.legend = F) + + stat_summary( + fun.data = f, geom = "crossbar", + colour = NA, fill = "black", width = 0.3, alpha = 1.0 + ) + + theme_pubclean() + + labs(x = "", y = "PBMC 90th percentile value") + +data2 <- bind_rows(sets_partitions[[16]][['test']], sets_partitions[[16]][['train']]) %>% + select(donor_id, outcome, top3_features2) %>% + mutate(outcome_dummy = factor(outcome, labels = c("L", "H"))) %>% + pivot_longer( + cols = top3_features2, + names_to = "Feature" + ) %>% + group_by(outcome, Feature) %>% + summarise( + Feature = Feature, + outcome_dummy = outcome_dummy, + donor_id = donor_id, + value = value + ) +xlabels_2 <- data2 %>% + group_map( + ~ { + rep(paste(.$outcome_dummy[1], "\nn=(", length(unique(.$donor_id)), ")"), nrow(.)) + } + ) +data2 <- data2 %>% + ungroup() %>% + mutate(label = unlist(xlabels_2)) + + + +# 90th percentile values +plt2_top3 <- data2 %>% + ggplot(aes(label, value, fill=outcome)) + + geom_violin(aes(color = outcome), alpha = 0.4, show.legend = F) + + facet_wrap(~ Feature, scales="free") + + geom_boxplot(width = 0.1, show.legend = F) + + geom_point(aes(fill = outcome), color = "black", shape = 23, show.legend = F) + + stat_summary( + fun.data = f, geom = "crossbar", + colour = NA, fill = "black", width = 0.3, alpha = 1.0 + ) + + theme_pubclean() + + labs(x = "", y = "PBMC 90th percentile value") +plt2_top3 + + +############# Second visit data here ################## + +## DATASET 2 + +data2_second <- mike_second_visit %>% + ungroup() %>% + filter(donor_id %in% data2$donor_id) %>% + mutate( + Feature = name_formatted, + value = data, + visit = rep("Second", nrow(.)), + outcome = factor(outcome) + ) %>% + select(donor_id, outcome, Feature, value, visit) %>% + bind_rows(data2 %>% + mutate(visit = rep("First", nrow(.))) %>% + select(donor_id, outcome, Feature, value, visit) %>% + filter(donor_id %in% intersection(mike_second_visit$donor_id, .$donor_id)) + ) +data2_second + +data2_second_top3 <- data2_second %>% + filter(Feature %in% top3_features2) %>% + mutate(outcome_dummy = factor(outcome, labels = c("L", "H")), + visit_dummy = visit) %>% + group_by(donor_id) %>% + arrange(donor_id) %>% + filter(1 < length(unique(visit))) %>% + ungroup() %>% + group_by(visit, outcome) +data2_second_top3 + +xlabels_2_second_top3 <- data2_second_top3 %>% + group_map( + ~ { + paste(.$visit_dummy[1], .$outcome_dummy[1], "\nn=(", length(unique(.$donor_id)), ")") + } + ) +xlabels_2_second_top3 + +data2_second_top3 <- data2_second_top3 %>% + ungroup() %>% + mutate( + label = if_else(outcome == 1, + xlabels_2_second_top3[[2]], + if_else(visit == "First", + xlabels_2_second_top3[[1]], + xlabels_2_second_top3[[3]] + ) + ) + ) %>% + filter(value < 1000 & 0 < value) %>% + group_by(donor_id) %>% + mutate(changed = 1 < length(unique(outcome))) +data2_second_top3 +# 6 outliers + +# 90th percentile values +plt2_second_top3 <- data2_second_top3 %>% + ggplot(aes(label, value)) + + geom_violin(aes(color = outcome, fill = outcome), alpha = 0.4, show.legend = F) + + facet_wrap(~ Feature, scales="free") + + geom_boxplot(width = 0.1, show.legend = F) + + geom_point(aes(fill = outcome, size = changed), color = "black", shape = 23, show.legend = F) + + stat_summary( + fun.data = f, geom = "crossbar", + colour = NA, fill = "black", width = 0.3, alpha = 1.0 + ) + + theme_pubclean() + + labs(x = "", y = "PBMC 90th percentile value") +plt2_second_top3 + +######################################## dataset 1 +data1_second <- mike_second_visit %>% + ungroup() %>% + filter(donor_id %in% data1$donor_id) %>% + mutate( + Feature = name_formatted, + value = data, + visit = rep("Second", nrow(.)), + outcome = factor(outcome) + ) %>% + select(donor_id, outcome, Feature, value, visit) %>% + bind_rows(data1 %>% + mutate(visit = rep("First", nrow(.))) %>% + select(donor_id, outcome, Feature, value, visit) %>% + filter(donor_id %in% intersection(mike_second_visit$donor_id, .$donor_id)) + ) +data1_second + +data1_second_top3 <- data1_second %>% + filter(Feature %in% top3_features1) %>% + mutate(outcome_dummy = factor(outcome, labels = c("L", "H")), + visit_dummy = visit) %>% + group_by(donor_id) %>% + filter(1 < length(unique(visit))) %>% + arrange(donor_id) %>% + ungroup() %>% + group_by(visit, outcome) +data1_second_top3 +xlabels_1_second_top3 <- data1_second_top3 %>% + group_map( + ~ { + paste(.$visit_dummy[1], .$outcome_dummy[1], "\nn=(", length(unique(.$donor_id)), ")") + } + ) +xlabels_1_second_top3 +data1_second_top3 <- data1_second_top3 %>% + ungroup() %>% + mutate( + label = if_else(outcome == 1, + xlabels_1_second_top3[[2]], + if_else(visit == "First", + xlabels_1_second_top3[[1]], + xlabels_1_second_top3[[3]] + ) + ) + ) %>% + filter(value < 1000 & 0 < value) %>% + group_by(donor_id) %>% + mutate(changed = 1 < length(unique(outcome))) +data1_second_top3 + +# 90th percentile values +plt1_second_top3 <- data1_second_top3 %>% + ggplot(aes(label, value)) + + geom_violin(aes(color = outcome, fill = outcome), alpha = 0.4, show.legend = F) + + facet_wrap(~ Feature, scales="free") + + geom_boxplot(width = 0.1, show.legend = F) + + geom_point(aes(fill = outcome, size = changed), color = "black", shape = 23, show.legend = F) + + stat_summary( + fun.data = f, geom = "crossbar", + colour = NA, fill = "black", width = 0.3, alpha = 1.0 + ) + + theme_pubclean() + + labs(x = "", y = "PBMC 90th percentile value") +plt1_second_top3 + + +### Arranging the plot + + +data1_top <- ggarrange( + importance_plt1, + plt1_top3, + widths = c(1,2), + labels = c("A", "B") +) +data1_bot <- ggarrange( + plt1_second_top3, + labels = "C" + ) +whole_data1 <- ggarrange( + data1_top, + data1_bot, + nrow = 2 + ) +whole_data1 +ggsave("../images/dataset1_nb_feature_exploration.png", whole_data1, width = 2 * 15, height = 19, dpi=300, units = "cm") + + +data2_top <- ggarrange( + importance_plt2, + plt2_top3, + widths = c(1,2), + labels = c("A", "B") + ) +data2_bot <- ggarrange( + plt2_second_top3, + labels = c("C") + ) +whole_data2 <- ggarrange( + data2_top, + data2_bot, + nrow = 2 + ) +whole_data2 +ggsave("../images/dataset2_nb_feature_exploration.png", whole_data2, width = 2 * 15, height = 19, dpi=300, units = "cm") + + +#################### correlations data 1 + +data1_wide <- bind_rows(sets_partitions[[14]][['test']], sets_partitions[[14]][['train']]) %>% + select(-donor_id, -outcome) +data1_wide + +M <- cor(data1_wide) +# pmat <- cor.mtest(data1_wide) +pmat <- psych::corr.test(M, method="pearson", adjust="BH")$p + +png(file = "../images/cor_dataset1.png", + height = 1.5 * 19, + width = 2 * 15, + res = 300, + units = "cm" + ) +corrplot(M, order="hclust", p.mat=pmat, sig.level=0.0001, insig="blank") +dev.off() + +#################### correlations data 2 + +data2_wide <- bind_rows(sets_partitions[[16]][['test']], sets_partitions[[16]][['train']]) %>% + select(-donor_id, -outcome) +data2_wide + +M <- cor(data2_wide) +# pmat <- cor.mtest(data2_wide) +pmat <- psych::corr.test(M, method="pearson", adjust="none")$p + +png(file = "../images/cor_dataset2.png", + height = 1.5 * 19, + width = 2 * 15, + res = 300, + units = "cm" + ) +corrplot(M, order="hclust", p.mat=pmat, sig.level=0.05, insig="blank") +dev.off() + +#################### difference heatmap data 1 + +after <- mike_second_visit %>% + ungroup() %>% + filter(donor_id %in% data1$donor_id) %>% + mutate( + Feature = name_formatted, + value = data, + visit = rep("Second", nrow(.)), + outcome = factor(outcome) + ) %>% + select(donor_id, outcome, Feature, value) %>% + filter(Feature %in% names(data1_wide)) %>% + arrange(donor_id, Feature) +before <- bind_rows(sets_partitions[[14]][['test']], sets_partitions[[14]][['train']]) %>% + pivot_longer( + cols = c(3:last_col()), + names_to = "Feature" + ) %>% + select(donor_id, Feature, value) %>% + filter(donor_id %in% after$donor_id) %>% + arrange(donor_id, Feature) +diff1 <- after %>% + mutate(before_value = before$value) %>% + filter(0 < value) %>% + mutate(top3 = if_else(Feature %in% top3_features1, 1, if_else(Feature %in% top3_features2, 2, 0))) %>% + mutate(value = log2(value / before_value)) %>% + mutate(top3 =factor(top3, labels=c("Not in top3", "Top3 dataset 14", "Top3 dataset 16"))) %>% + ggplot(aes(reorder(factor(Feature), value, mean), value, fill=top3)) + + geom_violin() + + theme_pubclean() + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + + labs(x = "PBMC features", y = "log2 change", fill = "") +diff1 +ggsave("../images/second_visit_change1.png", diff1, width = 2 * 15, height = 19, dpi=300, units = "cm") + +#################### difference heatmap data 2 + +after <- mike_second_visit %>% + ungroup() %>% + filter(donor_id %in% data2$donor_id) %>% + mutate( + Feature = name_formatted, + value = data, + visit = rep("Second", nrow(.)), + outcome = factor(outcome) + ) %>% + select(donor_id, outcome, Feature, value) %>% + filter(Feature %in% names(data2_wide)) %>% + arrange(donor_id, Feature) +before <- bind_rows(sets_partitions[[16]][['test']], sets_partitions[[16]][['train']]) %>% + pivot_longer( + cols = c(3:last_col()), + names_to = "Feature" + ) %>% + select(donor_id, Feature, value) %>% + filter(donor_id %in% after$donor_id) %>% + arrange(donor_id, Feature) +diff2 <- after %>% + mutate(before_value = before$value) %>% + filter(0 < value) %>% + mutate(top3 = Feature %in% top3_features1) %>% + mutate(value = log2(value / before_value)) %>% + ggplot(aes(reorder(factor(Feature), value, mean), value, fill=top3)) + + geom_violin(show.legend=F) + + theme_pubclean() + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + + labs(x = "PBMC features", y = "Unsigned log2 change") +diff2 +ggsave("../images/second_visit_change2.png", diff2, width = 2 * 15, height = 19, dpi=300, units = "cm") |
