diff options
| author | Mike Vink <mike1994vink@gmail.com> | 2021-05-02 17:33:26 +0200 |
|---|---|---|
| committer | Mike Vink <mike1994vink@gmail.com> | 2021-05-02 17:33:26 +0200 |
| commit | bf1adece8aeb48e136085233d2f5ff2f9600eaf5 (patch) | |
| tree | 6a46b0c7e7fbea6a85c0e44714e0076251e82cac /scripts | |
| parent | de4565fe9290ec1f1031eed6f7d067794df53166 (diff) | |
update
Diffstat (limited to 'scripts')
| -rw-r--r-- | scripts/data_prep.R | 128 | ||||
| -rw-r--r-- | scripts/modelling.R | 39 | ||||
| -rw-r--r-- | scripts/modelling_eval.R | 175 | ||||
| -rw-r--r-- | scripts/modelling_exploration.R | 443 | ||||
| -rw-r--r-- | scripts/modelling_results.RData | bin | 0 -> 19268467 bytes | |||
| -rw-r--r-- | scripts/modelling_results_withrrlda.RData | bin | 0 -> 1876197 bytes | |||
| -rw-r--r-- | scripts/sam.R | 49 |
7 files changed, 821 insertions, 13 deletions
diff --git a/scripts/data_prep.R b/scripts/data_prep.R index acd9977..30c4d85 100644 --- a/scripts/data_prep.R +++ b/scripts/data_prep.R @@ -1,31 +1,133 @@ library(tidyverse) library(ggpubr) library(knitr) +library(mulset) +library(caret) data_simon <- read_csv("../csv/simon_data_extra.csv", na = "NULL") data_mike <- read_csv("../csv/mike_repeat_visit.csv", na = "NULL") -simon_wide <- data_simon %>% +simon_agg <- data_simon %>% + group_by(donor_id, data_name) %>% + summarise( + data = mean(data) + ) + +simon_outcome <- data_simon %>% + select(donor_id, outcome) %>% group_by(donor_id) %>% summarise( - dup = duplicated(data_name) - ) %>% - filter(dup) -simon_wide + outcome = unique(outcome) + ) %>% + ungroup() %>% + arrange(donor_id) +simon_outcome + + +simon_wide <- simon_agg %>% + pivot_wider( + id_cols = donor_id, + names_from = data_name, + values_from = data, + ) %>% + ungroup() %>% + arrange(donor_id) %>% + mutate(outcome = as.numeric(simon_outcome$outcome)) %>% + select(donor_id, outcome, everything()) + +simon_nas <- sum(is.na(simon_wide)) +simon_cells <- (dim(simon_wide)[1] * dim(simon_wide)[2]) +simon_sparseness <- sum(is.na(simon_wide)) / (dim(simon_wide)[1] * dim(simon_wide)[2]) dview <- data_simon %>% group_by(donor_id) %>% mutate(dup = duplicated(data_name)) %>% filter(dup) %>% arrange(data_name) %>% - filter(donor_id == 285 & data_name == "CD4_pos_T_cells") %>% - kable(format = "latex", booktabs = TRUE) + filter(donor_id == 285) # & data_name == "CD4_pos_T_cells") + +mike_agg <- data_mike %>% + group_by(donor_id, name_formatted) %>% + summarise( + data = mean(data) + ) -%>% - rowid_to_column() %>% +mike_wide <- mike_agg %>% pivot_wider( - id_cols = donor_id, - names_from = data_name, - values_from = data, + id_cols = donor_id, + names_from = name_formatted, + values_from = data, ) -simon_wide + +mike_nas <- sum(is.na(mike_wide)) +mike_cells <- (dim(mike_wide)[1] * dim(mike_wide)[2]) +mike_sparseness <- sum(is.na(mike_wide)) / (dim(mike_wide)[1] * dim(mike_wide)[2]) + +simon_mulset <- mulset(simon_wide) + +# set1 <- simon_mulset$`1` +# set1$features_hash +# set1$feature_count +# set1$features +# set1$samples +# set1$samples_count +# set1$datapoints + +sets <- list() +count <- 1 +for (set in simon_mulset) { + if (set$feature_count >= 5 & set$samples_count >= 15) { + sets[[count]] <- simon_wide[set$features] %>% + drop_na() %>% + select(donor_id, outcome, everything()) + print(paste("rows:", nrow(sets[[count]]), ", samples by mulset:", set$samples_count)) + count <- count + 1 + } +} + + +sets_partitions <- list() +set.seed(13121994) +count <- 1 +for (set in sets) { + set$outcome <- as.factor(set$outcome) + partitions <- createDataPartition(set$outcome, p = 0.75) + training_rows <- partitions$Resample1 + train <- set[training_rows, ] + test <- set[-training_rows, ] + if (nrow(test) >= 10) { + sets_partitions[[count]] <- list() + sets_partitions[[count]][["donors"]] <- set[['donor_id']] + sets_partitions[[count]][["train"]] <- train + sets_partitions[[count]][["test"]] <- test + sets_partitions[[count]][["totalOutcomes"]] <- table(set$outcome) + sets_partitions[[count]][["trainingOutcomes"]] <- table(train$outcome) + sets_partitions[[count]][["trainingRows"]] <- nrow(train) + sets_partitions[[count]][["testOutcomes"]] <- table(test$outcome) + sets_partitions[[count]][["testRows"]] <- nrow(test) + count <- count + 1 + } +} + + +tbl <- tibble(dataset = c(), `Rows x Cols` = c(), `total (low / high)` = c(), `train (low / high)` = c(), `test (low / high)` = c()) +count = 1 +for (set in sets_partitions) { + rows <- nrow(set[["train"]]) + nrow(set[["test"]]) + cols <- ncol(set[["train"]]) + rowsxcols <- paste(rows, "x", cols) + tot <- paste(set[["totalOutcomes"]][1], "/", set[["totalOutcomes"]][2], "(", round((set[["totalOutcomes"]][1]) / rows, 2) , ")") + train <- paste(set[["trainingOutcomes"]][1], "/", set[["trainingOutcomes"]][2]) + test <- paste(set[["testOutcomes"]][1], "/", set[["testOutcomes"]][2]) + tbl <- tbl %>% add_row( + dataset = count, + `Rows x Cols` = rowsxcols, + `total (low / high)` = tot, + `train (low / high)` = train, + `test (low / high)` = test + ) + count = count + 1 +} +tbl %>% +kable(format = "latex", booktabs = TRUE) + diff --git a/scripts/modelling.R b/scripts/modelling.R new file mode 100644 index 0000000..b1fc746 --- /dev/null +++ b/scripts/modelling.R @@ -0,0 +1,39 @@ +library(caret) +library(tidyverse) +library(MLeval) + +source("./data_prep.R") + +data_list <- sets_partitions +results <- list() +models <- c("rrlda", "naive_bayes", "rf", "regLogistic") +fitControl <- trainControl( ## 10-fold CV + method = "repeatedcv", + number = 10, + classProbs = TRUE, + savePredictions = TRUE, + repeats = 2 +) +for (model in models) { + dataset = 1 + # for (data in data_list) { + for (data in data_list[c(14, 16, 19)]) { + print(paste("Training", model, "on dataset", dataset)) + train <- data[["train"]] + X_train <- as.data.frame(train[-c(1, 2)]) + Y_train <- train[c(2)][[1]] + levels(Y_train) <- c("Low", "High") + set.seed(13121994) + model_trained <- train( + X_train, + y = Y_train, + method = model, + trControl = fitControl + ) + results[[model]][[dataset]] <- model_trained + dataset = dataset + 1 + } +} +save(results, file="./modelling_results_withrrlda.RData") +# save(results, file="./modelling_results.RData") + diff --git a/scripts/modelling_eval.R b/scripts/modelling_eval.R new file mode 100644 index 0000000..1c99db7 --- /dev/null +++ b/scripts/modelling_eval.R @@ -0,0 +1,175 @@ +library(tidyverse) +library(caret) +library(MLeval) +library(pROC) + + +source("./data_prep.R") +# load("./modelling_results.RData") +load("modelling_results_withrrlda.RData") + +second_visit_donors <- mike_wide[["donor_id"]] +second_visit_donors + +second_visits <- c() +results <- results +data_list <- sets_partitions +donor_list <- list() +feature_list <- list() +count = 1 +for (set in data_list) { + donors <- c(set[["train"]][["donor_id"]], set[['test']][['donor_id']]) + features <- names(set[["train"]]) + donor_list[[count]] <- donors + feature_list[[count]] <- features + second_visits <- c(second_visits, length(intersect(donors, second_visit_donors))) + count = count + 1 +} +second_visits + +donor_list <- donor_list[c(14, 16, 19)] +feature_list <- feature_list[c(14, 16, 19)] + +donor_intersect <- intersection(donor_list[[1]], donor_list[[2]], donor_list[[3]]) +n_donor_intersect <- length(donor_intersect) + +feature_intersect1 <- intersection(feature_list[[1]], feature_list[[2]]) +n_feature_intersect1 <- length(feature_intersect1) + +# which(second_visits > 15) +# which.max(second_visits) + +# dataset14_rrlda_eval <- evalm(results[["rrlda"]][[14]]) +# dataset14_nb_eval <- evalm(results[["naive_bayes"]][[14]]) +# dataset14_rf_eval <- evalm(results[["rf"]][[14]]) +# dataset14_reglog_eval <- evalm(results[["regLogistic"]][[14]]) +# +# dataset16_rrlda_eval <- evalm(results[["rrlda"]][[16]]) +# dataset16_nb_eval <- evalm(results[["naive_bayes"]][[16]]) +# dataset16_rf_eval <- evalm(results[["rf"]][[16]]) +# dataset16_reglog_eval <- evalm(results[["regLogistic"]][[16]]) +# +# dataset19_rrlda_eval <- evalm(results[["rrlda"]][[19]]) +# dataset19_nb_eval <- evalm(results[["naive_bayes"]][[19]]) +# dataset19_rf_eval <- evalm(results[["rf"]][[19]]) +# dataset19_reglog_eval <- evalm(results[["regLogistic"]][[19]]) + +dataset1_rrlda_eval <- evalm(results[["rrlda"]][[1]]) +dataset1_nb_eval <- evalm(results[["naive_bayes"]][[1]]) +dataset1_rf_eval <- evalm(results[["rf"]][[1]]) +dataset1_reglog_eval <- evalm(results[["regLogistic"]][[1]]) + +dataset2_rrlda_eval <- evalm(results[["rrlda"]][[2]]) +dataset2_nb_eval <- evalm(results[["naive_bayes"]][[2]]) +dataset2_rf_eval <- evalm(results[["rf"]][[2]]) +dataset2_reglog_eval <- evalm(results[["regLogistic"]][[2]]) + +dataset3_rrlda_eval <- evalm(results[["rrlda"]][[3]]) +dataset3_nb_eval <- evalm(results[["naive_bayes"]][[3]]) +dataset3_rf_eval <- evalm(results[["rf"]][[3]]) +dataset3_reglog_eval <- evalm(results[["regLogistic"]][[3]]) + + +get_test_auc <- function(dataset, model) { + used_datasets <- c(14,16,19) + test_data <- data_list[[used_datasets[dataset]]][["test"]] + print(test_data) + model_res <- results[[model]][[dataset]] + preds <- predict(model_res, newdata = test_data[-c(1, 2)]) + preds <- as.numeric(preds) + auc_score <- auc(roc( + response = test_data[["outcome"]], + predictor = preds, + ret = c("roc") + )) + round(as.numeric(auc_score), 2) +} + + + +d1_rrlda_met <- as_tibble(dataset1_rrlda_eval$stdres$`Group 1`, rownames = "Metric") %>% + mutate(model = "rrlda") +d1_nb_met <- as_tibble(dataset1_nb_eval$stdres$`Group 1`, rownames = "Metric") %>% + mutate(model = "nb") +d1_rf_met <- as_tibble(dataset1_rf_eval$stdres$`Group 1`, rownames = "Metric") %>% + mutate(model = "rf") +d1_reglog_met <- as_tibble(dataset1_reglog_eval$stdres$`Group 1`, rownames = "Metric") %>% + mutate(model = "reglog") +d1_met <- bind_rows(list(d1_rrlda_met, d1_nb_met, d1_rf_met, d1_reglog_met)) %>% + mutate( + `Score (CI)` = paste(Score, "(", CI, ")") + ) %>% + pivot_wider( + id_cols = model, + values_from = Score, + names_from = Metric + ) %>% + mutate( + `test_AUC` = c( + get_test_auc(1, "rrlda"), + get_test_auc(1, "naive_bayes"), + get_test_auc(1, "rf"), + get_test_auc(1, "regLogistic") + ) + ) %>% + select(-`AUC-PRG`, -`AUC-PR`, -Informedness) %>% + kable(format = "latex", booktabs = TRUE) +d1_met + +d2_rrlda_met <- as_tibble(dataset2_rrlda_eval$stdres$`Group 1`, rownames = "Metric") %>% + mutate(model = "rrlda") +d2_nb_met <- as_tibble(dataset2_nb_eval$stdres$`Group 1`, rownames = "Metric") %>% + mutate(model = "nb") +d2_rf_met <- as_tibble(dataset2_rf_eval$stdres$`Group 1`, rownames = "Metric") %>% + mutate(model = "rf") +d2_reglog_met <- as_tibble(dataset2_reglog_eval$stdres$`Group 1`, rownames = "Metric") %>% + mutate(model = "reglog") +d2_met <- bind_rows(list(d2_rrlda_met, d2_nb_met, d2_rf_met, d2_reglog_met)) %>% + mutate( + `Score (CI)` = paste(Score, "(", CI, ")") + ) %>% + pivot_wider( + id_cols = model, + values_from = Score, + names_from = Metric + ) %>% + mutate( + `test_AUC` = c( + get_test_auc(2, "rrlda"), + get_test_auc(2, "naive_bayes"), + get_test_auc(2, "rf"), + get_test_auc(2, "regLogistic") + ) + ) %>% + select(-`AUC-PRG`, -`AUC-PR`, -Informedness) %>% + kable(format = "latex", booktabs = TRUE) +d2_met + +d3_rrlda_met <- as_tibble(dataset3_rrlda_eval$stdres$`Group 1`, rownames = "Metric") %>% + mutate(model = "rrlda") +d3_nb_met <- as_tibble(dataset3_nb_eval$stdres$`Group 1`, rownames = "Metric") %>% + mutate(model = "nb") +d3_rf_met <- as_tibble(dataset3_rf_eval$stdres$`Group 1`, rownames = "Metric") %>% + mutate(model = "rf") +d3_reglog_met <- as_tibble(dataset3_reglog_eval$stdres$`Group 1`, rownames = "Metric") %>% + mutate(model = "reglog") +d3_met <- bind_rows(list(d3_rrlda_met, d3_nb_met, d3_rf_met, d3_reglog_met)) %>% + mutate( + `Score (CI)` = paste(Score, "(", CI, ")") + ) %>% + pivot_wider( + id_cols = model, + values_from = Score, + names_from = Metric + ) %>% + mutate( + `test_AUC` = c( + get_test_auc(3, "rrlda"), + get_test_auc(3, "naive_bayes"), + get_test_auc(3, "rf"), + get_test_auc(3, "regLogistic") + ) + ) %>% + select(-`AUC-PRG`, -`AUC-PR`, -Informedness) %>% + kable(format = "latex", booktabs = TRUE) +d3_met diff --git a/scripts/modelling_exploration.R b/scripts/modelling_exploration.R new file mode 100644 index 0000000..762497f --- /dev/null +++ b/scripts/modelling_exploration.R @@ -0,0 +1,443 @@ +library(tidyverse) +library(caret) +library(ggpubr) +library(corrplot) +library(psych) + +load("modelling_results_withrrlda.RData") +source("data_prep.R") +source("data_prep_desc.R") + +results <- results +important_features <- list() +for (model in results) { + for (dataset in c(1,2,3)) { + model_name <- model[[dataset]]$method + # important_features[[model_name]] <- list() + important_features[[model_name]][[dataset]] <- varImp(model[[dataset]]) + } +} + +important_features$naive_bayes[[1]] +important_features$naive_bayes[[2]] + +nb_importance1 <- important_features$naive_bayes[[1]]$importance %>% + as_tibble(rownames = 'Feature') %>% + mutate(Score = High) %>% + select(Feature, Score) %>% + filter(Score >= 50) %>% + arrange(Score) %>% + mutate(Feature = factor(Feature, Feature)) +nb_importance1 + +nb_importance2 <- important_features$naive_bayes[[2]]$importance %>% + as_tibble(rownames = 'Feature') %>% + mutate(Score = High) %>% + select(Feature, Score) %>% + filter(Score >= 50) %>% + arrange(Score) %>% + mutate(Feature = factor(Feature, Feature)) +nb_importance2 + +importance_plt1 <- nb_importance1 %>% + ggplot(aes(Score, Feature)) + + geom_bar(stat="identity", fill = "green4") + + theme_pubclean() +importance_plt1 + +importance_plt2 <- nb_importance2 %>% + ggplot(aes(Score, Feature)) + + geom_bar(stat="identity", fill = "green4") + + theme_pubclean() +importance_plt2 + + +top3_features1 <- nb_importance1 %>% + arrange(desc(Score)) %>% + slice(1:3,) %>% + select(Feature) + +top3_features1 <- as.character(top3_features1[['Feature']]) + +top3_features2 <- nb_importance2 %>% + arrange(desc(Score)) %>% + slice(1:3,) %>% + select(Feature) + +top3_features2 <- as.character(top3_features2[['Feature']]) + +data1 <- bind_rows(sets_partitions[[14]][['test']], sets_partitions[[14]][['train']]) %>% + select(donor_id, outcome, top3_features1) %>% + mutate(outcome_dummy = factor(outcome, labels = c("L", "H"))) %>% + pivot_longer( + cols = top3_features1, + names_to = "Feature" + ) %>% + group_by(outcome, Feature) %>% + summarise( + Feature = Feature, + outcome_dummy = outcome_dummy, + donor_id = donor_id, + value = value + ) +xlabels_1 <- data1 %>% + group_map( + ~ { + rep(paste(.$outcome_dummy[1], "\nn=(", length(unique(.$donor_id)), ")"), nrow(.)) + } + ) +data1 <- data1 %>% + ungroup() %>% + mutate(label = unlist(xlabels_1)) + + + +f <- function(x, height = 1) { + ans <- median(x) + data.frame(ymin = ans - height / 2, ymax = ans + height / 2, y = ans) +} + +# 90th percentile values +plt1_top3 <- data1 %>% + ggplot(aes(label, value, fill=outcome)) + + geom_violin(aes(color = outcome), alpha = 0.4, show.legend = F) + + facet_wrap(~ Feature, scales="free") + + geom_boxplot(width = 0.1, show.legend = F) + + geom_point(aes(fill = outcome), color = "black", shape = 23, show.legend = F) + + stat_summary( + fun.data = f, geom = "crossbar", + colour = NA, fill = "black", width = 0.3, alpha = 1.0 + ) + + theme_pubclean() + + labs(x = "", y = "PBMC 90th percentile value") + +data2 <- bind_rows(sets_partitions[[16]][['test']], sets_partitions[[16]][['train']]) %>% + select(donor_id, outcome, top3_features2) %>% + mutate(outcome_dummy = factor(outcome, labels = c("L", "H"))) %>% + pivot_longer( + cols = top3_features2, + names_to = "Feature" + ) %>% + group_by(outcome, Feature) %>% + summarise( + Feature = Feature, + outcome_dummy = outcome_dummy, + donor_id = donor_id, + value = value + ) +xlabels_2 <- data2 %>% + group_map( + ~ { + rep(paste(.$outcome_dummy[1], "\nn=(", length(unique(.$donor_id)), ")"), nrow(.)) + } + ) +data2 <- data2 %>% + ungroup() %>% + mutate(label = unlist(xlabels_2)) + + + +# 90th percentile values +plt2_top3 <- data2 %>% + ggplot(aes(label, value, fill=outcome)) + + geom_violin(aes(color = outcome), alpha = 0.4, show.legend = F) + + facet_wrap(~ Feature, scales="free") + + geom_boxplot(width = 0.1, show.legend = F) + + geom_point(aes(fill = outcome), color = "black", shape = 23, show.legend = F) + + stat_summary( + fun.data = f, geom = "crossbar", + colour = NA, fill = "black", width = 0.3, alpha = 1.0 + ) + + theme_pubclean() + + labs(x = "", y = "PBMC 90th percentile value") +plt2_top3 + + +############# Second visit data here ################## + +## DATASET 2 + +data2_second <- mike_second_visit %>% + ungroup() %>% + filter(donor_id %in% data2$donor_id) %>% + mutate( + Feature = name_formatted, + value = data, + visit = rep("Second", nrow(.)), + outcome = factor(outcome) + ) %>% + select(donor_id, outcome, Feature, value, visit) %>% + bind_rows(data2 %>% + mutate(visit = rep("First", nrow(.))) %>% + select(donor_id, outcome, Feature, value, visit) %>% + filter(donor_id %in% intersection(mike_second_visit$donor_id, .$donor_id)) + ) +data2_second + +data2_second_top3 <- data2_second %>% + filter(Feature %in% top3_features2) %>% + mutate(outcome_dummy = factor(outcome, labels = c("L", "H")), + visit_dummy = visit) %>% + group_by(donor_id) %>% + arrange(donor_id) %>% + filter(1 < length(unique(visit))) %>% + ungroup() %>% + group_by(visit, outcome) +data2_second_top3 + +xlabels_2_second_top3 <- data2_second_top3 %>% + group_map( + ~ { + paste(.$visit_dummy[1], .$outcome_dummy[1], "\nn=(", length(unique(.$donor_id)), ")") + } + ) +xlabels_2_second_top3 + +data2_second_top3 <- data2_second_top3 %>% + ungroup() %>% + mutate( + label = if_else(outcome == 1, + xlabels_2_second_top3[[2]], + if_else(visit == "First", + xlabels_2_second_top3[[1]], + xlabels_2_second_top3[[3]] + ) + ) + ) %>% + filter(value < 1000 & 0 < value) %>% + group_by(donor_id) %>% + mutate(changed = 1 < length(unique(outcome))) +data2_second_top3 +# 6 outliers + +# 90th percentile values +plt2_second_top3 <- data2_second_top3 %>% + ggplot(aes(label, value)) + + geom_violin(aes(color = outcome, fill = outcome), alpha = 0.4, show.legend = F) + + facet_wrap(~ Feature, scales="free") + + geom_boxplot(width = 0.1, show.legend = F) + + geom_point(aes(fill = outcome, size = changed), color = "black", shape = 23, show.legend = F) + + stat_summary( + fun.data = f, geom = "crossbar", + colour = NA, fill = "black", width = 0.3, alpha = 1.0 + ) + + theme_pubclean() + + labs(x = "", y = "PBMC 90th percentile value") +plt2_second_top3 + +######################################## dataset 1 +data1_second <- mike_second_visit %>% + ungroup() %>% + filter(donor_id %in% data1$donor_id) %>% + mutate( + Feature = name_formatted, + value = data, + visit = rep("Second", nrow(.)), + outcome = factor(outcome) + ) %>% + select(donor_id, outcome, Feature, value, visit) %>% + bind_rows(data1 %>% + mutate(visit = rep("First", nrow(.))) %>% + select(donor_id, outcome, Feature, value, visit) %>% + filter(donor_id %in% intersection(mike_second_visit$donor_id, .$donor_id)) + ) +data1_second + +data1_second_top3 <- data1_second %>% + filter(Feature %in% top3_features1) %>% + mutate(outcome_dummy = factor(outcome, labels = c("L", "H")), + visit_dummy = visit) %>% + group_by(donor_id) %>% + filter(1 < length(unique(visit))) %>% + arrange(donor_id) %>% + ungroup() %>% + group_by(visit, outcome) +data1_second_top3 +xlabels_1_second_top3 <- data1_second_top3 %>% + group_map( + ~ { + paste(.$visit_dummy[1], .$outcome_dummy[1], "\nn=(", length(unique(.$donor_id)), ")") + } + ) +xlabels_1_second_top3 +data1_second_top3 <- data1_second_top3 %>% + ungroup() %>% + mutate( + label = if_else(outcome == 1, + xlabels_1_second_top3[[2]], + if_else(visit == "First", + xlabels_1_second_top3[[1]], + xlabels_1_second_top3[[3]] + ) + ) + ) %>% + filter(value < 1000 & 0 < value) %>% + group_by(donor_id) %>% + mutate(changed = 1 < length(unique(outcome))) +data1_second_top3 + +# 90th percentile values +plt1_second_top3 <- data1_second_top3 %>% + ggplot(aes(label, value)) + + geom_violin(aes(color = outcome, fill = outcome), alpha = 0.4, show.legend = F) + + facet_wrap(~ Feature, scales="free") + + geom_boxplot(width = 0.1, show.legend = F) + + geom_point(aes(fill = outcome, size = changed), color = "black", shape = 23, show.legend = F) + + stat_summary( + fun.data = f, geom = "crossbar", + colour = NA, fill = "black", width = 0.3, alpha = 1.0 + ) + + theme_pubclean() + + labs(x = "", y = "PBMC 90th percentile value") +plt1_second_top3 + + +### Arranging the plot + + +data1_top <- ggarrange( + importance_plt1, + plt1_top3, + widths = c(1,2), + labels = c("A", "B") +) +data1_bot <- ggarrange( + plt1_second_top3, + labels = "C" + ) +whole_data1 <- ggarrange( + data1_top, + data1_bot, + nrow = 2 + ) +whole_data1 +ggsave("../images/dataset1_nb_feature_exploration.png", whole_data1, width = 2 * 15, height = 19, dpi=300, units = "cm") + + +data2_top <- ggarrange( + importance_plt2, + plt2_top3, + widths = c(1,2), + labels = c("A", "B") + ) +data2_bot <- ggarrange( + plt2_second_top3, + labels = c("C") + ) +whole_data2 <- ggarrange( + data2_top, + data2_bot, + nrow = 2 + ) +whole_data2 +ggsave("../images/dataset2_nb_feature_exploration.png", whole_data2, width = 2 * 15, height = 19, dpi=300, units = "cm") + + +#################### correlations data 1 + +data1_wide <- bind_rows(sets_partitions[[14]][['test']], sets_partitions[[14]][['train']]) %>% + select(-donor_id, -outcome) +data1_wide + +M <- cor(data1_wide) +# pmat <- cor.mtest(data1_wide) +pmat <- psych::corr.test(M, method="pearson", adjust="BH")$p + +png(file = "../images/cor_dataset1.png", + height = 1.5 * 19, + width = 2 * 15, + res = 300, + units = "cm" + ) +corrplot(M, order="hclust", p.mat=pmat, sig.level=0.0001, insig="blank") +dev.off() + +#################### correlations data 2 + +data2_wide <- bind_rows(sets_partitions[[16]][['test']], sets_partitions[[16]][['train']]) %>% + select(-donor_id, -outcome) +data2_wide + +M <- cor(data2_wide) +# pmat <- cor.mtest(data2_wide) +pmat <- psych::corr.test(M, method="pearson", adjust="none")$p + +png(file = "../images/cor_dataset2.png", + height = 1.5 * 19, + width = 2 * 15, + res = 300, + units = "cm" + ) +corrplot(M, order="hclust", p.mat=pmat, sig.level=0.05, insig="blank") +dev.off() + +#################### difference heatmap data 1 + +after <- mike_second_visit %>% + ungroup() %>% + filter(donor_id %in% data1$donor_id) %>% + mutate( + Feature = name_formatted, + value = data, + visit = rep("Second", nrow(.)), + outcome = factor(outcome) + ) %>% + select(donor_id, outcome, Feature, value) %>% + filter(Feature %in% names(data1_wide)) %>% + arrange(donor_id, Feature) +before <- bind_rows(sets_partitions[[14]][['test']], sets_partitions[[14]][['train']]) %>% + pivot_longer( + cols = c(3:last_col()), + names_to = "Feature" + ) %>% + select(donor_id, Feature, value) %>% + filter(donor_id %in% after$donor_id) %>% + arrange(donor_id, Feature) +diff1 <- after %>% + mutate(before_value = before$value) %>% + filter(0 < value) %>% + mutate(top3 = if_else(Feature %in% top3_features1, 1, if_else(Feature %in% top3_features2, 2, 0))) %>% + mutate(value = log2(value / before_value)) %>% + mutate(top3 =factor(top3, labels=c("Not in top3", "Top3 dataset 14", "Top3 dataset 16"))) %>% + ggplot(aes(reorder(factor(Feature), value, mean), value, fill=top3)) + + geom_violin() + + theme_pubclean() + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + + labs(x = "PBMC features", y = "log2 change", fill = "") +diff1 +ggsave("../images/second_visit_change1.png", diff1, width = 2 * 15, height = 19, dpi=300, units = "cm") + +#################### difference heatmap data 2 + +after <- mike_second_visit %>% + ungroup() %>% + filter(donor_id %in% data2$donor_id) %>% + mutate( + Feature = name_formatted, + value = data, + visit = rep("Second", nrow(.)), + outcome = factor(outcome) + ) %>% + select(donor_id, outcome, Feature, value) %>% + filter(Feature %in% names(data2_wide)) %>% + arrange(donor_id, Feature) +before <- bind_rows(sets_partitions[[16]][['test']], sets_partitions[[16]][['train']]) %>% + pivot_longer( + cols = c(3:last_col()), + names_to = "Feature" + ) %>% + select(donor_id, Feature, value) %>% + filter(donor_id %in% after$donor_id) %>% + arrange(donor_id, Feature) +diff2 <- after %>% + mutate(before_value = before$value) %>% + filter(0 < value) %>% + mutate(top3 = Feature %in% top3_features1) %>% + mutate(value = log2(value / before_value)) %>% + ggplot(aes(reorder(factor(Feature), value, mean), value, fill=top3)) + + geom_violin(show.legend=F) + + theme_pubclean() + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + + labs(x = "PBMC features", y = "Unsigned log2 change") +diff2 +ggsave("../images/second_visit_change2.png", diff2, width = 2 * 15, height = 19, dpi=300, units = "cm") diff --git a/scripts/modelling_results.RData b/scripts/modelling_results.RData Binary files differnew file mode 100644 index 0000000..0b666c5 --- /dev/null +++ b/scripts/modelling_results.RData diff --git a/scripts/modelling_results_withrrlda.RData b/scripts/modelling_results_withrrlda.RData Binary files differnew file mode 100644 index 0000000..bd5e6e3 --- /dev/null +++ b/scripts/modelling_results_withrrlda.RData diff --git a/scripts/sam.R b/scripts/sam.R new file mode 100644 index 0000000..d94794d --- /dev/null +++ b/scripts/sam.R @@ -0,0 +1,49 @@ +library(samr) +library(tidyverse) + +# assumes modelling exploration is loaded in the R session +data1_sam <- data1 %>% + select(donor_id, outcome, Feature, value) %>% + mutate(outcome = factor(outcome, labels = c(1, 2))) %>% + pivot_wider( + names_from = Feature, + values_from = value + ) %>% + select(-donor_id) + +data1_sam_y <- data1_sam[['outcome']] +data1_sam_x <- t(as.data.frame(data1_sam[-1])) + +samobj1 <- samr::SAM( + data1_sam_x, + data1_sam_y, + resp.type="Two class unpaired", + fdr.output = 0.5, + nperms = 1000, + genenames = rownames(data1_sam_x) +) +samobj1 + + +############# DATASET 2 +data2_sam <- data2 %>% + select(donor_id, outcome, Feature, value) %>% + mutate(outcome = factor(outcome, labels = c(1, 2))) %>% + pivot_wider( + names_from = Feature, + values_from = value + ) %>% + select(-donor_id) + +data2_sam_y <- data2_sam[['outcome']] +data2_sam_x <- t(as.data.frame(data2_sam[-1])) + +samobj2 <- samr::SAM( + data2_sam_x, + data2_sam_y, + resp.type="Two class unpaired", + fdr.output = 0.01, + nperms = 1000, + genenames = rownames(data2_sam_x) +) +samobj2 |
