From bf1adece8aeb48e136085233d2f5ff2f9600eaf5 Mon Sep 17 00:00:00 2001
From: Mike Vink
Date: Sun, 2 May 2021 17:33:26 +0200
Subject: update

---
 scripts/data_prep.R | 128 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 115 insertions(+), 13 deletions(-)

(limited to 'scripts/data_prep.R')

diff --git a/scripts/data_prep.R b/scripts/data_prep.R
index acd9977..30c4d85 100644
--- a/scripts/data_prep.R
+++ b/scripts/data_prep.R
@@ -1,31 +1,133 @@
 library(tidyverse)
 library(ggpubr)
 library(knitr)
+library(mulset)
+library(caret)
 
 data_simon <- read_csv("../csv/simon_data_extra.csv", na = "NULL")
 data_mike <- read_csv("../csv/mike_repeat_visit.csv", na = "NULL")
 
-simon_wide <- data_simon %>%
+simon_agg <- data_simon %>%
+  group_by(donor_id, data_name) %>%
+  summarise(
+    data = mean(data)
+  )
+
+simon_outcome <- data_simon %>%
+  select(donor_id, outcome) %>%
   group_by(donor_id) %>%
   summarise(
-    dup = duplicated(data_name)
-  ) %>%
-  filter(dup)
-simon_wide
+    outcome = unique(outcome)
+  ) %>%
+  ungroup() %>%
+  arrange(donor_id)
+simon_outcome
+
+
+simon_wide <- simon_agg %>%
+  pivot_wider(
+    id_cols = donor_id,
+    names_from = data_name,
+    values_from = data,
+  ) %>%
+  ungroup() %>%
+  arrange(donor_id) %>%
+  mutate(outcome = as.numeric(simon_outcome$outcome)) %>%
+  select(donor_id, outcome, everything())
+
+simon_nas <- sum(is.na(simon_wide))
+simon_cells <- (dim(simon_wide)[1] * dim(simon_wide)[2])
+simon_sparseness <- sum(is.na(simon_wide)) / (dim(simon_wide)[1] * dim(simon_wide)[2])
 
 dview <- data_simon %>%
   group_by(donor_id) %>%
   mutate(dup = duplicated(data_name)) %>%
   filter(dup) %>%
   arrange(data_name) %>%
-  filter(donor_id == 285 & data_name == "CD4_pos_T_cells") %>%
-  kable(format = "latex", booktabs = TRUE)
+  filter(donor_id == 285) # & data_name == "CD4_pos_T_cells")
+
+mike_agg <- data_mike %>%
+  group_by(donor_id, name_formatted) %>%
+  summarise(
+    data = mean(data)
+  )
 
-%>%
-  rowid_to_column() %>%
+mike_wide <- mike_agg %>%
   pivot_wider(
-    id_cols = donor_id,
-    names_from = data_name,
-    values_from = data,
+    id_cols = donor_id,
+    names_from = name_formatted,
+    values_from = data,
   )
-simon_wide
+
+mike_nas <- sum(is.na(mike_wide))
+mike_cells <- (dim(mike_wide)[1] * dim(mike_wide)[2])
+mike_sparseness <- sum(is.na(mike_wide)) / (dim(mike_wide)[1] * dim(mike_wide)[2])
+
+simon_mulset <- mulset(simon_wide)
+
+# set1 <- simon_mulset$`1`
+# set1$features_hash
+# set1$feature_count
+# set1$features
+# set1$samples
+# set1$samples_count
+# set1$datapoints
+
+sets <- list()
+count <- 1
+for (set in simon_mulset) {
+  if (set$feature_count >= 5 & set$samples_count >= 15) {
+    sets[[count]] <- simon_wide[set$features] %>%
+      drop_na() %>%
+      select(donor_id, outcome, everything())
+    print(paste("rows:", nrow(sets[[count]]), ", samples by mulset:", set$samples_count))
+    count <- count + 1
+  }
+}
+
+
+sets_partitions <- list()
+set.seed(13121994)
+count <- 1
+for (set in sets) {
+  set$outcome <- as.factor(set$outcome)
+  partitions <- createDataPartition(set$outcome, p = 0.75)
+  training_rows <- partitions$Resample1
+  train <- set[training_rows, ]
+  test <- set[-training_rows, ]
+  if (nrow(test) >= 10) {
+    sets_partitions[[count]] <- list()
+    sets_partitions[[count]][["donors"]] <- set[['donor_id']]
+    sets_partitions[[count]][["train"]] <- train
+    sets_partitions[[count]][["test"]] <- test
+    sets_partitions[[count]][["totalOutcomes"]] <- table(set$outcome)
+    sets_partitions[[count]][["trainingOutcomes"]] <- table(train$outcome)
+    sets_partitions[[count]][["trainingRows"]] <- nrow(train)
+    sets_partitions[[count]][["testOutcomes"]] <- table(test$outcome)
+    sets_partitions[[count]][["testRows"]] <- nrow(test)
+    count <- count + 1
+  }
+}
+
+
+tbl <- tibble(dataset = c(), `Rows x Cols` = c(), `total (low / high)` = c(), `train (low / high)` = c(), `test (low / high)` = c())
+count = 1
+for (set in sets_partitions) {
+  rows <- nrow(set[["train"]]) + nrow(set[["test"]])
+  cols <- ncol(set[["train"]])
+  rowsxcols <- paste(rows, "x", cols)
+  tot <- paste(set[["totalOutcomes"]][1], "/", set[["totalOutcomes"]][2], "(", round((set[["totalOutcomes"]][1]) / rows, 2) , ")")
+  train <- paste(set[["trainingOutcomes"]][1], "/", set[["trainingOutcomes"]][2])
+  test <- paste(set[["testOutcomes"]][1], "/", set[["testOutcomes"]][2])
+  tbl <- tbl %>% add_row(
+    dataset = count,
+    `Rows x Cols` = rowsxcols,
+    `total (low / high)` = tot,
+    `train (low / high)` = train,
+    `test (low / high)` = test
+  )
+  count = count + 1
+}
+tbl %>%
+kable(format = "latex", booktabs = TRUE)
+
-- 
cgit v1.2.3