summary refs log tree commit diff
path: root/scripts/data_prep.R
diff options
context:
space:
mode:
author Mike Vink <mike1994vink@gmail.com> 2021-05-02 17:33:26 +0200
committer Mike Vink <mike1994vink@gmail.com> 2021-05-02 17:33:26 +0200
commit bf1adece8aeb48e136085233d2f5ff2f9600eaf5 (patch)
tree 6a46b0c7e7fbea6a85c0e44714e0076251e82cac /scripts/data_prep.R
parent de4565fe9290ec1f1031eed6f7d067794df53166 (diff)
update
Diffstat (limited to 'scripts/data_prep.R')
-rw-r--r-- scripts/data_prep.R 128
1 files changed, 115 insertions, 13 deletions
diff --git a/scripts/data_prep.R b/scripts/data_prep.R
index acd9977..30c4d85 100644
--- a/scripts/data_prep.R
+++ b/scripts/data_prep.R
@@ -1,31 +1,133 @@
library(tidyverse)
library(ggpubr)
library(knitr)
+library(mulset)
+library(caret)
data_simon <- read_csv("../csv/simon_data_extra.csv", na = "NULL")
data_mike <- read_csv("../csv/mike_repeat_visit.csv", na = "NULL")
-simon_wide <- data_simon %>%
+simon_agg <- data_simon %>%
+ group_by(donor_id, data_name) %>%
+ summarise(
+ data = mean(data)
+ )
+
+simon_outcome <- data_simon %>%
+ select(donor_id, outcome) %>%
group_by(donor_id) %>%
summarise(
- dup = duplicated(data_name)
- ) %>%
- filter(dup)
-simon_wide
+ outcome = unique(outcome)
+ ) %>%
+ ungroup() %>%
+ arrange(donor_id)
+simon_outcome
+
+
+simon_wide <- simon_agg %>%
+ pivot_wider(
+ id_cols = donor_id,
+ names_from = data_name,
+ values_from = data,
+ ) %>%
+ ungroup() %>%
+ arrange(donor_id) %>%
+ mutate(outcome = as.numeric(simon_outcome$outcome)) %>%
+ select(donor_id, outcome, everything())
+
+simon_nas <- sum(is.na(simon_wide))
+simon_cells <- (dim(simon_wide)[1] * dim(simon_wide)[2])
+simon_sparseness <- sum(is.na(simon_wide)) / (dim(simon_wide)[1] * dim(simon_wide)[2])
dview <- data_simon %>%
group_by(donor_id) %>%
mutate(dup = duplicated(data_name)) %>%
filter(dup) %>%
arrange(data_name) %>%
- filter(donor_id == 285 & data_name == "CD4_pos_T_cells") %>%
- kable(format = "latex", booktabs = TRUE)
+ filter(donor_id == 285) # & data_name == "CD4_pos_T_cells")
+
+mike_agg <- data_mike %>%
+ group_by(donor_id, name_formatted) %>%
+ summarise(
+ data = mean(data)
+ )
-%>%
- rowid_to_column() %>%
+mike_wide <- mike_agg %>%
pivot_wider(
- id_cols = donor_id,
- names_from = data_name,
- values_from = data,
+ id_cols = donor_id,
+ names_from = name_formatted,
+ values_from = data,
)
-simon_wide
+
+mike_nas <- sum(is.na(mike_wide))
+mike_cells <- (dim(mike_wide)[1] * dim(mike_wide)[2])
+mike_sparseness <- sum(is.na(mike_wide)) / (dim(mike_wide)[1] * dim(mike_wide)[2])
+
+simon_mulset <- mulset(simon_wide)
+
+# set1 <- simon_mulset$`1`
+# set1$features_hash
+# set1$feature_count
+# set1$features
+# set1$samples
+# set1$samples_count
+# set1$datapoints
+
+sets <- list()
+count <- 1
+for (set in simon_mulset) {
+ if (set$feature_count >= 5 & set$samples_count >= 15) {
+ sets[[count]] <- simon_wide[set$features] %>%
+ drop_na() %>%
+ select(donor_id, outcome, everything())
+ print(paste("rows:", nrow(sets[[count]]), ", samples by mulset:", set$samples_count))
+ count <- count + 1
+ }
+}
+
+
+sets_partitions <- list()
+set.seed(13121994)
+count <- 1
+for (set in sets) {
+ set$outcome <- as.factor(set$outcome)
+ partitions <- createDataPartition(set$outcome, p = 0.75)
+ training_rows <- partitions$Resample1
+ train <- set[training_rows, ]
+ test <- set[-training_rows, ]
+ if (nrow(test) >= 10) {
+ sets_partitions[[count]] <- list()
+ sets_partitions[[count]][["donors"]] <- set[['donor_id']]
+ sets_partitions[[count]][["train"]] <- train
+ sets_partitions[[count]][["test"]] <- test
+ sets_partitions[[count]][["totalOutcomes"]] <- table(set$outcome)
+ sets_partitions[[count]][["trainingOutcomes"]] <- table(train$outcome)
+ sets_partitions[[count]][["trainingRows"]] <- nrow(train)
+ sets_partitions[[count]][["testOutcomes"]] <- table(test$outcome)
+ sets_partitions[[count]][["testRows"]] <- nrow(test)
+ count <- count + 1
+ }
+}
+
+
+tbl <- tibble(dataset = c(), `Rows x Cols` = c(), `total (low / high)` = c(), `train (low / high)` = c(), `test (low / high)` = c())
+count = 1
+for (set in sets_partitions) {
+ rows <- nrow(set[["train"]]) + nrow(set[["test"]])
+ cols <- ncol(set[["train"]])
+ rowsxcols <- paste(rows, "x", cols)
+ tot <- paste(set[["totalOutcomes"]][1], "/", set[["totalOutcomes"]][2], "(", round((set[["totalOutcomes"]][1]) / rows, 2) , ")")
+ train <- paste(set[["trainingOutcomes"]][1], "/", set[["trainingOutcomes"]][2])
+ test <- paste(set[["testOutcomes"]][1], "/", set[["testOutcomes"]][2])
+ tbl <- tbl %>% add_row(
+ dataset = count,
+ `Rows x Cols` = rowsxcols,
+ `total (low / high)` = tot,
+ `train (low / high)` = train,
+ `test (low / high)` = test
+ )
+ count = count + 1
+}
+tbl %>%
+kable(format = "latex", booktabs = TRUE)
+