From bf1adece8aeb48e136085233d2f5ff2f9600eaf5 Mon Sep 17 00:00:00 2001
From: Mike Vink <mike1994vink@gmail.com>
Date: Sun, 2 May 2021 17:33:26 +0200
Subject: update

---
 scripts/data_prep.R | 128 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 115 insertions(+), 13 deletions(-)

(limited to 'scripts/data_prep.R')

diff --git a/scripts/data_prep.R b/scripts/data_prep.R
index acd9977..30c4d85 100644
--- a/scripts/data_prep.R
+++ b/scripts/data_prep.R
@@ -1,31 +1,133 @@
 library(tidyverse)
 library(ggpubr)
 library(knitr)
+library(mulset)
+library(caret)
 
 data_simon <- read_csv("../csv/simon_data_extra.csv", na = "NULL")
 data_mike <- read_csv("../csv/mike_repeat_visit.csv", na = "NULL")
 
-simon_wide <- data_simon %>%
+simon_agg <- data_simon %>%
+    group_by(donor_id, data_name) %>%
+    summarise(
+        data = mean(data)
+    )
+
+simon_outcome <- data_simon %>%
+    select(donor_id, outcome) %>%
     group_by(donor_id) %>%
     summarise(
-              dup = duplicated(data_name)
-              ) %>%
-    filter(dup)
-simon_wide
+        outcome = unique(outcome)
+    ) %>%
+    ungroup() %>%
+    arrange(donor_id)
+simon_outcome
+
+
+simon_wide <- simon_agg %>%
+    pivot_wider(
+        id_cols = donor_id,
+        names_from = data_name,
+        values_from = data,
+    ) %>%
+    ungroup() %>%
+    arrange(donor_id) %>%
+    mutate(outcome = as.numeric(simon_outcome$outcome)) %>%
+    select(donor_id, outcome, everything())
+
+simon_nas <- sum(is.na(simon_wide))
+simon_cells <- (dim(simon_wide)[1] * dim(simon_wide)[2])
+simon_sparseness <- sum(is.na(simon_wide)) / (dim(simon_wide)[1] * dim(simon_wide)[2])
 
 dview <- data_simon %>%
     group_by(donor_id) %>%
     mutate(dup = duplicated(data_name)) %>%
     filter(dup) %>%
     arrange(data_name) %>%
-    filter(donor_id == 285 & data_name == "CD4_pos_T_cells") %>%
-    kable(format = "latex", booktabs = TRUE)
+    filter(donor_id == 285) # & data_name == "CD4_pos_T_cells")
+
+mike_agg <- data_mike %>%
+    group_by(donor_id, name_formatted) %>%
+    summarise(
+        data = mean(data)
+    )
 
-%>%
-    rowid_to_column() %>%
+mike_wide <- mike_agg %>%
     pivot_wider(
-                id_cols = donor_id,
-                names_from = data_name,
-                values_from = data,
+        id_cols = donor_id,
+        names_from = name_formatted,
+        values_from = data,
     )
-simon_wide
+
+mike_nas <- sum(is.na(mike_wide))
+mike_cells <- (dim(mike_wide)[1] * dim(mike_wide)[2])
+mike_sparseness <- sum(is.na(mike_wide)) / (dim(mike_wide)[1] * dim(mike_wide)[2])
+
+simon_mulset <- mulset(simon_wide)
+
+# set1 <- simon_mulset$`1`
+# set1$features_hash
+# set1$feature_count
+# set1$features
+# set1$samples
+# set1$samples_count
+# set1$datapoints
+
+sets <- list()
+count <- 1
+for (set in simon_mulset) {
+    if (set$feature_count >= 5 & set$samples_count >= 15) {
+        sets[[count]] <- simon_wide[set$features] %>%
+            drop_na() %>%
+            select(donor_id, outcome, everything())
+        print(paste("rows:", nrow(sets[[count]]), ", samples by mulset:", set$samples_count))
+        count <- count + 1
+    }
+}
+
+
+sets_partitions <- list()
+set.seed(13121994)
+count <- 1
+for (set in sets) {
+    set$outcome <- as.factor(set$outcome)
+    partitions <- createDataPartition(set$outcome, p = 0.75)
+    training_rows <- partitions$Resample1
+    train <- set[training_rows, ]
+    test <- set[-training_rows, ]
+    if (nrow(test) >= 10) {
+        sets_partitions[[count]] <- list()
+        sets_partitions[[count]][["donors"]] <- set[['donor_id']]
+        sets_partitions[[count]][["train"]] <- train
+        sets_partitions[[count]][["test"]] <- test
+        sets_partitions[[count]][["totalOutcomes"]] <- table(set$outcome)
+        sets_partitions[[count]][["trainingOutcomes"]] <- table(train$outcome)
+        sets_partitions[[count]][["trainingRows"]] <- nrow(train)
+        sets_partitions[[count]][["testOutcomes"]] <- table(test$outcome)
+        sets_partitions[[count]][["testRows"]] <- nrow(test)
+        count <- count + 1
+    }
+}
+
+
+tbl <- tibble(dataset = c(), `Rows x Cols` = c(), `total (low / high)` = c(), `train (low / high)` = c(), `test (low / high)` = c())
+count = 1
+for (set in sets_partitions) {
+    rows <- nrow(set[["train"]]) + nrow(set[["test"]])
+    cols <- ncol(set[["train"]])
+    rowsxcols <- paste(rows, "x", cols)
+    tot <- paste(set[["totalOutcomes"]][1], "/", set[["totalOutcomes"]][2], "(", round((set[["totalOutcomes"]][1]) / rows, 2) , ")")
+    train <- paste(set[["trainingOutcomes"]][1], "/", set[["trainingOutcomes"]][2])
+    test <- paste(set[["testOutcomes"]][1], "/", set[["testOutcomes"]][2])
+    tbl <- tbl %>% add_row(
+        dataset = count,
+        `Rows x Cols` = rowsxcols,
+        `total (low / high)` = tot,
+        `train (low / high)` = train,
+        `test (low / high)` = test
+    )
+    count = count + 1
+}
+tbl %>%
+kable(format = "latex", booktabs = TRUE)
+
-- 
cgit v1.2.3