summaryrefslogtreecommitdiff
path: root/scripts/data_prep.R
blob: 30c4d852b3721084426defa7f934b0dda465ed9f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
library(tidyverse)
library(ggpubr)
library(knitr)
library(mulset)
library(caret)

# Load the two raw measurement exports. Cells holding the literal string
# "NULL" are read as NA.
data_simon <- read_csv(file = "../csv/simon_data_extra.csv", na = "NULL")
data_mike <- read_csv(file = "../csv/mike_repeat_visit.csv", na = "NULL")

# Collapse repeated measurements to one mean value per (donor_id, data_name)
# pair. `na.rm` is not set, so a single NA replicate makes that mean NA.
simon_agg <- summarise(
    group_by(data_simon, donor_id, data_name),
    data = mean(data)
)

# One row per distinct (donor_id, outcome) pair, sorted by donor_id.
#
# distinct() is used instead of group_by() + summarise(unique(outcome)):
# summarise() is only meant to return a single row per group, so a donor with
# more than one recorded outcome would trigger a deprecation warning (or an
# error, depending on the dplyr version). distinct() yields the same rows
# without that constraint and returns an ungrouped tibble directly.
simon_outcome <- data_simon %>%
    distinct(donor_id, outcome) %>%
    arrange(donor_id)
# Printed for interactive inspection.
simon_outcome


# Reshape the per-donor feature means to one row per donor and one column per
# data_name, then attach each donor's outcome as a numeric column placed right
# after donor_id.
#
# The outcome is looked up by donor_id rather than copied by row position, so
# the result does not rely on simon_outcome and the pivoted table having the
# same row order. A donor with several distinct outcomes is rejected up front
# with an explicit message.
if (anyDuplicated(simon_outcome$donor_id) > 0) {
    stop(
        "simon_outcome has more than one outcome for at least one donor_id",
        call. = FALSE
    )
}

simon_wide <- simon_agg %>%
    pivot_wider(
        id_cols = donor_id,
        names_from = data_name,
        values_from = data
    ) %>%
    ungroup() %>%
    arrange(donor_id) %>%
    mutate(
        outcome = as.numeric(
            simon_outcome$outcome[match(donor_id, simon_outcome$donor_id)]
        )
    ) %>%
    select(donor_id, outcome, everything())

# Sparsity of the wide table: NA cells as a fraction of all cells. The cell
# count covers every column of simon_wide, including donor_id and outcome.
simon_nas <- sum(is.na(simon_wide))
simon_cells <- nrow(simon_wide) * ncol(simon_wide)
simon_sparseness <- simon_nas / simon_cells

# Debug view: the repeated data_name rows (second and later occurrences) of
# donor 285, ordered by data_name.
# Optional extra filter: data_name == "CD4_pos_T_cells"
dview <- data_simon %>%
    filter(donor_id == 285) %>%
    group_by(donor_id) %>%
    mutate(dup = duplicated(data_name)) %>%
    filter(dup) %>%
    arrange(data_name)

# Collapse repeated measurements to one mean value per
# (donor_id, name_formatted) pair. `na.rm` is not set, so a single NA
# replicate makes that mean NA.
mike_agg <- summarise(
    group_by(data_mike, donor_id, name_formatted),
    data = mean(data)
)

# Reshape to one row per donor_id with one column per name_formatted value.
mike_wide <- pivot_wider(
    mike_agg,
    id_cols = donor_id,
    names_from = name_formatted,
    values_from = data
)

# Sparsity of the wide table: NA cells as a fraction of all cells. The cell
# count covers every column of mike_wide, including donor_id.
mike_nas <- sum(is.na(mike_wide))
mike_cells <- nrow(mike_wide) * ncol(mike_wide)
mike_sparseness <- mike_nas / mike_cells

# Run mulset() on the full wide table (donor_id and outcome columns included).
# The loop further down iterates over the result and reads `feature_count`,
# `samples_count` and `features` from each element.
# NOTE(review): presumably each element describes a subset of columns together
# with the rows that are complete for them -- confirm against the mulset docs.
simon_mulset <- mulset(simon_wide)

# Interactive inspection of one element (kept commented out):
# set1 <- simon_mulset$`1`
# set1$features_hash
# set1$feature_count
# set1$features
# set1$samples
# set1$samples_count
# set1$datapoints

# Materialise every mulset candidate that has at least 5 features and at
# least 15 samples: take the candidate's columns from simon_wide, drop rows
# with any NA, and put donor_id and outcome first. The select() step requires
# donor_id and outcome to be among the candidate's features.
sets <- list()
count <- 1
for (set in simon_mulset) {
    # `&&` is the scalar, short-circuiting operator meant for `if` conditions;
    # the elementwise `&` is for vector masks.
    if (set$feature_count >= 5 && set$samples_count >= 15) {
        sets[[count]] <- simon_wide[set$features] %>%
            drop_na() %>%
            select(donor_id, outcome, everything())
        # Cross-check: complete rows found here vs. the count mulset reported.
        print(paste("rows:", nrow(sets[[count]]), ", samples by mulset:", set$samples_count))
        count <- count + 1
    }
}


# Split every candidate data set into train/test partitions with caret
# (p = 0.75, partitioned on the outcome factor) and keep only the splits
# whose test partition has at least 10 rows. The seed is fixed once before
# the loop, so the splits depend on the order of `sets`.
sets_partitions <- list()
set.seed(13121994)
for (candidate in sets) {
    candidate$outcome <- as.factor(candidate$outcome)
    train_idx <- createDataPartition(candidate$outcome, p = 0.75)[["Resample1"]]
    train_part <- candidate[train_idx, ]
    test_part <- candidate[-train_idx, ]

    # Skip splits whose held-out part is too small.
    if (nrow(test_part) < 10) {
        next
    }

    sets_partitions[[length(sets_partitions) + 1]] <- list(
        donors = candidate[["donor_id"]],
        train = train_part,
        test = test_part,
        totalOutcomes = table(candidate$outcome),
        trainingOutcomes = table(train_part$outcome),
        trainingRows = nrow(train_part),
        testOutcomes = table(test_part$outcome),
        testRows = nrow(test_part)
    )
}


# Build a one-row-per-partition summary and render it as a LaTeX table.
#
# Each row holds the data set index, its dimensions, and the counts of the
# first and second outcome level (labelled "low / high" in the headers) for
# the full set, the training part and the test part. The total column also
# shows the first level's share of all rows, rounded to two decimals.
#
# Rows are built as a list of one-row tibbles and bound once, on top of a
# typed zero-row prototype. The previous `tibble(dataset = c(), ...)` start
# value had no columns at all (NULL arguments are dropped), which add_row()
# cannot extend with new columns.
summary_rows <- lapply(seq_along(sets_partitions), function(i) {
    part <- sets_partitions[[i]]
    n_rows <- nrow(part[["train"]]) + nrow(part[["test"]])
    n_cols <- ncol(part[["train"]])
    total <- part[["totalOutcomes"]]
    training <- part[["trainingOutcomes"]]
    testing <- part[["testOutcomes"]]
    tibble(
        dataset = i,
        `Rows x Cols` = paste(n_rows, "x", n_cols),
        `total (low / high)` = paste(
            total[1], "/", total[2], "(", round(total[1] / n_rows, 2), ")"
        ),
        `train (low / high)` = paste(training[1], "/", training[2]),
        `test (low / high)` = paste(testing[1], "/", testing[2])
    )
})

# The prototype keeps the column set and types stable even when no partition
# was retained.
tbl_prototype <- tibble(
    dataset = integer(),
    `Rows x Cols` = character(),
    `total (low / high)` = character(),
    `train (low / high)` = character(),
    `test (low / high)` = character()
)
tbl <- bind_rows(c(list(tbl_prototype), summary_rows))

tbl %>%
    kable(format = "latex", booktabs = TRUE)