1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
|
library(tidyverse)
library(ggpubr)
library(knitr)
library(mulset)
library(caret)
# Load the two raw measurement tables. Cells containing the literal text
# "NULL" are read as missing values (NA).
data_simon <- read_csv(file = "../csv/simon_data_extra.csv", na = "NULL")
data_mike <- read_csv(file = "../csv/mike_repeat_visit.csv", na = "NULL")
# Average `data` within every (donor_id, data_name) group so that each donor
# has at most one value per measurement name. `mean()` is called without
# `na.rm`, so a group containing an NA yields NA.
simon_agg <- summarise(
  group_by(data_simon, donor_id, data_name),
  data = mean(data)
)
# One outcome label per donor, ordered by donor_id.
#
# `distinct()` replaces `summarise(outcome = unique(outcome))`: the latter
# returns several rows for a donor that carries more than one distinct
# outcome, which breaks the one-row-per-donor assumption of later steps.
simon_outcome <- data_simon %>%
  distinct(donor_id, outcome) %>%
  arrange(donor_id)

# Fail fast with a clear message instead of letting conflicting labels
# surface later as a size mismatch.
if (anyDuplicated(simon_outcome$donor_id) > 0) {
  stop(
    "data_simon contains donors with more than one distinct outcome",
    call. = FALSE
  )
}

# Print the lookup table for inspection.
simon_outcome
# Wide table: one row per donor, one column per `data_name` (holding the
# aggregated value), plus the donor's outcome converted to numeric.
#
# The outcome is attached with a keyed join on donor_id rather than by row
# position, so the result no longer depends on simon_outcome and the pivoted
# table happening to list the same donors in the same order.
outcome_by_donor <- simon_outcome %>%
  transmute(donor_id, outcome = as.numeric(outcome))

# A duplicated key would make the join multiply rows.
if (anyDuplicated(outcome_by_donor$donor_id) > 0) {
  stop("simon_outcome has more than one row for some donor_id", call. = FALSE)
}

simon_wide <- simon_agg %>%
  pivot_wider(
    id_cols = donor_id,
    names_from = data_name,
    values_from = data
  ) %>%
  ungroup()

# Every donor must have a label; otherwise the join would silently add NAs.
if (!all(simon_wide$donor_id %in% outcome_by_donor$donor_id)) {
  stop("some donors in simon_agg have no row in simon_outcome", call. = FALSE)
}

simon_wide <- simon_wide %>%
  left_join(outcome_by_donor, by = "donor_id") %>%
  arrange(donor_id) %>%
  select(donor_id, outcome, everything())
# Sparsity of the wide Simon table: number of missing cells, total number of
# cells (all columns, including donor_id and outcome), and their ratio.
simon_nas <- sum(is.na(simon_wide))
simon_cells <- nrow(simon_wide) * ncol(simon_wide)
simon_sparseness <- simon_nas / simon_cells
# Inspection view for donor 285: rows of data_simon whose data_name has
# already appeared earlier for the same donor, sorted by data_name.
# (A further restriction to data_name == "CD4_pos_T_cells" was left disabled.)
dview <- data_simon %>%
  group_by(donor_id) %>%
  mutate(dup = duplicated(data_name)) %>%
  filter(dup, donor_id == 285) %>%
  arrange(data_name)
# Average `data` within every (donor_id, name_formatted) group. `mean()` is
# called without `na.rm`, so a group containing an NA yields NA.
mike_agg <- summarise(
  group_by(data_mike, donor_id, name_formatted),
  data = mean(data)
)
# Wide table: one row per donor, one column per `name_formatted` holding the
# aggregated value.
mike_wide <- pivot_wider(
  mike_agg,
  id_cols = donor_id,
  names_from = name_formatted,
  values_from = data
)
# Sparsity of the wide Mike table: number of missing cells, total number of
# cells (all columns, including donor_id), and their ratio.
mike_nas <- sum(is.na(mike_wide))
mike_cells <- nrow(mike_wide) * ncol(mike_wide)
mike_sparseness <- mike_nas / mike_cells
# Run mulset on the full wide table (donor_id and outcome included, since no
# columns are excluded here). The result is iterated below as a collection of
# candidate sets; each one is read through `$features`, `$feature_count` and
# `$samples_count`.
# NOTE(review): presumably each candidate is a group of columns that are
# jointly non-missing for a group of donors; confirm against the mulset
# documentation.
simon_mulset <- mulset(simon_wide)
# Fields available on a single candidate, kept for interactive inspection:
# set1 <- simon_mulset$`1`
# set1$features_hash
# set1$feature_count
# set1$features
# set1$samples
# set1$samples_count
# set1$datapoints
# Materialise every mulset candidate that has at least 5 columns and at least
# 15 samples as a complete-case data frame. `set$features` is used to pick
# the columns, so it must contain donor_id and outcome for the final
# `select()` to succeed.
sets <- list()
count <- 1
for (set in simon_mulset) {
  # `&&` is the scalar, short-circuiting operator intended for `if`
  # conditions; `&` is the elementwise form meant for vector masks.
  if (set$feature_count >= 5 && set$samples_count >= 15) {
    sets[[count]] <- simon_wide[set$features] %>%
      drop_na() %>%
      select(donor_id, outcome, everything())
    # Sanity check: rows left after drop_na() next to the sample count that
    # mulset reported for this candidate.
    print(paste("rows:", nrow(sets[[count]]), ", samples by mulset:", set$samples_count))
    count <- count + 1
  }
}
# Split every candidate dataset into train/test partitions with caret
# (p = 0.75 for training, partitioned on the outcome factor) and keep only
# the datasets whose test partition has at least 10 rows. The seed is fixed
# so the partitions are reproducible.
sets_partitions <- list()
set.seed(13121994)
count <- 1
for (set in sets) {
  set$outcome <- as.factor(set$outcome)
  partitions <- createDataPartition(set$outcome, p = 0.75)
  training_rows <- partitions[["Resample1"]]
  train <- set[training_rows, ]
  test <- set[-training_rows, ]

  # Skip datasets whose held-out part is too small.
  if (nrow(test) < 10) {
    next
  }

  # Store the partitions together with their class counts and sizes.
  sets_partitions[[count]] <- list(
    donors = set$donor_id,
    train = train,
    test = test,
    totalOutcomes = table(set$outcome),
    trainingOutcomes = table(train$outcome),
    trainingRows = nrow(train),
    testOutcomes = table(test$outcome),
    testRows = nrow(test)
  )
  count <- count + 1
}
# Summary table with one row per retained dataset: its dimensions and the
# class counts in the full, training and test data.
#
# The columns are declared with explicit empty types. `c()` is NULL, and a
# NULL column is dropped by `tibble()`, which would leave a zero-column
# tibble that `add_row()` refuses to extend with new columns.
# NOTE(review): the "low / high" labels assume the first outcome level is
# "low" and the second "high"; confirm against the outcome coding.
tbl <- tibble(
  dataset = numeric(),
  `Rows x Cols` = character(),
  `total (low / high)` = character(),
  `train (low / high)` = character(),
  `test (low / high)` = character()
)
count <- 1
for (set in sets_partitions) {
  rows <- nrow(set[["train"]]) + nrow(set[["test"]])
  cols <- ncol(set[["train"]])
  rowsxcols <- paste(rows, "x", cols)
  # Counts of the first / second outcome level, followed by the share of the
  # first level among all rows.
  tot <- paste(
    set[["totalOutcomes"]][1], "/", set[["totalOutcomes"]][2],
    "(", round((set[["totalOutcomes"]][1]) / rows, 2), ")"
  )
  train <- paste(set[["trainingOutcomes"]][1], "/", set[["trainingOutcomes"]][2])
  test <- paste(set[["testOutcomes"]][1], "/", set[["testOutcomes"]][2])
  tbl <- tbl %>% add_row(
    dataset = count,
    `Rows x Cols` = rowsxcols,
    `total (low / high)` = tot,
    `train (low / high)` = train,
    `test (low / high)` = test
  )
  count <- count + 1
}

# Render the summary as a LaTeX booktabs table.
tbl %>%
  kable(format = "latex", booktabs = TRUE)
|