diff options
| author | Mike Vink <mike1994vink@gmail.com> | 2021-04-28 18:21:50 +0200 |
|---|---|---|
| committer | Mike Vink <mike1994vink@gmail.com> | 2021-04-28 18:21:50 +0200 |
| commit | de4565fe9290ec1f1031eed6f7d067794df53166 (patch) | |
| tree | 1211ea302fc77e4ce51a7bd88c6eff8abee3eb85 /scripts | |
| parent | 2676115f77f0052902e1dcc0632420341464373d (diff) | |
begin data prep
Diffstat (limited to 'scripts')
| -rw-r--r-- | scripts/data_prep.R | 31 | ||||
| -rw-r--r-- | scripts/data_prep_desc.R | 160 | ||||
| -rw-r--r-- | scripts/visit_inconsistencies.R | 1 |
3 files changed, 192 insertions, 0 deletions
diff --git a/scripts/data_prep.R b/scripts/data_prep.R
new file mode 100644
index 0000000..acd9977
--- /dev/null
+++ b/scripts/data_prep.R
@@ -0,0 +1,31 @@
+library(tidyverse)
+library(ggpubr)
+library(knitr)
+
+data_simon <- read_csv("../csv/simon_data_extra.csv", na = "NULL")
+data_mike <- read_csv("../csv/mike_repeat_visit.csv", na = "NULL")
+
+simon_wide <- data_simon %>%
+  group_by(donor_id) %>%
+  summarise(
+    dup = duplicated(data_name)
+  ) %>%
+  filter(dup)
+simon_wide
+
+dview <- data_simon %>%
+  group_by(donor_id) %>%
+  mutate(dup = duplicated(data_name)) %>%
+  filter(dup) %>%
+  arrange(data_name) %>%
+  filter(donor_id == 285 & data_name == "CD4_pos_T_cells") %>%
+  kable(format = "latex", booktabs = TRUE)
+
+%>%
+  rowid_to_column() %>%
+  pivot_wider(
+    id_cols = donor_id,
+    names_from = data_name,
+    values_from = data,
+  )
+simon_wide
diff --git a/scripts/data_prep_desc.R b/scripts/data_prep_desc.R
new file mode 100644
index 0000000..f0983ef
--- /dev/null
+++ b/scripts/data_prep_desc.R
@@ -0,0 +1,160 @@
+library(tidyverse)
+library(ggpubr)
+
+data_simon <- read_csv("../csv/simon_data_extra.csv", na = "NULL")
+data_mike <- read_csv("../csv/mike_repeat_visit.csv", na = "NULL")
+
+f <- function(x, height = 0.05) {
+  ans <- median(x)
+  data.frame(ymin = ans - height / 2, ymax = ans + height / 2, y = ans)
+}
+
+simon_year_class <- data_simon %>%
+  select(donor_id, year, outcome, hai_response) %>%
+  mutate(outcome = factor(outcome, levels = c(1, 0), labels = c("H", "L"))) %>%
+  mutate(year_flag = year) %>%
+  mutate(outcome_flag = outcome) %>%
+  group_by(year, outcome) %>%
+  summarise(
+    year_flag = year_flag,
+    outcome_flag = outcome_flag,
+    donor_id = donor_id,
+    response = hai_response
+  )
+simon_year_class
+
+
+xlabels <- simon_year_class %>%
+  group_map(
+    ~ {
+      rep(paste(.$outcome_flag[1], "\nn=(", length(unique(.$donor_id)), ")"), nrow(.))
+    }
+  )
+
+count_high_and_low <- data_simon %>%
+  group_by(outcome) %>%
+  summarise(
+    count = length(unique(donor_id))
+  )
+
+simon_plt <- simon_year_class %>%
+  ungroup() %>%
+  mutate(label = unlist(xlabels)) %>%
+  ggplot(aes(label, log2(response))) +
+  geom_violin(aes(fill = outcome, color = outcome), alpha = 0.2, show.legend = F) +
+  geom_boxplot(width = 0.1, show.legend = F) +
+  stat_summary(
+    fun.data = f, geom = "crossbar",
+    colour = NA, fill = "black", width = 0.3, alpha = 1.0
+  ) +
+  geom_point(aes(fill = outcome), color = "black", shape = 23, show.legend = F) +
+  facet_grid(cols = vars(year), scales = "free") +
+  theme_pubclean() +
+  labs(x = "", y = "Log2 HAI response difference", title = paste(
+    "Donor first visit data used in SIMON, total=(", length(unique(simon_year_class$donor_id)), ")",
+    ", high=(", count_high_and_low[2,2], ")", ", low=(", count_high_and_low[1,2],")"
+  ))
+
+mike_repeat_visits <- data_mike %>%
+  mutate(year_flag = year) %>%
+  group_by(donor_id, outcome) %>%
+  summarise(
+    count = length(unique(year_flag)),
+    outcome = unique(outcome)
+  ) %>%
+  arrange(donor_id) %>%
+  ungroup()
+mike_repeat_visits
+
+repeat_plt <- mike_repeat_visits %>%
+  transmute(
+    second = sum(count >= 1),
+    third = sum(count >= 2),
+    fourth = sum(count >= 3),
+    fifth = sum(count >= 5)
+  ) %>%
+  pivot_longer(
+    names_to = "visit",
+    values_to = "count",
+    cols = everything()
+  ) %>%
+  slice(1:4, ) %>%
+  ggplot(aes(factor(visit, levels=c("second", "third", "fourth", "fifth")), count, fill = visit)) +
+  geom_bar(stat="identity", show.legend=F) +
+  labs(x = "Repeat visit number", y = "Number of donors") +
+  theme_pubclean()
+repeat_plt
+
+mike_second_visit <- data_mike %>%
+  group_by(donor_id) %>%
+  filter(year == min(year))
+mike_second_visit
+
+mike_second_visit_year_class <- mike_second_visit %>%
+  select(donor_id, year, outcome, hai_response) %>%
+  mutate(outcome = factor(outcome, levels = c(1, 0), labels = c("H", "L"))) %>%
+  mutate(year_flag = year) %>%
+  mutate(outcome_flag = outcome) %>%
+  group_by(year, outcome) %>%
+  summarise(
+    year_flag = year_flag,
+    outcome_flag = outcome_flag,
+    donor_id = donor_id,
+    response = hai_response
+  )
+
+
+xlabels_mike <- mike_second_visit_year_class %>%
+  group_map(
+    ~ {
+      rep(paste(.$outcome_flag[1], "\nn=(", length(unique(.$donor_id)), ")"), nrow(.))
+    }
+  )
+
+count_high_and_low_mike <- mike_second_visit %>%
+  group_by(outcome) %>%
+  summarise(
+    count = length(unique(donor_id))
+  )
+
+f <- function(x, height = 0.02) {
+  ans <- median(x)
+  data.frame(ymin = ans - height / 2, ymax = ans + height / 2, y = ans)
+}
+
+
+mike_plt <- mike_second_visit_year_class %>%
+  ungroup() %>%
+  mutate(label = unlist(xlabels_mike)) %>%
+  ggplot(aes(label, log2(response))) +
+  geom_violin(aes(fill = outcome, color = outcome), alpha = 0.2, show.legend = F) +
+  geom_boxplot(width = 0.1, show.legend = F) +
+  stat_summary(
+    fun.data = f, geom = "crossbar",
+    colour = NA, fill = "black", width = 0.3, alpha = 1.0
+  ) +
+  geom_point(aes(fill = outcome), color = "black", shape = 23, show.legend = F) +
+  facet_grid(cols = vars(year), scales = "free") +
+  theme_pubclean() +
+  labs(x = "", y = "Log2 HAI response difference", title = paste(
+    "Donor second visit data used in this work, total=(", length(unique(mike_second_visit_year_class$donor_id)), ")",
+    ", high=(", count_high_and_low_mike[2,2], ")", ", low=(", count_high_and_low_mike[1,2],")"
+  ))
+mike_plt
+
+bottom <- ggarrange(
+  repeat_plt,
+  mike_plt,
+  ncol=2,
+  widths=c(1,2),
+  labels=c("B", "C")
+)
+
+whole <- ggarrange(
+  simon_plt,
+  bottom,
+  nrow=2,
+  heights=c(1,1),
+  labels=c("A", "")
+  )
+ggsave("../images/data_selection.png", whole, width = 2 * 15, height = 19, dpi=300, units = "cm")
diff --git a/scripts/visit_inconsistencies.R b/scripts/visit_inconsistencies.R
index c495d9f..df6628d 100644
--- a/scripts/visit_inconsistencies.R
+++ b/scripts/visit_inconsistencies.R
@@ -30,6 +30,7 @@ post_gmt_vec <- class_correct %>%
       }
     }
   )
+
 post_gmt_vec
 
 post_gmt_vec <- unlist(post_gmt_vec)
