diff options
| author | Mike Vink <mike1994vink@gmail.com> | 2021-04-27 18:31:00 +0200 |
|---|---|---|
| committer | Mike Vink <mike1994vink@gmail.com> | 2021-04-27 18:31:00 +0200 |
| commit | 2676115f77f0052902e1dcc0632420341464373d (patch) | |
| tree | ccbcedbcbbb82003003bd0049e6a72e571ada1fc /scripts | |
| parent | 19c3d0dba64d782e519d8ece36028ebb25b33141 (diff) | |
checkpoint 27-04-21
Diffstat (limited to 'scripts')
| -rw-r--r-- | scripts/assay_ranges.R | 38 | ||||
| -rw-r--r-- | scripts/demographics.R | 71 | ||||
| -rw-r--r-- | scripts/donors_head.R | 29 | ||||
| -rw-r--r-- | scripts/experimental_description.R | 105 | ||||
| -rw-r--r-- | scripts/repeat_investigation.R | 72 | ||||
| -rw-r--r-- | scripts/visit_166_21 | 0 | ||||
| -rw-r--r-- | scripts/visit_inconsistencies.R | 132 | ||||
| -rw-r--r-- | scripts/whole_experimental_table.R | 12 |
8 files changed, 439 insertions, 20 deletions
diff --git a/scripts/assay_ranges.R b/scripts/assay_ranges.R new file mode 100644 index 0000000..df52083 --- /dev/null +++ b/scripts/assay_ranges.R @@ -0,0 +1,38 @@ +library(tidyverse) +library(ggpubr) + +data <- read_csv("../csv/data_per_assay_per_year_per_outcome.csv") + +tbl <- data %>% + mutate( + assay_type = factor(assay, labels = ( + c( + "MSD (Arb. Intensity)", + "Multiplex cytokine (log2 Z-score)", + "Cell Cytometry (% parent popul.)", + "Multiplex cytokine (log2 Z-score)", + "Phospho cytometry (flow cytometery,\n 90 %tile fold-change)", + "Cell Cytometry (% parent popul.)", + "Phospho cytometry (Mass cytometry, arcsinh diff.)", + "Complete blood count (abs. count/uL)", + "MSD (Arb. Intensity)", + "Cell Cytometry (% parent popul.)", + "MSD (Arb. Intensity)", + "Multiplex cytokine (log2 Z-score)", + "Multiplex cytokine (log2 Z-score)", + "Cell Cytometry (% parent popul.)" + ) + )) + ) %>% + filter( + !(assay_type=="Phospho cytometry (flow cytometery,\n 90 %tile fold-change)" + & + data > 10000) + ) +plt <- tbl %>% + ggplot(aes(data, fill=assay_type)) + + geom_histogram(show.legend=F) + + facet_wrap(~ factor(assay_type), scales="free") + + theme_pubclean() +plt +ggsave("../images/assay_value_distributions.png", plt, width = 2 * 15, height = 19, units = "cm") diff --git a/scripts/demographics.R b/scripts/demographics.R index d43b453..3aee883 100644 --- a/scripts/demographics.R +++ b/scripts/demographics.R @@ -1,5 +1,26 @@ library(tidyverse) +library(grid) library(ggpubr) +library(patchwork) + + + +p1 <- tibble(x = 1:10, y = 1:10) %>% + ggplot(aes(x, y)) + + geom_point() + + scale_y_reverse(breaks = seq(1, 10)) + + labs(y = NULL) + +p2 <- tibble(ymin = c(0, 2.1, 7.1), ymax = c(1.9, 6.9, 10), fill = c("Gender", "Ethnicity", "CMV status")) %>% + ggplot() + + geom_rect(aes(xmin = 0, xmax = 1, ymin = ymin, ymax = ymax)) + + geom_text(aes(x = .5, y = (ymin + ymax) / 2, label = fill), angle = 90) + + scale_y_reverse(breaks = seq(1, 10), expand = expansion(mult = c(0, 0))) + + scale_x_continuous(breaks = c(0), expand = expansion(mult = c(0, 0))) + + guides(fill = FALSE) + + theme_void() + + orig <- read_csv("../csv/donor_demo.csv") data <- orig tbl <- data %>% @@ -8,7 +29,7 @@ tbl <- data %>% labels = c("CMV-", "CMV+", "CMV unknown") )) %>% na_if("NULL") %>% - replace_na(list(Ethnicity="Unknown")) %>% + replace_na(list(Ethnicity = "Unknown")) %>% pivot_longer( cols = c("Gender", "Ethnicity", "CMV status") ) %>% @@ -19,7 +40,10 @@ tbl <- data %>% count = n(), ) %>% mutate( - total = if_else(Response == 1, sum(data$Response == 1), sum(data$Response == 0)), + total = if_else(Response == 1, + sum(data$Response == 1), + sum(data$Response == 0) + ), value = factor(value, levels = rev(c( "Female", "Male", @@ -33,14 +57,22 @@ tbl <- data %>% "CMV-", "CMV unknown" ))), - ) %>% - ggplot(aes(y = value, fill = value)) + - geom_bar(aes(x = count / total), stat = "identity", show.legend = F) + + ) + +factors_demo <- tbl %>% + ggplot(aes(fill = value)) + + geom_bar(aes(x = count / total, y = value), stat = "identity", show.legend = F) + theme_pubr() + scale_x_continuous(labels = scales::percent_format(accuracy = 1)) + - labs(x = "Distribution (%)", y="") + - facet_wrap(~factor(Response, labels=c(paste("Low responders (n=", sum(data$Response==0),")"), paste("High responders (n=", sum(data$Response==1),")")))) -factors_demo <- tbl + labs(x = "Distribution (%)", y = "") + + facet_wrap(~ factor(Response, + labels = + c( + paste("Low responders (n=", sum(data$Response == 0), ")"), + paste("High responders (n=", sum(data$Response == 1), ")") + ) + )) +factors_demo <- p2 + factors_demo + plot_layout(widths= c(0.5,9)) data <- orig tbl <- data %>% @@ -52,25 +84,24 @@ tbl <- data %>% ) ) %>% group_by(Age, Response_text) %>% - summarise(count = n(), Response=Response, Response_text=Response_text, Age=Age) %>% + summarise(count = n(), Response = Response, Response_text = Response_text, Age = Age) %>% mutate( total = if_else(Response == 0, sum(data$Response == 0), sum(data$Response == 1)), - percentage = n()/ total * 100 + percentage = n() / total * 100 ) %>% - ggplot(aes(x = Age, y=percentage, fill=factor(Response_text))) + - geom_polygon(show.legend=F) + - labs(x = "Age") + + ggplot(aes(x = Age, y = percentage, fill = factor(Response_text))) + + geom_polygon(show.legend = F) + + labs(x = "Age", y = "Percentage (%)") + theme_pubr() + facet_wrap(~Response_text) age_responder <- tbl figure <- ggarrange( - factors_demo, - age_responder, - ncol=1, - nrow=2, - labels=c("A", "B") + factors_demo, + age_responder, + ncol = 1, + nrow = 2, + labels = c("A", "B") ) -ggsave('../images/demographic.png', figure, width=2*15, height=19, units="cm") - +ggsave("../images/demographic.png", figure, width = 2 * 15, height = 19, units = "cm") diff --git a/scripts/donors_head.R b/scripts/donors_head.R new file mode 100644 index 0000000..818554d --- /dev/null +++ b/scripts/donors_head.R @@ -0,0 +1,29 @@ +library(tidyverse) +library(knitr) +library(dlookr) + +data <- read_csv("../csv/visit_multiple_21.csv") + +tbl <- data %>% + select( + visit_id, + year = visit_year, + day = visit_day, + type = visit_type_hai, + age = age_round, + cmv = cmv_status, + ebv = ebv_status, + bmi, + vaccine, + geo_mean, + d_geo_mean, + response = vaccine_resp, + assay_data_rows = total_data + ) %>% +kable(format = "latex", booktabs = TRUE) + + +clip <- pipe("xclip -selection clipboard", "w") +write(tbl, file = clip) +close(clip) + diff --git a/scripts/experimental_description.R b/scripts/experimental_description.R new file mode 100644 index 0000000..e685844 --- /dev/null +++ b/scripts/experimental_description.R @@ -0,0 +1,105 @@ +library(tidyverse) +library(ggpubr) + +data <- read_csv("../csv/data_per_assay_per_year_per_outcome.csv") + +studies_data <- read_csv("../csv/donors_list.csv") %>% + group_by(study_id) %>% + summarise( + count = n() + ) + +tbl <- data %>% + mutate( + assay_type = factor(assay, labels = ( + c( + "MSD", + "Multiplex cytokine", + "Cell Cytometry", + "Multiplex cytokine", + "Phospho cytometry", + "Cell Cytometry", + "Phospho cytometry", + "Complete blood count", + "MSD", + "Cell Cytometry", + "MSD", + "Multiplex cytokine", + "Multiplex cytokine", + "Cell Cytometry" + ) + )) + ) + +id_plt <- tbl %>% + group_by(year, assay, outcome) %>% + summarise( + measurements = n(), + study = study + ) %>% + ggplot(aes(year, measurements / 1000, color = factor(assay))) + + geom_line() + + geom_point() + + ylim(-1, 10) + + facet_grid(rows = vars(factor(outcome, labels = c("low", "high")))) + + labs(y = "number of features (thousands)") + + scale_color_discrete(name="Assay type") + + theme_pubclean() + +type_plt <- tbl %>% + group_by(year, assay_type, outcome) %>% + summarise( + measurements = n(), + study = study + ) %>% + ggplot(aes(year, measurements / 1000, color = factor(assay_type))) + + geom_line(show.legend=F) + + geom_point(show.legend=F) + + ylim(-1, 10) + + facet_grid(rows = vars(factor(outcome, labels = c("low", "high")))) + + labs(y = "number of features (thousands)") + + scale_color_discrete(name="Assay type") + + theme_pubclean() + +study_plt <- tbl %>% + group_by(year, assay_type) %>% + summarise( + measurements = n(), + study = study + ) %>% + ggplot(aes(year, measurements / 1000, color = factor(assay_type))) + + geom_line() + + geom_point() + + ylim(-1, 10) + + facet_grid(rows = vars(factor(study, labels = c( + paste("SLVP015\nn=(", as.numeric(studies_data[studies_data$study_id == 15,][2]), ")"), + paste("SLVP017\nn=(", as.numeric(studies_data[studies_data$study_id == 17,][2]), ")"), + paste("SLVP018\nn=(", as.numeric(studies_data[studies_data$study_id == 18,][2]), ")"), + paste("SLVP021\nn=(", as.numeric(studies_data[studies_data$study_id == 21,][2]), ")"), + paste("SLVP024\nn=(", as.numeric(studies_data[studies_data$study_id == 24,][2]), ")"), + paste("SLVP028\nn=(", as.numeric(studies_data[studies_data$study_id == 28,][2]), ")"), + paste("SLVP029\nn=(", as.numeric(studies_data[studies_data$study_id == 29,][2]), ")"), + paste("SLVP030\nn=(", as.numeric(studies_data[studies_data$study_id == 30,][2]), ")") + )))) + + labs(y = "number of features (thousands)", legend="Assay type") + + scale_color_discrete(name="Assay type") + + theme_pubclean() + + +top_half <- ggarrange( + id_plt, + type_plt, + ncol=2, + labels = c("A","B") + ) + +whole <- ggarrange( + top_half, + study_plt, + nrow=2, + labels=c("", "C"), + heights = c(1,2) +) +whole + +ggsave("../images/exp_data_numbers.pdf", whole, width = 21, height = 29.7, dpi=300, units = "cm") diff --git a/scripts/repeat_investigation.R b/scripts/repeat_investigation.R new file mode 100644 index 0000000..142d221 --- /dev/null +++ b/scripts/repeat_investigation.R @@ -0,0 +1,72 @@ +library(tidyverse) +library(ggpubr) + +data <- read_csv("../csv/repeat_investigation.csv") + +studies_data <- read_csv("../csv/donors_list.csv") %>% + group_by(study_id) %>% + summarise( + count = n() + ) + +tbl <- data %>% + na_if("NULL") %>% + select( + donor_id, + visit_year, + study_id, + visit_type_hai, + vaccine_resp + ) %>% + group_by( + donor_id, study_id, visit_year + ) %>% + summarise( + type = visit_type_hai, + response = vaccine_resp + ) +response_vec <- tbl %>% + group_map( + ~ { + if (any(!is.na(.$response))) { + rep(TRUE, nrow(.)) + } else { + rep(FALSE, nrow(.)) + } + } + ) +tbl_pre <- tbl %>% + ungroup() %>% + mutate(response_recorded = unlist(response_vec)) %>% + filter(type == "pre" | type == "single") %>% + group_by( + donor_id, study_id + ) %>% + summarise( + count = n(), + year = visit_year, + type = type, + response = response, + recorded = response_recorded + ) +tbl_pre + +plt <- tbl_pre %>% + ggplot(aes(count, fill = factor(recorded, labels=c("No (for various reasons)", "Yes")))) + + geom_bar() + + facet_wrap(~ factor(study_id, labels = c( + paste("SLVP015\nn=(", as.numeric(studies_data[studies_data$study_id == 15,][2]), ")"), + paste("SLVP017\nn=(", as.numeric(studies_data[studies_data$study_id == 17,][2]), ")"), + paste("SLVP018\nn=(", as.numeric(studies_data[studies_data$study_id == 18,][2]), ")"), + paste("SLVP021\nn=(", as.numeric(studies_data[studies_data$study_id == 21,][2]), ")"), + paste("SLVP024\nn=(", as.numeric(studies_data[studies_data$study_id == 24,][2]), ")"), + paste("SLVP028\nn=(", as.numeric(studies_data[studies_data$study_id == 28,][2]), ")"), + paste("SLVP029\nn=(", as.numeric(studies_data[studies_data$study_id == 29,][2]), ")"), + paste("SLVP030\nn=(", as.numeric(studies_data[studies_data$study_id == 30,][2]), ")") + ))) + + labs(x="Seasons a donor visited", y = "Visits by donors with same amount of total seasons", fill = "Response classification available") + + theme_pubr() + + scale_x_continuous(breaks = scales::pretty_breaks(n = 8)) +plt + +ggsave("../images/repeat_visits_per_study.png", plt, width = 2*15, height = 25, dpi=300, units = "cm") diff --git a/scripts/visit_166_21 b/scripts/visit_166_21 new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/scripts/visit_166_21 diff --git a/scripts/visit_inconsistencies.R b/scripts/visit_inconsistencies.R new file mode 100644 index 0000000..c495d9f --- /dev/null +++ b/scripts/visit_inconsistencies.R @@ -0,0 +1,132 @@ +library(tidyverse) +library(knitr) +library(dlookr) +library(xlsx) +library(ggpubr) +library(corrplot) + +data <- read_csv("../csv/visits_all.csv") + +class_correct <- data %>% + na_if("NULL") %>% + drop_na(vaccine_resp) %>% + group_by(donor_id, visit_year) %>% + summarise( + post = if_else(any(visit_type_hai == 'post'), TRUE, FALSE), + internal = visit_internal_id, + hai_type = visit_type_hai, + gmt = geo_mean, + fold_change = as.numeric(d_geo_mean), + response = vaccine_resp, + ) + +post_gmt_vec <- class_correct %>% + group_map( + ~ { + if (any(.$hai_type == 'post')) { + rep(.$gmt[.$hai_type == 'post'] >= 40, nrow(.)) + } else { + rep(NA, nrow(.)) + } + } + ) +post_gmt_vec + +post_gmt_vec <- unlist(post_gmt_vec) + +class_correct <- class_correct %>% + ungroup() %>% + mutate( + check_post = if_else(post, 'Post visit', 'No post visit'), + post_gmt = post_gmt_vec, + response = recode(factor(response), '0'='Low', '1'='High'), + check_correct = if_else( + (!is.na(post_gmt) & (!post_gmt | fold_change < 4) & response == 'Low') | + (!is.na(post_gmt) & (post_gmt & fold_change >= 4) & response == 'High'), + 'Correct clasification', + 'False classification' + ), + ) + +seasonal_classification_data <- class_correct %>% + group_by(donor_id, visit_year) %>% + summarise( + response=response, + check_correct=check_correct, + check_post + ) %>% + ungroup() %>% + distinct() + +seasonal_classification_plot <- seasonal_classification_data %>% + ggplot(aes(response, fill = response)) + + geom_bar() + + geom_text(stat='count', aes(label=paste("n = (",stat(count),")")), vjust= +1) + + facet_grid(factor(check_correct) ~ factor(check_post)) + + theme_pubr() + + labs(title="Seasonal classification of donors by correctness and post visit") +seasonal_classification_plot + +ggsave("../images/season_classification.png", seasonal_classification_plot, width = 2 * 15, height = 19, units = "cm") + +incorrect_cases <- class_correct %>% + filter(check_correct == "False classification") + +write.xlsx(incorrect_cases, "../incorrect_visits.xlsx", sheetName="sheet1", col.names=T, row.names=T, append=F) + + + +desc <- data %>% + na_if("NULL") %>% + mutate(across(everything(), as.numeric)) %>% + mutate(geo_mean = na_if(geo_mean, 0)) %>% + select(age, cmv_status, ebv_status, bmi, vaccine, geo_mean, d_geo_mean, vaccine_resp, total_data) %>% + describe() %>% + column_to_rownames("variable") %>% + t() %>% + as_tibble(rownames="stat") %>% + slice(1:8) %>% + mutate(across(2:10, round, 1)) %>% +kable(format = "latex", booktabs = TRUE) +clip <- pipe("xclip -selection clipboard", "w") +write(desc, file = clip) +close(clip) + + +corr_pre <- data %>% + select(visit_type_hai, age, vaccine, geo_mean, d_geo_mean, vaccine_resp, total_data) %>% + na_if("NULL") %>% + mutate(across(2:length(.), as.numeric)) %>% + filter(!is.na(vaccine_resp) & !is.na(vaccine) & !is.na(d_geo_mean) & !is.na(vaccine_resp) & visit_type_hai == "pre") %>% + select(-visit_type_hai) + +corr_pre + + +library(gridGraphics) +grab_grob <- function(){ + grid.echo() + grid.grab() +} + +corr_post <- data %>% + select(visit_type_hai, age, vaccine, geo_mean, d_geo_mean, vaccine_resp, total_data) %>% + na_if("NULL") %>% + mutate(across(2:length(.), as.numeric)) %>% + filter(!is.na(vaccine_resp) & !is.na(vaccine) & !is.na(d_geo_mean) & !is.na(vaccine_resp) & visit_type_hai == "post") %>% + select(-visit_type_hai) +corr_post + + +png(file = "../images/corr_plot_visits_pre_complete.png", + height = 1.5 * 19, + width = 2 * 15, + res = 1200, + units = "cm", +) +par(mfrow=c(2,1)) +M <- cor(corr_pre) +corrplot.mixed(M) +M <- cor(corr_post) +corrplot.mixed(M) +dev.off() diff --git a/scripts/whole_experimental_table.R b/scripts/whole_experimental_table.R new file mode 100644 index 0000000..63d9ce9 --- /dev/null +++ b/scripts/whole_experimental_table.R @@ -0,0 +1,12 @@ +library(tidyverse) + +data <- read_csv("../csv/data_per_assay_per_year_per_outcome.csv") + +tbl <- data %>% + rowid_to_column() %>% + pivot_wider( + id_cols = rowid, + names_from = name, + values_from = data, + ) + |
