checkpoint 27-04-21

author: Mike Vink <mike1994vink@gmail.com> 2021-04-27 18:31:00 +0200
committer: Mike Vink <mike1994vink@gmail.com> 2021-04-27 18:31:00 +0200
commit: 2676115f77f0052902e1dcc0632420341464373d (patch)
tree: ccbcedbcbbb82003003bd0049e6a72e571ada1fc /scripts
parent: 19c3d0dba64d782e519d8ece36028ebb25b33141 (diff)
8 files changed, 439 insertions, 20 deletions
diff --git a/scripts/assay_ranges.R b/scripts/assay_ranges.R
new file mode 100644
index 0000000..df52083
--- /dev/null
+++ b/scripts/assay_ranges.R
@@ -0,0 +1,38 @@
+library(tidyverse)
+library(ggpubr)
+
+data <- read_csv("../csv/data_per_assay_per_year_per_outcome.csv")
+
+tbl <- data %>%
+    mutate(
+        assay_type = factor(assay, labels = (
+            c(
+                "MSD (Arb. Intensity)",
+                "Multiplex cytokine (log2 Z-score)",
+                "Cell Cytometry (% parent popul.)",
+                "Multiplex cytokine (log2 Z-score)",
+                "Phospho cytometry (flow cytometery,\n 90 %tile fold-change)",
+                "Cell Cytometry (% parent popul.)",
+                "Phospho cytometry (Mass cytometry, arcsinh diff.)",
+                "Complete blood count (abs. count/uL)",
+                "MSD (Arb. Intensity)",
+                "Cell Cytometry (% parent popul.)",
+                "MSD (Arb. Intensity)",
+                "Multiplex cytokine (log2 Z-score)",
+                "Multiplex cytokine (log2 Z-score)",
+                "Cell Cytometry (% parent popul.)"
+            )
+        ))
+    ) %>%
+    filter(
+           !(assay_type=="Phospho cytometry (flow cytometery,\n 90 %tile fold-change)"
+           &
+            data > 10000)
+    )
+plt <- tbl %>%
+    ggplot(aes(data, fill=assay_type)) +
+    geom_histogram(show.legend=F) +
+    facet_wrap(~ factor(assay_type), scales="free") +
+    theme_pubclean()
+plt
+ggsave("../images/assay_value_distributions.png", plt, width = 2 * 15, height = 19, units = "cm")
diff --git a/scripts/demographics.R b/scripts/demographics.R
index d43b453..3aee883 100644
--- a/scripts/demographics.R
+++ b/scripts/demographics.R
@@ -1,5 +1,26 @@
 library(tidyverse)
+library(grid)
 library(ggpubr)
+library(patchwork)
+
+
+
+p1 <- tibble(x = 1:10, y = 1:10) %>%
+  ggplot(aes(x, y)) +
+  geom_point() +
+  scale_y_reverse(breaks = seq(1, 10)) +
+  labs(y = NULL)
+
+p2 <- tibble(ymin = c(0, 2.1, 7.1), ymax = c(1.9, 6.9, 10), fill = c("Gender", "Ethnicity", "CMV status")) %>%
+  ggplot() +
+  geom_rect(aes(xmin = 0, xmax = 1, ymin = ymin, ymax = ymax)) +
+  geom_text(aes(x = .5, y = (ymin  + ymax) / 2, label = fill), angle = 90) +
+  scale_y_reverse(breaks = seq(1, 10), expand = expansion(mult = c(0, 0))) +
+  scale_x_continuous(breaks = c(0), expand = expansion(mult = c(0, 0))) +
+  guides(fill = FALSE) +
+  theme_void()
+
+
 orig <- read_csv("../csv/donor_demo.csv")
 data <- orig
 tbl <- data %>%
@@ -8,7 +29,7 @@ tbl <- data %>%
         labels = c("CMV-", "CMV+", "CMV unknown")
     )) %>%
     na_if("NULL") %>%
-    replace_na(list(Ethnicity="Unknown")) %>%
+    replace_na(list(Ethnicity = "Unknown")) %>%
     pivot_longer(
         cols = c("Gender", "Ethnicity", "CMV status")
     ) %>%
@@ -19,7 +40,10 @@ tbl <- data %>%
         count = n(),
     ) %>%
     mutate(
-        total = if_else(Response == 1, sum(data$Response == 1), sum(data$Response == 0)),
+        total = if_else(Response == 1,
+            sum(data$Response == 1),
+            sum(data$Response == 0)
+        ),
         value = factor(value, levels = rev(c(
             "Female",
             "Male",
@@ -33,14 +57,22 @@ tbl <- data %>%
             "CMV-",
             "CMV unknown"
         ))),
-    ) %>%
-    ggplot(aes(y = value, fill = value)) +
-    geom_bar(aes(x = count / total), stat = "identity", show.legend = F) +
+    )
+
+factors_demo <- tbl %>%
+    ggplot(aes(fill = value)) +
+    geom_bar(aes(x = count / total, y = value), stat = "identity", show.legend = F) +
     theme_pubr() +
     scale_x_continuous(labels = scales::percent_format(accuracy = 1)) +
-    labs(x = "Distribution (%)", y="") +
-    facet_wrap(~factor(Response, labels=c(paste("Low responders (n=", sum(data$Response==0),")"), paste("High responders (n=", sum(data$Response==1),")"))))
-factors_demo <- tbl
+    labs(x = "Distribution (%)", y = "") +
+    facet_wrap(~ factor(Response,
+        labels =
+            c(
+                paste("Low responders (n=", sum(data$Response == 0), ")"),
+                paste("High responders (n=", sum(data$Response == 1), ")")
+            )
+    ))
+factors_demo <- p2 + factors_demo + plot_layout(widths= c(0.5,9))
 
 data <- orig
 tbl <- data %>%
@@ -52,25 +84,24 @@ tbl <- data %>%
         )
     ) %>%
     group_by(Age, Response_text) %>%
-    summarise(count = n(), Response=Response, Response_text=Response_text, Age=Age) %>%
+    summarise(count = n(), Response = Response, Response_text = Response_text, Age = Age) %>%
     mutate(
         total = if_else(Response == 0, sum(data$Response == 0), sum(data$Response == 1)),
-        percentage = n()/ total * 100
+        percentage = n() / total * 100
     ) %>%
-    ggplot(aes(x = Age, y=percentage, fill=factor(Response_text))) +
-    geom_polygon(show.legend=F) +
-    labs(x = "Age") +
+    ggplot(aes(x = Age, y = percentage, fill = factor(Response_text))) +
+    geom_polygon(show.legend = F) +
+    labs(x = "Age", y = "Percentage (%)") +
     theme_pubr() +
     facet_wrap(~Response_text)
 age_responder <- tbl
 
 figure <- ggarrange(
-          factors_demo,
-          age_responder,
-          ncol=1,
-          nrow=2,
-          labels=c("A", "B")
+    factors_demo,
+    age_responder,
+    ncol = 1,
+    nrow = 2,
+    labels = c("A", "B")
 )
 
-ggsave('../images/demographic.png', figure, width=2*15, height=19, units="cm")
-
+ggsave("../images/demographic.png", figure, width = 2 * 15, height = 19, units = "cm")
diff --git a/scripts/donors_head.R b/scripts/donors_head.R
new file mode 100644
index 0000000..818554d
--- /dev/null
+++ b/scripts/donors_head.R
@@ -0,0 +1,29 @@
+library(tidyverse)
+library(knitr)
+library(dlookr)
+
+data <- read_csv("../csv/visit_multiple_21.csv")
+
+tbl <- data %>%
+    select(
+        visit_id,
+        year = visit_year,
+        day = visit_day,
+        type = visit_type_hai,
+        age = age_round,
+        cmv = cmv_status,
+        ebv = ebv_status,
+        bmi,
+        vaccine,
+        geo_mean,
+        d_geo_mean,
+        response = vaccine_resp,
+        assay_data_rows = total_data
+    ) %>%
+kable(format = "latex", booktabs = TRUE)
+
+
+clip <- pipe("xclip -selection clipboard", "w")
+write(tbl, file = clip)
+close(clip)
+
diff --git a/scripts/experimental_description.R b/scripts/experimental_description.R
new file mode 100644
index 0000000..e685844
--- /dev/null
+++ b/scripts/experimental_description.R
@@ -0,0 +1,105 @@
+library(tidyverse)
+library(ggpubr)
+
+data <- read_csv("../csv/data_per_assay_per_year_per_outcome.csv")
+
+studies_data <- read_csv("../csv/donors_list.csv") %>%
+    group_by(study_id) %>%
+    summarise(
+              count = n()
+              )
+
+tbl <- data %>%
+    mutate(
+        assay_type = factor(assay, labels = (
+            c(
+                "MSD",
+                "Multiplex cytokine",
+                "Cell Cytometry",
+                "Multiplex cytokine",
+                "Phospho cytometry",
+                "Cell Cytometry",
+                "Phospho cytometry",
+                "Complete blood count",
+                "MSD",
+                "Cell Cytometry",
+                "MSD",
+                "Multiplex cytokine",
+                "Multiplex cytokine",
+                "Cell Cytometry"
+            )
+        ))
+    )
+
+id_plt <- tbl %>%
+    group_by(year, assay, outcome) %>%
+    summarise(
+        measurements = n(),
+        study = study
+    ) %>%
+    ggplot(aes(year, measurements / 1000, color = factor(assay))) +
+    geom_line() +
+    geom_point() +
+    ylim(-1, 10) +
+    facet_grid(rows = vars(factor(outcome, labels = c("low", "high")))) +
+    labs(y = "number of features (thousands)") +
+    scale_color_discrete(name="Assay type") +
+    theme_pubclean()
+
+type_plt <- tbl %>%
+    group_by(year, assay_type, outcome) %>%
+    summarise(
+        measurements = n(),
+        study = study
+    ) %>%
+    ggplot(aes(year, measurements / 1000, color = factor(assay_type))) +
+    geom_line(show.legend=F) +
+    geom_point(show.legend=F) +
+    ylim(-1, 10) +
+    facet_grid(rows = vars(factor(outcome, labels = c("low", "high")))) +
+    labs(y = "number of features (thousands)") +
+    scale_color_discrete(name="Assay type") +
+    theme_pubclean()
+
+study_plt <- tbl %>%
+    group_by(year, assay_type) %>%
+    summarise(
+        measurements = n(),
+        study = study
+    ) %>%
+    ggplot(aes(year, measurements / 1000, color = factor(assay_type))) +
+    geom_line() +
+    geom_point() +
+    ylim(-1, 10) +
+    facet_grid(rows = vars(factor(study, labels = c(
+        paste("SLVP015\nn=(", as.numeric(studies_data[studies_data$study_id == 15,][2]), ")"),
+        paste("SLVP017\nn=(", as.numeric(studies_data[studies_data$study_id == 17,][2]), ")"),
+        paste("SLVP018\nn=(", as.numeric(studies_data[studies_data$study_id == 18,][2]), ")"),
+        paste("SLVP021\nn=(", as.numeric(studies_data[studies_data$study_id == 21,][2]), ")"),
+        paste("SLVP024\nn=(", as.numeric(studies_data[studies_data$study_id == 24,][2]), ")"),
+        paste("SLVP028\nn=(", as.numeric(studies_data[studies_data$study_id == 28,][2]), ")"),
+        paste("SLVP029\nn=(", as.numeric(studies_data[studies_data$study_id == 29,][2]), ")"),
+        paste("SLVP030\nn=(", as.numeric(studies_data[studies_data$study_id == 30,][2]), ")")
+    )))) +
+    labs(y = "number of features (thousands)", legend="Assay type") +
+    scale_color_discrete(name="Assay type") +
+    theme_pubclean()
+
+
+top_half <- ggarrange(
+                      id_plt,
+                      type_plt,
+                      ncol=2,
+                      labels = c("A","B")
+          )
+
+whole <- ggarrange(
+                   top_half,
+                   study_plt,
+                   nrow=2,
+                   labels=c("", "C"),
+                   heights = c(1,2)
+)
+whole
+
+ggsave("../images/exp_data_numbers.pdf", whole, width = 21, height = 29.7, dpi=300, units = "cm")
diff --git a/scripts/repeat_investigation.R b/scripts/repeat_investigation.R
new file mode 100644
index 0000000..142d221
--- /dev/null
+++ b/scripts/repeat_investigation.R
@@ -0,0 +1,72 @@
+library(tidyverse)
+library(ggpubr)
+
+data <- read_csv("../csv/repeat_investigation.csv")
+
+studies_data <- read_csv("../csv/donors_list.csv") %>%
+    group_by(study_id) %>%
+    summarise(
+              count = n()
+              )
+
+tbl <- data %>%
+    na_if("NULL") %>%
+    select(
+           donor_id,
+           visit_year,
+           study_id,
+           visit_type_hai,
+           vaccine_resp
+    ) %>%
+    group_by(
+             donor_id, study_id, visit_year
+             ) %>%
+    summarise(
+              type = visit_type_hai,
+              response = vaccine_resp
+    )
+response_vec <- tbl %>%
+    group_map(
+              ~ {
+                  if (any(!is.na(.$response))) {
+                      rep(TRUE, nrow(.))
+                  } else {
+                      rep(FALSE, nrow(.))
+                  }
+              }
+    )
+tbl_pre <- tbl %>%
+    ungroup() %>%
+    mutate(response_recorded = unlist(response_vec)) %>%
+    filter(type == "pre" | type == "single") %>%
+    group_by(
+             donor_id, study_id
+             ) %>%
+    summarise(
+              count = n(),
+              year = visit_year,
+              type = type,
+              response = response,
+              recorded = response_recorded
+    )
+tbl_pre
+
+plt <- tbl_pre %>%
+    ggplot(aes(count, fill = factor(recorded, labels=c("No (for various reasons)", "Yes")))) +
+    geom_bar() +
+    facet_wrap(~ factor(study_id, labels = c(
+        paste("SLVP015\nn=(", as.numeric(studies_data[studies_data$study_id == 15,][2]), ")"),
+        paste("SLVP017\nn=(", as.numeric(studies_data[studies_data$study_id == 17,][2]), ")"),
+        paste("SLVP018\nn=(", as.numeric(studies_data[studies_data$study_id == 18,][2]), ")"),
+        paste("SLVP021\nn=(", as.numeric(studies_data[studies_data$study_id == 21,][2]), ")"),
+        paste("SLVP024\nn=(", as.numeric(studies_data[studies_data$study_id == 24,][2]), ")"),
+        paste("SLVP028\nn=(", as.numeric(studies_data[studies_data$study_id == 28,][2]), ")"),
+        paste("SLVP029\nn=(", as.numeric(studies_data[studies_data$study_id == 29,][2]), ")"),
+        paste("SLVP030\nn=(", as.numeric(studies_data[studies_data$study_id == 30,][2]), ")")
+    ))) +
+    labs(x="Seasons a donor visited", y = "Visits by donors with same amount of total seasons", fill = "Response classification available") +
+    theme_pubr() +
+    scale_x_continuous(breaks = scales::pretty_breaks(n = 8))
+plt
+
+ggsave("../images/repeat_visits_per_study.png", plt, width = 2*15, height = 25, dpi=300, units = "cm")
diff --git a/scripts/visit_166_21 b/scripts/visit_166_21
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/scripts/visit_166_21
diff --git a/scripts/visit_inconsistencies.R b/scripts/visit_inconsistencies.R
new file mode 100644
index 0000000..c495d9f
--- /dev/null
+++ b/scripts/visit_inconsistencies.R
@@ -0,0 +1,132 @@
+library(tidyverse)
+library(knitr)
+library(dlookr)
+library(xlsx)
+library(ggpubr)
+library(corrplot)
+
+data <- read_csv("../csv/visits_all.csv")
+
+class_correct <- data %>%
+    na_if("NULL") %>%
+    drop_na(vaccine_resp) %>%
+    group_by(donor_id, visit_year) %>%
+    summarise(
+              post = if_else(any(visit_type_hai == 'post'), TRUE, FALSE),
+              internal = visit_internal_id,
+              hai_type = visit_type_hai,
+              gmt = geo_mean,
+              fold_change = as.numeric(d_geo_mean),
+              response = vaccine_resp,
+    )
+
+post_gmt_vec <- class_correct %>%
+    group_map(
+              ~ {
+                  if (any(.$hai_type == 'post')) {
+                      rep(.$gmt[.$hai_type == 'post'] >= 40, nrow(.))
+                  } else {
+                      rep(NA, nrow(.))
+                  }
+              }
+    )
+post_gmt_vec
+
+post_gmt_vec <- unlist(post_gmt_vec)
+
+class_correct <- class_correct %>%
+    ungroup() %>%
+    mutate(
+           check_post = if_else(post, 'Post visit', 'No post visit'),
+           post_gmt = post_gmt_vec,
+           response = recode(factor(response), '0'='Low', '1'='High'),
+           check_correct = if_else(
+                                   (!is.na(post_gmt) & (!post_gmt | fold_change < 4) & response == 'Low') |
+                                       (!is.na(post_gmt) & (post_gmt & fold_change >= 4) & response == 'High'),
+                                   'Correct clasification',
+                                   'False classification'
+                                   ),
+           )
+
+seasonal_classification_data <- class_correct %>%
+    group_by(donor_id, visit_year) %>%
+    summarise(
+              response=response,
+              check_correct=check_correct,
+              check_post
+    ) %>%
+    ungroup() %>%
+    distinct()
+
+seasonal_classification_plot <- seasonal_classification_data %>%
+    ggplot(aes(response, fill = response)) +
+    geom_bar() +
+    geom_text(stat='count', aes(label=paste("n = (",stat(count),")")), vjust= +1) +
+    facet_grid(factor(check_correct) ~ factor(check_post)) +
+    theme_pubr() +
+    labs(title="Seasonal classification of donors by correctness and post visit")
+seasonal_classification_plot
+
+ggsave("../images/season_classification.png", seasonal_classification_plot, width = 2 * 15, height = 19, units = "cm")
+
+incorrect_cases <- class_correct %>%
+    filter(check_correct == "False classification")
+
+write.xlsx(incorrect_cases, "../incorrect_visits.xlsx", sheetName="sheet1", col.names=T, row.names=T, append=F)
+
+
+
+desc <- data %>%
+    na_if("NULL") %>%
+    mutate(across(everything(), as.numeric)) %>%
+    mutate(geo_mean = na_if(geo_mean, 0)) %>%
+    select(age, cmv_status, ebv_status, bmi, vaccine, geo_mean, d_geo_mean, vaccine_resp, total_data) %>%
+    describe() %>%
+    column_to_rownames("variable") %>%
+    t() %>%
+    as_tibble(rownames="stat") %>%
+    slice(1:8) %>%
+    mutate(across(2:10, round, 1)) %>%
+kable(format = "latex", booktabs = TRUE)
+clip <- pipe("xclip -selection clipboard", "w")
+write(desc, file = clip)
+close(clip)
+
+
+corr_pre <- data %>%
+    select(visit_type_hai, age, vaccine, geo_mean, d_geo_mean, vaccine_resp, total_data) %>%
+    na_if("NULL") %>%
+    mutate(across(2:length(.), as.numeric)) %>%
+    filter(!is.na(vaccine_resp) & !is.na(vaccine) & !is.na(d_geo_mean) & !is.na(vaccine_resp) & visit_type_hai == "pre") %>%
+    select(-visit_type_hai)
+
+corr_pre
+
+
+library(gridGraphics)
+grab_grob <- function(){
+  grid.echo()
+  grid.grab()
+}
+
+corr_post <- data %>%
+    select(visit_type_hai, age, vaccine, geo_mean, d_geo_mean, vaccine_resp, total_data) %>%
+    na_if("NULL") %>%
+    mutate(across(2:length(.), as.numeric)) %>%
+    filter(!is.na(vaccine_resp) & !is.na(vaccine) & !is.na(d_geo_mean) & !is.na(vaccine_resp) & visit_type_hai == "post") %>%
+    select(-visit_type_hai)
+corr_post
+
+
+png(file = "../images/corr_plot_visits_pre_complete.png",
+    height = 1.5 * 19,
+    width = 2 * 15,
+    res = 1200,
+    units = "cm",
+)
+par(mfrow=c(2,1))
+M <- cor(corr_pre)
+corrplot.mixed(M)
+M <- cor(corr_post)
+corrplot.mixed(M)
+dev.off()
diff --git a/scripts/whole_experimental_table.R b/scripts/whole_experimental_table.R
new file mode 100644
index 0000000..63d9ce9
--- /dev/null
+++ b/scripts/whole_experimental_table.R
@@ -0,0 +1,12 @@
+library(tidyverse)
+
+data <- read_csv("../csv/data_per_assay_per_year_per_outcome.csv")
+
+tbl <- data %>%
+    rowid_to_column() %>%
+    pivot_wider(
+                id_cols = rowid,
+                names_from = name,
+                values_from = data,
+    )
+
author	Mike Vink <mike1994vink@gmail.com>	2021-04-27 18:31:00 +0200
committer	Mike Vink <mike1994vink@gmail.com>	2021-04-27 18:31:00 +0200
commit	2676115f77f0052902e1dcc0632420341464373d (patch)
tree	ccbcedbcbbb82003003bd0049e6a72e571ada1fc /scripts
parent	19c3d0dba64d782e519d8ece36028ebb25b33141 (diff)