summaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
authorMike Vink <mike1994vink@gmail.com>2021-04-27 18:31:00 +0200
committerMike Vink <mike1994vink@gmail.com>2021-04-27 18:31:00 +0200
commit2676115f77f0052902e1dcc0632420341464373d (patch)
treeccbcedbcbbb82003003bd0049e6a72e571ada1fc /scripts
parent19c3d0dba64d782e519d8ece36028ebb25b33141 (diff)
checkpoint 27-04-21
Diffstat (limited to 'scripts')
-rw-r--r--scripts/assay_ranges.R38
-rw-r--r--scripts/demographics.R71
-rw-r--r--scripts/donors_head.R29
-rw-r--r--scripts/experimental_description.R105
-rw-r--r--scripts/repeat_investigation.R72
-rw-r--r--scripts/visit_166_210
-rw-r--r--scripts/visit_inconsistencies.R132
-rw-r--r--scripts/whole_experimental_table.R12
8 files changed, 439 insertions, 20 deletions
diff --git a/scripts/assay_ranges.R b/scripts/assay_ranges.R
new file mode 100644
index 0000000..df52083
--- /dev/null
+++ b/scripts/assay_ranges.R
@@ -0,0 +1,38 @@
+library(tidyverse)
+library(ggpubr)
+
+data <- read_csv("../csv/data_per_assay_per_year_per_outcome.csv")
+
+tbl <- data %>%
+ mutate(
+ assay_type = factor(assay, labels = (
+ c(
+ "MSD (Arb. Intensity)",
+ "Multiplex cytokine (log2 Z-score)",
+ "Cell Cytometry (% parent popul.)",
+ "Multiplex cytokine (log2 Z-score)",
+ "Phospho cytometry (flow cytometery,\n 90 %tile fold-change)",
+ "Cell Cytometry (% parent popul.)",
+ "Phospho cytometry (Mass cytometry, arcsinh diff.)",
+ "Complete blood count (abs. count/uL)",
+ "MSD (Arb. Intensity)",
+ "Cell Cytometry (% parent popul.)",
+ "MSD (Arb. Intensity)",
+ "Multiplex cytokine (log2 Z-score)",
+ "Multiplex cytokine (log2 Z-score)",
+ "Cell Cytometry (% parent popul.)"
+ )
+ ))
+ ) %>%
+ filter(
+ !(assay_type=="Phospho cytometry (flow cytometery,\n 90 %tile fold-change)"
+ &
+ data > 10000)
+ )
+plt <- tbl %>%
+ ggplot(aes(data, fill=assay_type)) +
+ geom_histogram(show.legend=F) +
+ facet_wrap(~ factor(assay_type), scales="free") +
+ theme_pubclean()
+plt
+ggsave("../images/assay_value_distributions.png", plt, width = 2 * 15, height = 19, units = "cm")
diff --git a/scripts/demographics.R b/scripts/demographics.R
index d43b453..3aee883 100644
--- a/scripts/demographics.R
+++ b/scripts/demographics.R
@@ -1,5 +1,26 @@
library(tidyverse)
+library(grid)
library(ggpubr)
+library(patchwork)
+
+
+
+p1 <- tibble(x = 1:10, y = 1:10) %>%
+ ggplot(aes(x, y)) +
+ geom_point() +
+ scale_y_reverse(breaks = seq(1, 10)) +
+ labs(y = NULL)
+
+p2 <- tibble(ymin = c(0, 2.1, 7.1), ymax = c(1.9, 6.9, 10), fill = c("Gender", "Ethnicity", "CMV status")) %>%
+ ggplot() +
+ geom_rect(aes(xmin = 0, xmax = 1, ymin = ymin, ymax = ymax)) +
+ geom_text(aes(x = .5, y = (ymin + ymax) / 2, label = fill), angle = 90) +
+ scale_y_reverse(breaks = seq(1, 10), expand = expansion(mult = c(0, 0))) +
+ scale_x_continuous(breaks = c(0), expand = expansion(mult = c(0, 0))) +
+ guides(fill = FALSE) +
+ theme_void()
+
+
orig <- read_csv("../csv/donor_demo.csv")
data <- orig
tbl <- data %>%
@@ -8,7 +29,7 @@ tbl <- data %>%
labels = c("CMV-", "CMV+", "CMV unknown")
)) %>%
na_if("NULL") %>%
- replace_na(list(Ethnicity="Unknown")) %>%
+ replace_na(list(Ethnicity = "Unknown")) %>%
pivot_longer(
cols = c("Gender", "Ethnicity", "CMV status")
) %>%
@@ -19,7 +40,10 @@ tbl <- data %>%
count = n(),
) %>%
mutate(
- total = if_else(Response == 1, sum(data$Response == 1), sum(data$Response == 0)),
+ total = if_else(Response == 1,
+ sum(data$Response == 1),
+ sum(data$Response == 0)
+ ),
value = factor(value, levels = rev(c(
"Female",
"Male",
@@ -33,14 +57,22 @@ tbl <- data %>%
"CMV-",
"CMV unknown"
))),
- ) %>%
- ggplot(aes(y = value, fill = value)) +
- geom_bar(aes(x = count / total), stat = "identity", show.legend = F) +
+ )
+
+factors_demo <- tbl %>%
+ ggplot(aes(fill = value)) +
+ geom_bar(aes(x = count / total, y = value), stat = "identity", show.legend = F) +
theme_pubr() +
scale_x_continuous(labels = scales::percent_format(accuracy = 1)) +
- labs(x = "Distribution (%)", y="") +
- facet_wrap(~factor(Response, labels=c(paste("Low responders (n=", sum(data$Response==0),")"), paste("High responders (n=", sum(data$Response==1),")"))))
-factors_demo <- tbl
+ labs(x = "Distribution (%)", y = "") +
+ facet_wrap(~ factor(Response,
+ labels =
+ c(
+ paste("Low responders (n=", sum(data$Response == 0), ")"),
+ paste("High responders (n=", sum(data$Response == 1), ")")
+ )
+ ))
+factors_demo <- p2 + factors_demo + plot_layout(widths= c(0.5,9))
data <- orig
tbl <- data %>%
@@ -52,25 +84,24 @@ tbl <- data %>%
)
) %>%
group_by(Age, Response_text) %>%
- summarise(count = n(), Response=Response, Response_text=Response_text, Age=Age) %>%
+ summarise(count = n(), Response = Response, Response_text = Response_text, Age = Age) %>%
mutate(
total = if_else(Response == 0, sum(data$Response == 0), sum(data$Response == 1)),
- percentage = n()/ total * 100
+ percentage = n() / total * 100
) %>%
- ggplot(aes(x = Age, y=percentage, fill=factor(Response_text))) +
- geom_polygon(show.legend=F) +
- labs(x = "Age") +
+ ggplot(aes(x = Age, y = percentage, fill = factor(Response_text))) +
+ geom_polygon(show.legend = F) +
+ labs(x = "Age", y = "Percentage (%)") +
theme_pubr() +
facet_wrap(~Response_text)
age_responder <- tbl
figure <- ggarrange(
- factors_demo,
- age_responder,
- ncol=1,
- nrow=2,
- labels=c("A", "B")
+ factors_demo,
+ age_responder,
+ ncol = 1,
+ nrow = 2,
+ labels = c("A", "B")
)
-ggsave('../images/demographic.png', figure, width=2*15, height=19, units="cm")
-
+ggsave("../images/demographic.png", figure, width = 2 * 15, height = 19, units = "cm")
diff --git a/scripts/donors_head.R b/scripts/donors_head.R
new file mode 100644
index 0000000..818554d
--- /dev/null
+++ b/scripts/donors_head.R
@@ -0,0 +1,29 @@
+library(tidyverse)
+library(knitr)
+library(dlookr)
+
+data <- read_csv("../csv/visit_multiple_21.csv")
+
+tbl <- data %>%
+ select(
+ visit_id,
+ year = visit_year,
+ day = visit_day,
+ type = visit_type_hai,
+ age = age_round,
+ cmv = cmv_status,
+ ebv = ebv_status,
+ bmi,
+ vaccine,
+ geo_mean,
+ d_geo_mean,
+ response = vaccine_resp,
+ assay_data_rows = total_data
+ ) %>%
+kable(format = "latex", booktabs = TRUE)
+
+
+clip <- pipe("xclip -selection clipboard", "w")
+write(tbl, file = clip)
+close(clip)
+
diff --git a/scripts/experimental_description.R b/scripts/experimental_description.R
new file mode 100644
index 0000000..e685844
--- /dev/null
+++ b/scripts/experimental_description.R
@@ -0,0 +1,105 @@
+library(tidyverse)
+library(ggpubr)
+
+data <- read_csv("../csv/data_per_assay_per_year_per_outcome.csv")
+
+studies_data <- read_csv("../csv/donors_list.csv") %>%
+ group_by(study_id) %>%
+ summarise(
+ count = n()
+ )
+
+tbl <- data %>%
+ mutate(
+ assay_type = factor(assay, labels = (
+ c(
+ "MSD",
+ "Multiplex cytokine",
+ "Cell Cytometry",
+ "Multiplex cytokine",
+ "Phospho cytometry",
+ "Cell Cytometry",
+ "Phospho cytometry",
+ "Complete blood count",
+ "MSD",
+ "Cell Cytometry",
+ "MSD",
+ "Multiplex cytokine",
+ "Multiplex cytokine",
+ "Cell Cytometry"
+ )
+ ))
+ )
+
+id_plt <- tbl %>%
+ group_by(year, assay, outcome) %>%
+ summarise(
+ measurements = n(),
+ study = study
+ ) %>%
+ ggplot(aes(year, measurements / 1000, color = factor(assay))) +
+ geom_line() +
+ geom_point() +
+ ylim(-1, 10) +
+ facet_grid(rows = vars(factor(outcome, labels = c("low", "high")))) +
+ labs(y = "number of features (thousands)") +
+ scale_color_discrete(name="Assay type") +
+ theme_pubclean()
+
+type_plt <- tbl %>%
+ group_by(year, assay_type, outcome) %>%
+ summarise(
+ measurements = n(),
+ study = study
+ ) %>%
+ ggplot(aes(year, measurements / 1000, color = factor(assay_type))) +
+ geom_line(show.legend=F) +
+ geom_point(show.legend=F) +
+ ylim(-1, 10) +
+ facet_grid(rows = vars(factor(outcome, labels = c("low", "high")))) +
+ labs(y = "number of features (thousands)") +
+ scale_color_discrete(name="Assay type") +
+ theme_pubclean()
+
+study_plt <- tbl %>%
+ group_by(year, assay_type) %>%
+ summarise(
+ measurements = n(),
+ study = study
+ ) %>%
+ ggplot(aes(year, measurements / 1000, color = factor(assay_type))) +
+ geom_line() +
+ geom_point() +
+ ylim(-1, 10) +
+ facet_grid(rows = vars(factor(study, labels = c(
+ paste("SLVP015\nn=(", as.numeric(studies_data[studies_data$study_id == 15,][2]), ")"),
+ paste("SLVP017\nn=(", as.numeric(studies_data[studies_data$study_id == 17,][2]), ")"),
+ paste("SLVP018\nn=(", as.numeric(studies_data[studies_data$study_id == 18,][2]), ")"),
+ paste("SLVP021\nn=(", as.numeric(studies_data[studies_data$study_id == 21,][2]), ")"),
+ paste("SLVP024\nn=(", as.numeric(studies_data[studies_data$study_id == 24,][2]), ")"),
+ paste("SLVP028\nn=(", as.numeric(studies_data[studies_data$study_id == 28,][2]), ")"),
+ paste("SLVP029\nn=(", as.numeric(studies_data[studies_data$study_id == 29,][2]), ")"),
+ paste("SLVP030\nn=(", as.numeric(studies_data[studies_data$study_id == 30,][2]), ")")
+ )))) +
+ labs(y = "number of features (thousands)", legend="Assay type") +
+ scale_color_discrete(name="Assay type") +
+ theme_pubclean()
+
+
+top_half <- ggarrange(
+ id_plt,
+ type_plt,
+ ncol=2,
+ labels = c("A","B")
+ )
+
+whole <- ggarrange(
+ top_half,
+ study_plt,
+ nrow=2,
+ labels=c("", "C"),
+ heights = c(1,2)
+)
+whole
+
+ggsave("../images/exp_data_numbers.pdf", whole, width = 21, height = 29.7, dpi=300, units = "cm")
diff --git a/scripts/repeat_investigation.R b/scripts/repeat_investigation.R
new file mode 100644
index 0000000..142d221
--- /dev/null
+++ b/scripts/repeat_investigation.R
@@ -0,0 +1,72 @@
+library(tidyverse)
+library(ggpubr)
+
+data <- read_csv("../csv/repeat_investigation.csv")
+
+studies_data <- read_csv("../csv/donors_list.csv") %>%
+ group_by(study_id) %>%
+ summarise(
+ count = n()
+ )
+
+tbl <- data %>%
+ na_if("NULL") %>%
+ select(
+ donor_id,
+ visit_year,
+ study_id,
+ visit_type_hai,
+ vaccine_resp
+ ) %>%
+ group_by(
+ donor_id, study_id, visit_year
+ ) %>%
+ summarise(
+ type = visit_type_hai,
+ response = vaccine_resp
+ )
+response_vec <- tbl %>%
+ group_map(
+ ~ {
+ if (any(!is.na(.$response))) {
+ rep(TRUE, nrow(.))
+ } else {
+ rep(FALSE, nrow(.))
+ }
+ }
+ )
+tbl_pre <- tbl %>%
+ ungroup() %>%
+ mutate(response_recorded = unlist(response_vec)) %>%
+ filter(type == "pre" | type == "single") %>%
+ group_by(
+ donor_id, study_id
+ ) %>%
+ summarise(
+ count = n(),
+ year = visit_year,
+ type = type,
+ response = response,
+ recorded = response_recorded
+ )
+tbl_pre
+
+plt <- tbl_pre %>%
+ ggplot(aes(count, fill = factor(recorded, labels=c("No (for various reasons)", "Yes")))) +
+ geom_bar() +
+ facet_wrap(~ factor(study_id, labels = c(
+ paste("SLVP015\nn=(", as.numeric(studies_data[studies_data$study_id == 15,][2]), ")"),
+ paste("SLVP017\nn=(", as.numeric(studies_data[studies_data$study_id == 17,][2]), ")"),
+ paste("SLVP018\nn=(", as.numeric(studies_data[studies_data$study_id == 18,][2]), ")"),
+ paste("SLVP021\nn=(", as.numeric(studies_data[studies_data$study_id == 21,][2]), ")"),
+ paste("SLVP024\nn=(", as.numeric(studies_data[studies_data$study_id == 24,][2]), ")"),
+ paste("SLVP028\nn=(", as.numeric(studies_data[studies_data$study_id == 28,][2]), ")"),
+ paste("SLVP029\nn=(", as.numeric(studies_data[studies_data$study_id == 29,][2]), ")"),
+ paste("SLVP030\nn=(", as.numeric(studies_data[studies_data$study_id == 30,][2]), ")")
+ ))) +
+ labs(x="Seasons a donor visited", y = "Visits by donors with same amount of total seasons", fill = "Response classification available") +
+ theme_pubr() +
+ scale_x_continuous(breaks = scales::pretty_breaks(n = 8))
+plt
+
+ggsave("../images/repeat_visits_per_study.png", plt, width = 2*15, height = 25, dpi=300, units = "cm")
diff --git a/scripts/visit_166_21 b/scripts/visit_166_21
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/scripts/visit_166_21
diff --git a/scripts/visit_inconsistencies.R b/scripts/visit_inconsistencies.R
new file mode 100644
index 0000000..c495d9f
--- /dev/null
+++ b/scripts/visit_inconsistencies.R
@@ -0,0 +1,132 @@
+library(tidyverse)
+library(knitr)
+library(dlookr)
+library(xlsx)
+library(ggpubr)
+library(corrplot)
+
+data <- read_csv("../csv/visits_all.csv")
+
+class_correct <- data %>%
+ na_if("NULL") %>%
+ drop_na(vaccine_resp) %>%
+ group_by(donor_id, visit_year) %>%
+ summarise(
+ post = if_else(any(visit_type_hai == 'post'), TRUE, FALSE),
+ internal = visit_internal_id,
+ hai_type = visit_type_hai,
+ gmt = geo_mean,
+ fold_change = as.numeric(d_geo_mean),
+ response = vaccine_resp,
+ )
+
+post_gmt_vec <- class_correct %>%
+ group_map(
+ ~ {
+ if (any(.$hai_type == 'post')) {
+ rep(.$gmt[.$hai_type == 'post'] >= 40, nrow(.))
+ } else {
+ rep(NA, nrow(.))
+ }
+ }
+ )
+post_gmt_vec
+
+post_gmt_vec <- unlist(post_gmt_vec)
+
+class_correct <- class_correct %>%
+ ungroup() %>%
+ mutate(
+ check_post = if_else(post, 'Post visit', 'No post visit'),
+ post_gmt = post_gmt_vec,
+ response = recode(factor(response), '0'='Low', '1'='High'),
+ check_correct = if_else(
+ (!is.na(post_gmt) & (!post_gmt | fold_change < 4) & response == 'Low') |
+ (!is.na(post_gmt) & (post_gmt & fold_change >= 4) & response == 'High'),
+ 'Correct clasification',
+ 'False classification'
+ ),
+ )
+
+seasonal_classification_data <- class_correct %>%
+ group_by(donor_id, visit_year) %>%
+ summarise(
+ response=response,
+ check_correct=check_correct,
+ check_post
+ ) %>%
+ ungroup() %>%
+ distinct()
+
+seasonal_classification_plot <- seasonal_classification_data %>%
+ ggplot(aes(response, fill = response)) +
+ geom_bar() +
+ geom_text(stat='count', aes(label=paste("n = (",stat(count),")")), vjust= +1) +
+ facet_grid(factor(check_correct) ~ factor(check_post)) +
+ theme_pubr() +
+ labs(title="Seasonal classification of donors by correctness and post visit")
+seasonal_classification_plot
+
+ggsave("../images/season_classification.png", seasonal_classification_plot, width = 2 * 15, height = 19, units = "cm")
+
+incorrect_cases <- class_correct %>%
+ filter(check_correct == "False classification")
+
+write.xlsx(incorrect_cases, "../incorrect_visits.xlsx", sheetName="sheet1", col.names=T, row.names=T, append=F)
+
+
+
+desc <- data %>%
+ na_if("NULL") %>%
+ mutate(across(everything(), as.numeric)) %>%
+ mutate(geo_mean = na_if(geo_mean, 0)) %>%
+ select(age, cmv_status, ebv_status, bmi, vaccine, geo_mean, d_geo_mean, vaccine_resp, total_data) %>%
+ describe() %>%
+ column_to_rownames("variable") %>%
+ t() %>%
+ as_tibble(rownames="stat") %>%
+ slice(1:8) %>%
+ mutate(across(2:10, round, 1)) %>%
+kable(format = "latex", booktabs = TRUE)
+clip <- pipe("xclip -selection clipboard", "w")
+write(desc, file = clip)
+close(clip)
+
+
+corr_pre <- data %>%
+ select(visit_type_hai, age, vaccine, geo_mean, d_geo_mean, vaccine_resp, total_data) %>%
+ na_if("NULL") %>%
+ mutate(across(2:length(.), as.numeric)) %>%
+ filter(!is.na(vaccine_resp) & !is.na(vaccine) & !is.na(d_geo_mean) & !is.na(vaccine_resp) & visit_type_hai == "pre") %>%
+ select(-visit_type_hai)
+
+corr_pre
+
+
+library(gridGraphics)
+grab_grob <- function(){
+ grid.echo()
+ grid.grab()
+}
+
+corr_post <- data %>%
+ select(visit_type_hai, age, vaccine, geo_mean, d_geo_mean, vaccine_resp, total_data) %>%
+ na_if("NULL") %>%
+ mutate(across(2:length(.), as.numeric)) %>%
+ filter(!is.na(vaccine_resp) & !is.na(vaccine) & !is.na(d_geo_mean) & !is.na(vaccine_resp) & visit_type_hai == "post") %>%
+ select(-visit_type_hai)
+corr_post
+
+
+png(file = "../images/corr_plot_visits_pre_complete.png",
+ height = 1.5 * 19,
+ width = 2 * 15,
+ res = 1200,
+ units = "cm",
+)
+par(mfrow=c(2,1))
+M <- cor(corr_pre)
+corrplot.mixed(M)
+M <- cor(corr_post)
+corrplot.mixed(M)
+dev.off()
diff --git a/scripts/whole_experimental_table.R b/scripts/whole_experimental_table.R
new file mode 100644
index 0000000..63d9ce9
--- /dev/null
+++ b/scripts/whole_experimental_table.R
@@ -0,0 +1,12 @@
+library(tidyverse)
+
+data <- read_csv("../csv/data_per_assay_per_year_per_outcome.csv")
+
+tbl <- data %>%
+ rowid_to_column() %>%
+ pivot_wider(
+ id_cols = rowid,
+ names_from = name,
+ values_from = data,
+ )
+