#* coherence_results_correlation:
#*   attr:
#*     fillcolor: '7'
#*   desc: Topic visualizations - coherence, statistical significance in test groups,
#*     mean and variance usage in test groups, etc.
#*   ext: R
#*   inputs:
#*   - model_tests_covid_pvals
#*

library(dplyr)
library(tidyr)
library(ggplot2)
library(ggrepel)
library(stringr)

#' Relate topic coherence to group contrast effect sizes
#'
#' Prints the Spearman and Pearson correlations between mean and variance of
#' topic usage, joins the per-contrast test results with post-epoch group
#' usage statistics and topic coherence, draws one faceted scatter plot per
#' non-control cohort (PASC, then COVID), and returns the joined table.
#'
#' @param model_tests_covid_pvals Data frame of model test results. The code
#'   reads `contrast` and joins on `topic_name`; `estimate`, `p_adj` and
#'   `grouper` are used downstream and are presumed to come from this table
#'   (TODO confirm against the producer of this input).
#' @param coherence_by_topic Data frame joined on `topic_name`; presumably
#'   supplies the `sum_coherence` column used as the plot x-axis (TODO confirm).
#' @param group_topic_stats Data frame with `epoch`, `group`,
#'   `mean_topic_prob` and `var_topic_prob`; joined on `cohort_simple`,
#'   `group` and `topic_name`, and `count_patients` is selected downstream.
#' @return The joined data frame of non-control rows with matching control
#'   statistics (`control_*` columns) and the columns of `coherence_by_topic`.
coherence_results_correlation <- function(model_tests_covid_pvals,
                                          coherence_by_topic,
                                          group_topic_stats) {
  # Quick check of the correlation between mean and variance across cohorts,
  # groups, epochs, and topics - they are correlated, so quantify it.
  print("Mean topic usage vs variance topic usage, across cohorts, groups, topics, epochs (post and pre), Spearman rank correlation:")
  print(cor(group_topic_stats$mean_topic_prob,
            group_topic_stats$var_topic_prob,
            method = "spearman"))

  print("Mean topic usage vs variance topic usage, across cohorts, groups, topics, epochs (post and pre), Pearson correlation:")
  print(cor(group_topic_stats$mean_topic_prob,
            group_topic_stats$var_topic_prob,
            method = "pearson"))

  # Only the post epoch is visualized; the upper-case sex labels are
  # lower-cased so they match the group keys parsed from contrast names below.
  post_stats <- group_topic_stats %>%
    filter(epoch == "post") %>%
    mutate(group = case_when(
      group == "MALE" ~ "male",
      group == "FEMALE" ~ "female",
      TRUE ~ group
    ))

  # The cohort and group are only available embedded in the contrast name, so
  # both are recovered by regex and then mapped onto the keys used by
  # group_topic_stats.
  contrasts <- model_tests_covid_pvals %>%
    filter(!grepl("Overall", contrast)) %>% # keep only contrast tests
    tidyr::extract(contrast,
                   into = "cohort_simple",
                   regex = ".*(COVID|PASC).*",
                   remove = FALSE) %>%
    mutate(cohort_simple = case_when(
      cohort_simple == "COVID" ~ "covid-no-pasc",
      cohort_simple == "PASC" ~ "pasc",
      TRUE ~ cohort_simple
    )) %>%
    tidyr::extract(contrast,
                   into = "group",
                   regex = ".*(Adol|Adult|Senior|Ped|Female|Male|Early|Alpha|Delta).*",
                   remove = FALSE) %>%
    mutate(group = tolower(group)) %>%
    mutate(group = case_when(
      group == "adol" ~ "adolescent",
      group == "ped" ~ "pediatric",
      TRUE ~ group
    )) %>%
    # Full outer join: rows of post_stats without a matching contrast test
    # (e.g. the control cohort) are kept, with NA in the test columns.
    merge(post_stats,
          by = c("cohort_simple", "group", "topic_name"),
          all = TRUE) %>%
    # Label only topics that are significant and at least doubled or halved
    # on the odds scale.
    mutate(plot_label = case_when(
      p_adj < 0.05 & (exp(estimate) > 2 | exp(estimate) < 0.5) ~ topic_name,
      TRUE ~ ""
    )) %>%
    # Display names for the facets. There is no default branch, so any group
    # value not listed here becomes NA.
    mutate(group = case_when(
      group == "adolescent" ~ "Adolescent",
      group == "pediatric" ~ "Pediatric",
      group == "adult" ~ "Adult",
      group == "senior" ~ "Senior",
      group == "early" ~ "Early",
      group == "alpha" ~ "Alpha",
      group == "delta" ~ "Delta",
      group == "male" ~ "Male",
      group == "female" ~ "Female"
    ))

  # Control usage statistics, renamed so they can sit alongside each
  # non-control row for the same group and topic.
  control_data <- contrasts %>%
    filter(cohort_simple == "control") %>%
    select(group, topic_name, mean_topic_prob, var_topic_prob,
           count_patients) %>%
    rename(control_mean_topic_prob = mean_topic_prob,
           control_var_topic_prob = var_topic_prob,
           control_count_patients = count_patients)

  full_data <- contrasts %>%
    filter(cohort_simple != "control") %>%
    select(grouper, cohort_simple, group, topic_name, plot_label, estimate,
           p_adj, mean_topic_prob, var_topic_prob, count_patients) %>%
    merge(control_data, by = c("group", "topic_name"), all = TRUE) %>%
    merge(coherence_by_topic, by = "topic_name")

  # Draw the coherence-vs-effect-size plot for one cohort against control.
  # `cohort_id` is the cohort_simple key; `cohort_label` is the display name.
  plot_contrast_vs_control <- function(cohort_id, cohort_label) {
    # `!!` forces the function argument, so a same-named column in full_data
    # can never shadow it inside filter().
    cohort_rows <- full_data %>%
      filter(cohort_simple == !!cohort_id)

    p <- ggplot(cohort_rows) +
      # Red points: usage in the cohort itself.
      geom_point(aes(x = sum_coherence, y = estimate, size = mean_topic_prob),
                 color = "#EE220C", fill = "#EE220C",
                 alpha = 0.5, shape = 21, stroke = 0) +
      # Blue points: usage in the matching control group.
      geom_point(aes(x = sum_coherence, y = estimate,
                     size = control_mean_topic_prob),
                 color = "#0076BA", fill = "#0076BA",
                 alpha = 0.5, shape = 21, stroke = 0) +
      geom_text_repel(aes(x = sum_coherence, y = estimate, label = plot_label),
                      max.overlaps = Inf,
                      min.segment.length = 0,
                      box.padding = 2.5,
                      size = 3.0,
                      segment.color = "blue") +
      facet_wrap(~ group) +
      ggtitle(paste0("Group Contrasts, ", cohort_label, " vs. Control")) +
      scale_size_continuous(
        name = paste0("Group Topic Usage\nPost Phase\n(Red: ", cohort_label,
                      ", Blue: Control)")
      ) +
      scale_x_continuous(name = "Topic Coherence") +
      scale_y_continuous(name = "Group Contrast Effect Size (Log-Odds)")

    plot(p)
  }

  plot_contrast_vs_control("pasc", "PASC")
  plot_contrast_vs_control("covid-no-pasc", "COVID")

  return(full_data)
}