#* better_fisher_tests: #* attr: #* fillcolor: '7' #* desc: "Runs a 2x2 (two sided fishers with simulate.p.value = T) for each concept,\ #* \ \nwith patient counts summed across life stages, evaluating new onset rates\ #* \ in each test cohort\n(covid-no-pasc and pasc) against controls. \n" #* ext: R #* inputs: #* - concept_pre_post_patients #* library(dplyr) library(tidyr) better_fisher_tests <- function(concept_pre_post_patients) { # compute sums across life stages concept_pre_post_patients <- concept_pre_post_patients %>% select(-life_stage_simple) %>% group_by(cohort_simple, concept_id, concept_name, pre, post) %>% summarize(num_patients = sum(num_patients)) %>% as.data.frame() # split out the control cohort from the two case cohorts (covid-no-pasc and pasc) # we'll group_by case in the cases, and for each group pull in the controls from the outer scope to do the test in do_test # (it'll also be grouped by concept_id since were doing the test per-concept) controls <- concept_pre_post_patients %>% filter(cohort_simple == "control") cases <- concept_pre_post_patients %>% filter(cohort_simple != "control") do_test <- function(sub_df) { cat("Running test for ", head(sub_df, n = 1) %>% as.character(), "\n") # which concept_id are we talking about? test_concept_id <- sub_df$concept_id[1] # combine the control cohort and test cohort into one df, but only for the concept_id of interest sub_df <- rbind(sub_df, controls %>% filter(concept_id == test_concept_id)) print(sub_df) # compute the counts control_nopre_nopost <- sub_df %>% filter(cohort_simple == "control" & pre == 0 & post == 0) %>% pull(num_patients) control_nopre_yespost <- sub_df %>% filter(cohort_simple == "control" & pre == 0 & post == 1) %>% pull(num_patients) test_nopre_nopost <- sub_df %>% filter(cohort_simple != "control" & pre == 0 & post == 0) %>% pull(num_patients) test_nopre_yespost <- sub_df %>% filter(cohort_simple != "control" & pre == 0 & post == 1) %>% pull(num_patients) # in cases where there were no patients with a given concept, there may not be any count data (ie, there are no rows with num_patients == 0 # in the input, even though there are combinations with 0s). So mark it zero. MARK IT ZERO! if(length(control_nopre_nopost) == 0) {control_nopre_nopost = 0} if(length(control_nopre_yespost) == 0) {control_nopre_yespost = 0} if(length(test_nopre_nopost) == 0) {test_nopre_nopost = 0} if(length(test_nopre_yespost) == 0) {test_nopre_yespost = 0} # build the count matrix mat <- matrix(c(control_nopre_nopost, test_nopre_nopost, control_nopre_yespost, test_nopre_yespost), nrow = 2) print(mat) colnames(mat) <- c("no_post", "yes_post") rownames(mat) <- c("control", "test") # run the test # https://stats.stackexchange.com/questions/162718/problems-with-sample-size-in-fishers-exact-test res <- fisher.test(mat, alternative = "two.sided", simulate.p.value = TRUE) # extract results and return pval <- res$p.value conf_int_lower <- res$conf.int[1] conf_int_upper <- res$conf.int[2] estimate <- res$estimate ret_df <- data.frame(pval, estimate, conf_int_lower, conf_int_upper, control_nopre_nopost, test_nopre_nopost, control_nopre_yespost, test_nopre_yespost) return(ret_df) } # here's where we group the cases cases_grouped <- cases %>% group_by(cohort_simple, concept_id, concept_name) # and call the test function for each group results <- do(cases_grouped, do_test(.)) %>% as.data.frame() %>% mutate(pval_adj = p.adjust(pval, method = "bonferroni")) %>% mutate(pval_sig = pval_adj < 0.05) #filter(pval_adj < 0.05) return(results) } # this is old code, not meant to be run, but includes some ggplot # setup that I don't want to lose til I implement it in this branch old_code <- function() { library(dplyr) library(ggplot2) library(tidyr) library(stringr) library(RColorBrewer) top_concepts_testsA <- function(topic_top_concept_counts_groups) { df <- topic_top_concept_counts_groups df_pasc <- df %>% group_by(concept_id, concept_name, epoch, cohort_simple, life_stage) %>% summarize(num_patients = sum(num_patients)) %>% ungroup() %>% filter(cohort_simple %in% c("control", "pasc")) %>% mutate(cohort_simple = factor(cohort_simple, levels = c("control", "pasc"), labels = c("control", "test"))) %>% pivot_wider(names_from = c("cohort_simple", "epoch"), values_from = "num_patients") %>% mutate(test_cohort = "pasc") df_covid <- df %>% group_by(concept_id, concept_name, epoch, cohort_simple, life_stage) %>% summarize(num_patients = sum(num_patients)) %>% ungroup() %>% filter(cohort_simple %in% c("control", "covid-no-pasc")) %>% mutate(cohort_simple = factor(cohort_simple, levels = c("control", "covid-no-pasc"), labels = c("control", "test"))) %>% pivot_wider(names_from = c("cohort_simple", "epoch"), values_from = "num_patients") %>% mutate(test_cohort = "covid-no-pasc") # put counts <- rbind(df_pasc, df_covid) %>% as.data.frame() # NA's should be 0s counts[is.na(counts)] <- 0 counts <- counts %>% # we need at least 20 in one of the groups filter(!(control_post < 20 & control_pre < 20 & test_post < 20 & test_pre < 20)) counts <- rowwise(counts) res <- do(counts, run_test(.)) ret_df <- cbind(counts, res) %>% mutate(pval_adj = p.adjust(pval, method = "holm")) %>% filter(pval_adj < 0.05) %>% mutate(life_stage = factor(life_stage, levels = c("pediatric", "adolescent", "adult", "senior"), labels = c("Pediatric", "Adolescent", "Adult", "Senior"))) %>% mutate(test_cohort = factor(test_cohort, levels = c("covid-no-pasc", "pasc"), labels = c("COVID No PASC", "PASC"))) # print the incoming concept names as code for manual curation via copy/paste ret_df$concept_name %>% unique() %>% dput() # this is a lot of hand data entry... we'll unzip after for concept/category concepts_categories <- c( "Cough", "Respiratory", # resp "Dyspnea", "Respiratory", # resp "Wheezing", "Respiratory", # resp "Hypoxemia", "Respiratory", # resp "Chronic cough", "Respiratory", # resp "Acute respiratory distress syndrome", "Respiratory", # resp "Abnormal breathing", "Respiratory", # resp "Dependence on supplemental oxygen", "Respiratory", # resp "Disorder of respiratory system", "Respiratory", # resp "Acute conjunctivitis", "Infection", # infection "Conjunctivitis", "Infection", # infection "Viral disease in mother complicating pregnancy, childbirth AND/OR puerperium", "Infection",# infection "Pneumonia", "Infection", # infection "Viral pneumonia", "Infection", # infection "Pneumonia caused by SARS-CoV-2", "Infection", # infection "Acute lower respiratory tract infection", "Infection",# infection "Upper respiratory tract infection due to Influenza", "Infection",# infection "Chest pain", "Cardiac", # cardiac "Palpitations", "Cardiac", # cardiac "Tachycardia", "Cardiac", # cardiac "Cardiomegaly", "Cardiac", # cardiac "Myocarditis due to infectious agent", "Cardiac", # cardiac "Disorder of lung", "Pulmonary", # lung "Abnormal findings on diagnostic imaging of lung", "Pulmonary", # lung "Pneumothorax", "Pulmonary",# lung "Fibrosis of lung", "Pulmonary", # lung "Single live birth", "Pediatric", # pediatric "Diaper rash", "Pediatric", # pediatric "Fussy infant", "Pediatric", # pediatric "Headache", "Neurocognitive", # neuro "Dizziness and giddiness", "Neurocognitive", # neuro "Finding related to attentiveness", "Neurocognitive", # neuro "Cognitive communication disorder", "Neurocognitive", # neuro "Sensory disorder of smell and/or taste", "Neurocognitive", # neuro "Chronic fatigue syndrome", "Other", # other "Fatigue", "Other", # other "Malaise", "Other", # other "Asthenia", "Other", # other "Sleep disorder", "Other", # other "Hyperosmolality and or hypernatremia", "Other", # other "Telogen effluvium", "Other",# other "Non-scarring alopecia", "Other", # other "Imaging result abnormal", "Other", # other "Disorder due to infection", "Other" # other ) concept_name_vec <- concepts_categories[(1:length(concepts_categories)) %% 2 == 1] concept_category_vec <- concepts_categories[(1:length(concepts_categories)) %% 2 == 0] # we gotta do this before we cut ret_df down by merging with concept_df, to be used in the plot below for same scale fill_breaks <- seq(min(ret_df$estimate), max(ret_df$estimate), length.out = 5) %>% round(2) concepts_df <- data.frame(concept_name = concept_name_vec, concept_category = concept_category_vec, stringsAsFactors = FALSE) %>% filter(concept_category %in% c("Respiratory", "Infection", "Pulmonary")) ret_df <- ret_df %>% merge(concepts_df, by = "concept_name") %>% mutate(concept_name = factor(concept_name, levels = concepts_df$concept_name, labels = str_wrap(concepts_df$concept_name, 40))) palette <- brewer.pal(n = length(unique(concept_category_vec)), "Dark2") concepts_df <- concepts_df %>% #make this a factor with level ordering as specified above mutate(concept_category = factor(concept_category, # unique will keep the relative ordering by first occurrance levels = unique(concepts_df$concept_category))) %>% # this tricky business uses the fact that factors are integers under the hood, so we're indexing into the # palette with these underlying ints mutate(category_color = palette[concept_category]) p <- ggplot(ret_df) + geom_tile(aes(y = concept_name, x = life_stage, fill = estimate)) + facet_grid(. ~ test_cohort) + coord_equal() + scale_y_discrete(name = "Concept") + scale_x_discrete(name = "Life Stage") + scale_fill_viridis_c(name = "Odds Ratio\nEstimate", breaks = fill_breaks) + theme_bw(14) + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + theme(axis.text.y = element_text(color = concepts_df$category_color)) + theme(legend.key.size = unit(0.5, 'cm')) + expand_limits(fill = c(min(fill_breaks), max(fill_breaks))) plot(p) return(to_rdata(p)) } # runs a chisq-test or fisher exact test on a single row run_test <- function(row_df) { a <- row_df$control_pre b <- row_df$control_post c <- row_df$test_pre d <- row_df$test_post res <- fisher.test(matrix(c(a, b, c, d), nrow = 2), alternative = "greater") pval <- res$p.value conf_int_lower <- res$conf.int[1] conf_int_upper <- res$conf.int[2] estimate <- res$estimate ret_df <- data.frame(pval, conf_int_lower, conf_int_upper, estimate) return(ret_df) } }