#* fishers_results_cleaned:
#*   attr:
#*     fillcolor: '7'
#*   desc: Reshape 2x2 results to wide format for publication.
#*   ext: R
#*   inputs:
#*   - fishers_results_top_topics
#* 

library(dplyr)
library(tidyr)
library(stringr)
#library(ggrepel)
#library(scales)

#' Reshape per-cohort 2x2 test results to wide format, one row per concept.
#'
#' Keeps the reporting columns, spreads the per-cohort statistics
#' (`estimate`, `pval_adj`, `conf_int_lower`, `conf_int_upper`) into
#' `<statistic>_<cohort_simple>` columns, retains only concepts that have an
#' adjusted p-value below the significance threshold in at least one input
#' row, and labels each remaining row by the cohort(s) in which it is
#' significant.
#'
#' @param fishers_results_top_topics Data frame with columns `term_weight`,
#'   `cohort_simple`, `estimate`, `conf_int_lower`, `conf_int_upper`,
#'   `pval_adj`, `concept_id`, `concept_name` and `topic_name`. The labelling
#'   step looks for the `cohort_simple` values `pasc` and `covid-no-pasc`.
#' @return The wide data frame restricted to significant concepts, with
#'   `topic_name` renamed to `top_topic_name` and an added character column
#'   `sig_category`: "PASC", "COVID", "Both", or `NA` when the row itself is
#'   not significant for either cohort.
fishers_results_cleaned <- function(fishers_results_top_topics) {
    # Single significance cut-off shared by the concept filter and the row
    # labels, so a p-value sitting exactly on the threshold is treated the
    # same way in both places.
    alpha <- 0.05

    df <- fishers_results_top_topics %>%
        ## Optional filter dropping results where any individual cell count is
        ## below 20. Currently disabled: reporting p-values from small 2x2
        ## tables is believed to be acceptable. An alternative would be to
        ## filter on the sum of counts <= 20; few of those should be
        ## significant.
        #
        #   filter(control_nopre_nopost >= 20 &
        #          control_nopre_yespost >= 20 &
        #          test_nopre_nopost >= 20 &
        #          test_nopre_yespost >= 20
        #         ) %>%
        select(
            term_weight, cohort_simple, estimate, conf_int_lower,
            conf_int_upper, pval_adj, concept_id, concept_name, topic_name
        ) %>%
        rename(top_topic_name = topic_name)

    # Preview of the long-format input to the reshape.
    print(head(df, n = 20))

    # Concepts significant in at least one long-format row.
    sig_concepts <- df %>%
        filter(pval_adj < alpha) %>%
        pull(concept_id) %>%
        unique()

    df_wide <- pivot_wider(
        df,
        names_from = "cohort_simple",
        values_from = c("estimate", "pval_adj", "conf_int_lower", "conf_int_upper")
    )

    df_filtered <- df_wide[df_wide$concept_id %in% sig_concepts, ]

    # Optional removal of extreme results (infinite or zero estimates),
    # currently disabled:
    # df_filtered <- df_filtered %>%
    #    filter(!is.infinite(estimate_pasc) & estimate_pasc != 0) %>%
    #    filter(!is.infinite(`estimate_covid-no-pasc`) & `estimate_covid-no-pasc` != 0)

    # Row-wise significance mask for one cohort's adjusted p-value column.
    # A missing p-value, or a cohort column that does not exist because the
    # cohort never appeared in the input, counts as "not significant".
    is_significant <- function(pval_col) {
        if (!pval_col %in% names(df_filtered)) {
            return(rep(FALSE, nrow(df_filtered)))
        }
        pvals <- df_filtered[[pval_col]]
        !is.na(pvals) & pvals < alpha
    }

    sig_pasc <- is_significant("pval_adj_pasc")
    sig_covid <- is_significant("pval_adj_covid-no-pasc")

    # Label whether the row is significant for both cohorts or just one.
    # Always a character column, including when no row matches or the
    # filtered table is empty.
    df_filtered$sig_category <- case_when(
        sig_pasc & sig_covid ~ "Both",
        sig_covid ~ "COVID",
        sig_pasc ~ "PASC",
        TRUE ~ NA_character_
    )

    df_filtered
}