#* coherence_results_correlation:
#*   attr:
#*     fillcolor: '7'
#*   desc: Topic visualizations - coherence, statistical significance in test groups,
#*     mean and variance usage in test groups, etc.
#*   ext: R
#*   inputs:
#*   - model_tests_covid_pvals
#* 

library(dplyr)
library(tidyr)
library(ggplot2)
library(ggrepel)
library(stringr)

#' Plot topic coherence against group contrast effect sizes
#'
#' Prints the Spearman and Pearson correlation between the mean and the
#' variance of topic usage, then draws two scatter plots faceted by group
#' (PASC vs. control and COVID vs. control) of contrast estimate against
#' topic coherence. Point size encodes post-epoch mean topic usage (red for
#' the cohort, blue for the matching control group).
#'
#' @param model_tests_covid_pvals Data frame of test results with at least
#'   `contrast` and `topic_name`. Rows whose `contrast` contains "Overall" are
#'   dropped; cohort and group are parsed out of the `contrast` text.
#'   Presumably also the source of `estimate`, `p_adj` and `grouper`
#'   (TODO confirm against the upstream node).
#' @param coherence_by_topic Data frame with `topic_name` and `sum_coherence`.
#' @param group_topic_stats Data frame with `cohort_simple`, `group`,
#'   `topic_name`, `epoch`, `mean_topic_prob` and `var_topic_prob`; presumably
#'   also `count_patients` (TODO confirm). Only `epoch == "post"` rows feed the
#'   plots; every row feeds the printed correlations.
#' @return The data frame behind the plots: the non-control rows (`grouper`,
#'   `cohort_simple`, `group`, `topic_name`, `plot_label`, `estimate`, `p_adj`,
#'   `mean_topic_prob`, `var_topic_prob`, `count_patients`) joined to the
#'   control statistics (`control_*` columns) and to `coherence_by_topic`.
coherence_results_correlation <- function(model_tests_covid_pvals, coherence_by_topic, group_topic_stats) {
    # Quantify how strongly mean and variance of topic usage move together
    # across every cohort, group, topic and epoch.
    usage_mean <- group_topic_stats$mean_topic_prob
    usage_var <- group_topic_stats$var_topic_prob

    # A single NA would otherwise turn both correlations into NA, so only
    # complete pairs are used; say so when rows are actually excluded.
    n_incomplete <- sum(is.na(usage_mean) | is.na(usage_var))
    if (n_incomplete > 0) {
        message(n_incomplete, " row(s) with a missing mean or variance were left out of the correlations")
    }

    print("Mean topic usage vs variance topic usage, across cohorts, groups, topics, epochs (post and pre), Spearman rank correlation:")
    print(cor(usage_mean, usage_var, method = "spearman", use = "pairwise.complete.obs"))
    print("Mean topic usage vs variance topic usage, across cohorts, groups, topics, epochs (post and pre), Pearson correlation:")
    print(cor(usage_mean, usage_var, method = "pearson", use = "pairwise.complete.obs"))

    # Post-epoch usage statistics, with the upper-case sex labels brought in
    # line with the lower-case group names parsed from the contrast text.
    post_stats <- group_topic_stats %>%
        filter(epoch == "post") %>%
        mutate(group = case_when(
            group == "MALE" ~ "male",
            group == "FEMALE" ~ "female",
            TRUE ~ group
        ))

    # Cohort and group only exist inside the free-text contrast name, so they
    # are recovered by regex and normalised to the keys used in post_stats.
    # tidyr::extract is namespaced because magrittr/raster export an `extract`
    # that would silently mask it.
    contrasts <- model_tests_covid_pvals %>%
        filter(!grepl("Overall", contrast)) %>% # keep only contrast tests
        tidyr::extract(contrast,
                       into = "cohort_simple",
                       regex = ".*(COVID|PASC).*",
                       remove = FALSE) %>%
        mutate(cohort_simple = case_when(
            cohort_simple == "COVID" ~ "covid-no-pasc",
            cohort_simple == "PASC" ~ "pasc",
            TRUE ~ cohort_simple
        )) %>%
        tidyr::extract(contrast,
                       into = "group",
                       regex = ".*(Adol|Adult|Senior|Ped|Female|Male|Early|Alpha|Delta).*",
                       remove = FALSE) %>%
        mutate(group = tolower(group)) %>%
        mutate(group = case_when(
            group == "adol" ~ "adolescent",
            group == "ped" ~ "pediatric",
            TRUE ~ group
        )) %>%
        # Full outer join: the control rows come only from post_stats.
        merge(post_stats,
              by = c("cohort_simple", "group", "topic_name"),
              all = TRUE) %>%
        # Label only significant topics whose odds at least double or halve.
        mutate(plot_label = case_when(
            p_adj < 0.05 & (exp(estimate) > 2 | exp(estimate) < 0.5) ~ topic_name,
            TRUE ~ ""
        )) %>%
        # Display names for the facets.
        # NOTE(review): groups outside this list become NA - confirm intended.
        mutate(group = case_when(
            group == "adolescent" ~ "Adolescent",
            group == "pediatric" ~ "Pediatric",
            group == "adult" ~ "Adult",
            group == "senior" ~ "Senior",
            group == "early" ~ "Early",
            group == "alpha" ~ "Alpha",
            group == "delta" ~ "Delta",
            group == "male" ~ "Male",
            group == "female" ~ "Female"
        ))

    # Control usage per group/topic, renamed so it can sit next to the
    # cohort's own usage columns after the join below.
    control_data <- contrasts %>%
        filter(cohort_simple == "control") %>%
        select(group, topic_name, mean_topic_prob, var_topic_prob, count_patients) %>%
        rename(control_mean_topic_prob = mean_topic_prob,
               control_var_topic_prob = var_topic_prob,
               control_count_patients = count_patients)

    full_data <- contrasts %>%
        filter(cohort_simple != "control") %>%
        select(grouper, cohort_simple, group, topic_name, plot_label, estimate, p_adj, mean_topic_prob, var_topic_prob, count_patients) %>%
        merge(control_data, by = c("group", "topic_name"), all = TRUE) %>%
        merge(coherence_by_topic, by = "topic_name")

    # Draw the cohort-vs-control scatter for one cohort. `cohort_key` is the
    # value of cohort_simple to keep, `cohort_label` its name in the titles.
    draw_cohort_contrasts <- function(plot_data, cohort_key, cohort_label) {
        # `!!` forces the argument's value, so a data column that happens to
        # be called `cohort_key` cannot shadow it.
        cohort_rows <- plot_data %>% filter(cohort_simple == !!cohort_key)

        p <- ggplot(cohort_rows, aes(x = sum_coherence, y = estimate)) +
            geom_point(aes(size = mean_topic_prob),
                       color = "#EE220C", fill = "#EE220C", alpha = 0.5, shape = 21, stroke = 0) +
            geom_point(aes(size = control_mean_topic_prob),
                       color = "#0076BA", fill = "#0076BA", alpha = 0.5, shape = 21, stroke = 0) +
            geom_text_repel(aes(label = plot_label),
                            max.overlaps = Inf,
                            min.segment.length = 0,
                            box.padding = 2.5,
                            size = 3.0,
                            segment.color = "blue") +
            facet_wrap(~ group) +
            ggtitle(paste0("Group Contrasts, ", cohort_label, " vs. Control")) +
            scale_size_continuous(name = paste0("Group Topic Usage\nPost Phase\n(Red: ", cohort_label, ", Blue: Control)")) +
            scale_x_continuous(name = "Topic Coherence") +
            scale_y_continuous(name = "Group Contrast Effect Size (Log-Odds)")

        print(p)
    }

    draw_cohort_contrasts(full_data, "pasc", "PASC")
    draw_cohort_contrasts(full_data, "covid-no-pasc", "COVID")

    full_data
}