#* analysis_r_code:
#*   attr:
#*     fillcolor: cyan
#*   desc: 'The code in this node is available globally to all nodes downstream of it,
#*     and contains much of the statistical and visualization functions. (Developer note:
#*     It is not actually executed, but a copy of the global code section of the code
#*     workbook which is not currently parsed by the export parser consistently for some
#*     reason.)'
#*   ext: R
#*   inputs: []
#* 

analysis_r_code <- function() {

    # wrapper function that just calls run_model 
    # designed to catch and handle errors, and be friendly to sparkr gapply() (never got that to work)
    wrapper <- function(key_list, data_df) {
        library(tidyr)
        library(dplyr)
        library(geepack)
        library(emmeans)
        library(dplyr)
        library(stringr)
        # test <- data.frame(topic = unique(data_df$topic_name))
        # test$
        topics <- unique(data_df$topic_name)
        # return(test)
        cat("\n\nRunning topic: ", key_list[[1]], " at ", as.character(Sys.time()), "\n")

        model_results <- run_model(data_df)
        model_results$topic_name <- key_list[[1]]
        return(model_results)

        result <- tryCatch({
            model_results <- run_model(data_df)
            model_results$topic_name <- key_list[[1]]
            # I guess we cannot use return() in the expression here for tryCatch
            model_results
        }, error=function(cond){
            cat("Got error, returning no result: ", as.character(cond), "\n")
            return(NULL)
        }, finally={
            #cat("Finished with topic ", key_list[[1]], "\n\n")
        })

        return(result)
    }

    # run the model and contrasts with data for a single topic
    run_model <- function(data_df) {
        #library(PRROC)

        # make sure data in right format
        data_df$data_partner_id <- factor(data_df$data_partner_id)
        data_df$cci_score_quan <- as.integer(data_df$cci_score_quan)

        # set reference levels that make sense
        data_df$index_wave <- relevel(factor(data_df$index_wave), ref = "early")
        data_df$life_stage_simple <- relevel(factor(data_df$life_stage_simple), ref = "adult")
        data_df$cdm_name <- relevel(factor(data_df$cdm_name), ref = "OMOP")
        data_df$cohort_simple <- relevel(factor(data_df$cohort_simple), ref = "control")
        data_df$race <- relevel(factor(data_df$race), ref = "White")

        # compute per-data-partner covariates for percent of patients w/ PASC and percent of patients w/ COVID
        # (amongst this complete test data)

        # input: data frame from data_df with only one data partner
        compute_pasc_density <- function(sub_df) {
            # each patient is represented twice, by a pre datapoint and a post datapoint, hence the unique()s
            total_persons <- length(unique(sub_df$person_id))
            total_pasc <- sub_df %>% filter(cohort_simple == "pasc") %>% pull(person_id) %>% unique() %>% length() 
            total_covid <- sub_df %>% filter(cohort_simple == "covid-no-pasc") %>% pull(person_id) %>% unique() %>% length() 
            return(data.frame(
                percent_pasc = total_pasc / total_persons,
                percent_covid = total_covid / total_persons
            ))
        }

        data_df_by_dp <- data_df %>% group_by(data_partner_id)
        dp_pasc_covid_densities <- do(data_df_by_dp, compute_pasc_density(.))

        data_df <- merge(data_df, dp_pasc_covid_densities, by = "data_partner_id", all.x = TRUE)

        # now let's compute the relative usage of this topic by data partner, which is defined, for a given data partner,
        # as the sum of weights assigned to the topic divided by the number of data points (and so would be 1.0 if all data points at
        # that data partner are entirely within this topic). Note that data points are pre and post for each patient, so we're summing pre and post
        # probabilities and dividing by number of patients * 2

        # patients aren't duplicated within a topic so don't really have to unique() here 
        data_point_counts_by_dp <- data_df %>% group_by(data_partner_id) %>% summarize(num_records = length(person_id)) 
        sum_weights_by_dp <- data_df %>% group_by(data_partner_id) %>% summarize(sum_weight = sum(topic_prob))
        together <- merge(data_point_counts_by_dp, sum_weights_by_dp, by = "data_partner_id") %>%
        mutate(dp_relative_topic_usage = sum_weight / num_records) %>%
        select(data_partner_id, dp_relative_topic_usage)
        data_df <- merge(data_df, together, by = "data_partner_id", all.x = TRUE)
        

        # apparently the input should be ordered by cluster:
        # https://stackoverflow.com/questions/51581103/gee-regression-gives-nan-values-as-result-in-a-dataframe-subset
        # this is also menteiond in the docs: https://www.rdocumentation.org/packages/geepack/versions/1.3.4/topics/geeglm
        # I didn't get any warnings so it's already sorted that way, but just in case:
        data_df <- data_df[order(data_df$person_id, data_df$epoch),]

        print("Structure of input data:")
        str(data_df)
        # consideration w/ Brenda and Ken; subset to data partners w/ enough data to build an effective model w/ data_partner_id as random effective
        # or control for data partner level covariates (e.g. does_data_partner_have_any_pasc_patients, maybe not this exactly, or an interaction term w/ it in the )
        model <- geeglm(topic_prob ~ 
                                epoch * cohort_simple * (index_wave + gender + life_stage_simple) + 
                                percent_pasc * epoch * cohort_simple + 
                                dp_relative_topic_usage +
                                race + 
                                cci_score_quan +
                                bmi +
                                #data_partner_id, # fails convergence, causes complete/quasi-complete separation :/ 
                                cdm_name,
                family=binomial(),
                # for repeated measures (two measures, exchangeable corstr)
                id = person_id, 
                corstr="exchangeable",
                # weight all epochs of all patients equally
                # - weighting by actual count of conditions per patient/epoch results in much higher
                #   weight for high-utilization (very sick) patients
                weights = rep(1, nrow(data_df)),
                scale.fix = FALSE,                 # check w/ Ken or Brenda; possible TODO: output the fitted scale estimate
                data = data_df)
        

        # this is going to collect a list of contrast results, each represented by a 
        # data frame of rows for multiple contrasts (so we have groups of contrasts for different purposes)
        all_contrasts_list <- list()

        #####
        ##### Overall pre to post
        #####

        em <- emmeans(model, specs = ~ epoch)
        # generate the contrast vectors from the marginal means spec
        # (see comments on make_var_vectors())
        v <- make_var_vectors(em)

        # emmeans can run multiple contrasts given as a named list of
        # contrast vector combinations; do_contrasts() calls it and parses the output 
        # returning a data frame
        #   btw: I end up needing to parse these names, but I made them hard to parse.
        #        This code takes ~16 hours to run, so it was faster to just write the hard parsing code
        tests <- list(
            `Overall: Post / Pre` = v$'post' - v$'pre',
            `Overall: Post Pre Mean` = (v$'post' + v$'pre')/2
        )
        # add the resulting data frame to the list of contrast results.
        # we wrap the result of do_contrasts() in list() so that we can 
        # concatenate the element to the growing list (otherwise
        # c() treats the returned dataframe as a list and all hell breaks loose)
        all_contrasts_list <- c(all_contrasts_list, list(do_contrasts(em, v, tests, "Overall: Post Pre")))

        #####
        ##### Gender-related contrasts
        #####

        em <- emmeans(model, specs = ~ gender)
        # generate the contrast vectors from the marginal means spec
        # (see comments on make_var_vectors())
        v <- make_var_vectors(em)

        # emmeans can run multiple contrasts given as a named list of
        # contrast vector combinations; do_contrasts() calls it and parses the output 
        # returning a data frame
        #   btw: I end up needing to parse these names, but I made them hard to parse.
        #        This code takes ~16 hours to run, so it was faster to just write the hard parsing code
        tests <- list(
            `Overall: Female / Male` = v$'FEMALE' - v$'MALE'
        )
        # add the resulting data frame to the list of contrast results.
        # we wrap the result of do_contrasts() in list() so that we can 
        # concatenate the element to the growing list (otherwise
        # c() treats the returned dataframe as a list and all hell breaks loose)
        all_contrasts_list <- c(all_contrasts_list, list(do_contrasts(em, v, tests, "Gender Checks")))

        ##### Gender-related contrasts
        em <- emmeans(model, specs = ~ cohort_simple + gender + epoch)
        v <- make_var_vectors(em)

        tests <- list(
            `Migration: Female COVID+ / Control` = (v$'covid-no-pasc:FEMALE:post' - v$'covid-no-pasc:FEMALE:pre') - 
                                    (v$'control:FEMALE:post' - v$'control:FEMALE:pre'),
            `Migration: Male COVID+ / Control` = (v$'covid-no-pasc:MALE:post' - v$'covid-no-pasc:MALE:pre') - 
                                    (v$'control:MALE:post' - v$'control:MALE:pre'),

            `Migration: Female PASC+ / Control` = (v$'pasc:FEMALE:post' - v$'pasc:FEMALE:pre') - 
                                    (v$'control:FEMALE:post' - v$'control:FEMALE:pre'),
            `Migration: Male PASC+ / Control` = (v$'pasc:MALE:post' - v$'pasc:MALE:pre') - 
                                    (v$'control:MALE:post' - v$'control:MALE:pre')
        )
        all_contrasts_list <- c(all_contrasts_list, list(do_contrasts(em, v, tests, "Gender Contrasts")))

        #####
        ##### Life-stage -related contrasts
        #####
        em <- emmeans(model, specs = ~ cohort_simple + life_stage_simple + epoch)
        v <- make_var_vectors(em)

        tests <- list(
            # COVID+ (no PASC)

            `Migration: COVID+ Ped. / Control Ped.` = (v$'covid-no-pasc:pediatric:post' - v$'covid-no-pasc:pediatric:pre') -
                                                    (v$'control:pediatric:post' - v$'control:pediatric:pre'),

            `Migration: COVID+ Adol. / Control Adol.` = (v$'covid-no-pasc:adolescent:post' - v$'covid-no-pasc:adolescent:pre') -
                                                    (v$'control:adolescent:post' - v$'control:adolescent:pre'),

            `Migration: COVID+ Adult / Control Adult` = (v$'covid-no-pasc:adult:post' - v$'covid-no-pasc:adult:pre') -
                                                (v$'control:adult:post' - v$'control:adult:pre'),

            `Migration: COVID+ Senior / Control Senior` = (v$'covid-no-pasc:senior:post' - v$'covid-no-pasc:senior:pre') -
                                                (v$'control:senior:post' - v$'control:senior:pre'),

            # PASC+ (possibly COVID+ as well)

            `Migration: PASC+ Ped. / Control Ped. ` = (v$'pasc:pediatric:post' - v$'pasc:pediatric:pre') -
                                                (v$'control:pediatric:post' - v$'control:pediatric:pre'),

            `Migration: PASC+ Adol. / Control Adol.` = (v$'pasc:adolescent:post' - v$'pasc:adolescent:pre') -
                                                    (v$'control:adolescent:post' - v$'control:adolescent:pre'),

            `Migration: PASC+ Adult / Control Adult` = (v$'pasc:adult:post' - v$'pasc:adult:pre') -
                                                (v$'control:adult:post' - v$'control:adult:pre'),

            `Migration: PASC+ Senior / Control Senior` = (v$'pasc:senior:post' - v$'pasc:senior:pre') -
                                                (v$'control:senior:post' - v$'control:senior:pre')
        )
        all_contrasts_list <- c(all_contrasts_list, list(do_contrasts(em, v, tests, "Life Stage Contrasts")))
        

        # #####
        # ##### Life-stage -related contrasts, sanity check
        #####
        em <- emmeans(model, specs = ~ life_stage_simple)
        v <- make_var_vectors(em)

        tests <- list(
            `Overall: Pediatric / Adult` = v$'pediatric' - v$'adult',

            `Overall: Adolescent / Adult` = v$'adolescent' - v$'adult',

            `Overall: Senior / Adult` = v$'senior' - v$'adult'

        )
        all_contrasts_list <- c(all_contrasts_list, list(do_contrasts(em, v, tests, "Life Stage Checks")))
        

        #####
        ##### Wave-related contrasts
        #####
        em <- emmeans(model, specs = ~ cohort_simple + index_wave + epoch)
        v <- make_var_vectors(em)

        tests <- list(
            # COVID+

            `Migration: COVID+ Early / Control Early` = (v$'covid-no-pasc:early:post' - v$'covid-no-pasc:early:pre') -
                                                (v$'control:early:post' - v$'control:early:pre'),

            `Migration: COVID+ Alpha / Control Alpha` = (v$'covid-no-pasc:alpha:post' - v$'covid-no-pasc:alpha:pre') -
                                                (v$'control:alpha:post' - v$'control:alpha:pre'),

            `Migration: COVID+ Delta / Control Delta` = (v$'covid-no-pasc:delta:post' - v$'covid-no-pasc:delta:pre') - 
                                                (v$'control:delta:post' - v$'control:delta:pre'),

            # # PASC+
            `Migration: PASC+ Early / Control Early` = (v$'pasc:early:post' - v$'pasc:early:pre') -
                                                (v$'control:early:post' - v$'control:early:pre'),

            `Migration: PASC+ Alpha / Control Alpha` = (v$'pasc:alpha:post' - v$'pasc:alpha:pre') -
                                                (v$'control:alpha:post' - v$'control:alpha:pre'),

            `Migration: PASC+ Delta / Control Delta` = (v$'pasc:delta:post' - v$'pasc:delta:pre') - 
                                                (v$'control:delta:post' - v$'control:delta:pre')#,
        )
        all_contrasts_list <- c(all_contrasts_list, list(do_contrasts(em, v, tests, "Wave Contrasts")))

        #####
        ##### Has pasc patients tests
        #####

        # em <- emmeans(model, specs = ~ has_pasc_patients)
        # # generate the contrast vectors from the marginal means spec
        # # (see comments on make_var_vectors())
        # v <- make_var_vectors(em)

        # # emmeans can run multiple contrasts given as a named list of
        # # contrast vector combinations; do_contrasts() calls it and parses the output 
        # # returning a data frame
        # #   btw: I end up needing to parse these names, but I made them hard to parse.
        # #        This code takes ~16 hours to run, so it was faster to just write the hard parsing code
        # tests <- list(
        #     `Overall: has pasc` = v$'TRUE' - v$'FALSE'
        # )
        # # add the resulting data frame to the list of contrast results.
        # # we wrap the result of do_contrasts() in list() so that we can 
        # # concatenate the element to the growing list (otherwise
        # # c() treats the returned dataframe as a list and all hell breaks loose)
        # all_contrasts_list <- c(all_contrasts_list, list(do_contrasts(em, v, tests, "Overall: has pasc")))

        # now combine all the results data frames into one and return it
        ret_df <- do.call(rbind, all_contrasts_list)
        return(ret_df)
    }

    # create forest-plot-like plot. takes all the topic results from upstream and a specification of the subset of topics to plot. Contrast filter is used to specify
    # which group of contrasts to plot (sanity checks, contrasts for PASC cohort, or 
    # contrasts for COVID cohort)
    # unique_topics is the ordered full list of unique topic names, and could 
    # be computed here rather than passed as a separate var 
    do_plot_clean <- function(df, unique_topics, topics_subset, contrast_filter = "Migration", title = "Odds Ratios") {

        # some topics the model diverged (small groups being fully seperable)
        # we don't want to drop them altogether (or we'd lose the facet in later plots), just
        # replace entries with NA so the lines/points aren't plotted
        failed_convergence_topics <- df %>% 
                                        filter(abs(estimate) > 1000) %>%
                                        pull(topic_name) %>%
                                        unique()

        # create a contrast_renamed column for plotting
        unique_contrast_group_combos <- df[, c("contrast", "group")] %>% 
            unique() %>%
            arrange(group) %>%
            mutate(contrast_renamed = str_replace(contrast, 
                                        "^Migration:.+((Early)|(Delta)|(Alpha)|(Ped.)|(Adult)|(Adol.)|(Male)|(Female)|(Senior)).*$",
                                        "\\1")) %>%
            mutate(contrast_renamed = str_replace(contrast_renamed, 
                                    "(Overall: )",
                                        "")) %>%
            mutate(contrast_renamed = case_when(
                contrast_renamed == "Ped." ~ "Pediatric",
                contrast_renamed == "Adol." ~ "Adolescent",
                TRUE ~ contrast_renamed
            ))

        df <- df %>%
                # for topics that failed convergence, we don't want to plot the lines, 
                # but we don't want to drop them altogether (or we'd lose the facet), so just
                # replace entries with NA so the lines/points aren't plotted
                mutate(p_adj = ifelse(topic_name %in% failed_convergence_topics, NA, p_adj)) %>%
                mutate(estimate = ifelse(topic_name %in% failed_convergence_topics, NA, estimate)) %>%
                mutate(estimate_exp = ifelse(topic_name %in% failed_convergence_topics, NA, exp(estimate))) %>%
                mutate(asymp_LCL = ifelse(topic_name %in% failed_convergence_topics, NA, asymp_LCL)) %>%
                mutate(asymp_UCL = ifelse(topic_name %in% failed_convergence_topics, NA, asymp_UCL)) %>%
                mutate(p_value = ifelse(topic_name %in% failed_convergence_topics, NA, p_value)) %>%
                mutate(p_adj_small = ifelse(topic_name %in% failed_convergence_topics, NA, p_adj_small)) %>%

                # filter to topics of interest
                filter(topic_name %in% topics_subset) %>% 
                # order the panels by topic number
                mutate(topic_name = factor(topic_name, levels = unique_topics)) %>%
                # keep only the contrast groups we asked for
                filter(str_detect(contrast, contrast_filter)) %>%
                filter(contrast != "Overall: Post Pre Mean" & contrast != "Overall: Post / Pre") %>%
                # give the contrasts nicer names for plotting
                mutate(contrast = factor(contrast, 
                                        levels = unique_contrast_group_combos$contrast,
                                        labels = unique_contrast_group_combos$contrast_renamed))

        df$contrast <- factor(df$contrast, levels = c("Female", 
                                                    "Male", 
                                                    "Pediatric",
                                                    "Adolescent",
                                                    "Adult",
                                                    "Senior",
                                                    "Early",
                                                    "Alpha", 
                                                    "Delta",
                                                    "Post / Pre",
                                                    "Female / Male",
                                                    "Pediatric / Adult",
                                                    "Adolescent / Adult",
                                                    "Senior / Adult"),
                                            ordered =TRUE)

        print(summary(df))

        p <- ggplot(df) +
            annotation_raster(alpha("black", 0.05), xmin = -Inf, xmax = Inf, ymin = 3.5, ymax = 7.5) +
            geom_vline(xintercept = 1, linetype = "dashed", color = "#FF8888") + 
            geom_linerange(aes(xmin = exp(asymp_LCL), 
                            xmax = exp(asymp_UCL), 
                            y = contrast, 
                            color = group,
                            #color = contrast,
                            group = group, 
                            alpha = p_adj_small)) + 
            geom_point(aes(x = exp(estimate), 
                        y = contrast, 
                        color = group,
                        #color = contrast,
                        group = group, 
                        alpha = p_adj_small)) +
            scale_x_continuous(name = "Odds Ratio (95% CI)", trans = "log10", labels = label_number()) +
            scale_y_discrete(limits = rev) + # flip the y axis to put female/male on top
            annotation_logticks(sides = "b") +
            facet_wrap( ~ topic_name, scales = "free_x") +
            theme_bw(7) +
            theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
            scale_color_brewer(palette = "Set1") +
            #scale_color_manual(values = c(
            #     "Female" = "pink",
            #     "Male" = "blue",
            #     "Pediatric" = "green",
            #     "Adolescent" = "blue",
            #     "Adult" = "red",
            #     "Senior" = "orange",
            #     "Early" = "blue",
            #     "Alpha" = "green",
            #     "Delta" = "orange" 
            # )) +
            scale_alpha_discrete(name = "p < 0.05 (Holm corrected)", range = c(0.25,1)) +
            ggtitle(title) +
            theme(strip.text.x = element_text(size = 6, margin(0, 0, 0, 0, "cm"))) +
            theme(aspect.ratio = 0.75)
    
        plot(p)
        return(p)

    }

    # this page describes emmeans custom contrasts
    # using 1-hot encoded vectors: https://aosmith.rbind.io/2019/04/15/custom-contrasts-emmeans/
    # this function automates the process, resulting in a list of these vectors
    # indexed by level name of the the contrast variable
    ## NOTE: since the names of the vectors are determined by the levels of the 
    ##  contrast variable, use alphanumeric+_- levels, or be willing to access as e.g. vars$`Level 1`
    make_var_vectors <- function(em) {
    # use contrast() from emmeans to compute the contrasts, 
    # grabbing the result as dataframe and grabbing the first column of 
    # contrast names, in order as per 
    #contrast_names <- as.data.frame(em)[[1]] %>% as.character()
    paste_colon <- function(...) {
        return(paste(..., sep = ":"))
    }
    contrast_names <- as.data.frame(em) %>% 
                        select(-emmean, -SE, -asymp.LCL, -asymp.UCL, -df) %>% 
                        lapply(as.character) %>% 
                        do.call(paste_colon, .)
    
    n_contrasts <- length(contrast_names)
    
    # create a list of one-hot vectors in increasing order
    var_vectors <- 1:n_contrasts %>% lapply(function(index) {
                                                base_vec <- rep(0, n_contrasts) # start with the right number of 0s
                                                base_vec[index] <- 1            # set the current index to one
                                                return(base_vec)
                                            }) 
    
    # name the list elements so they can be accessed by name
    names(var_vectors) <- contrast_names
    return(var_vectors)
    }

    # given an emmeans object, v (result of make_var_vectors(em), and tests
    # (a named list of tests encoded as vectors), returns a single-row dataframe
    # with stats on the tests run. A group name can be given to indicate which 'group'
    # these contrasts belong to (e.g. life stage tests, wave tests, etc.)
    do_contrasts <- function(em, v, tests, group = NA) {  
        print("Running contrasts:")
        print(tests)
        print(em)  
        contrasts <- contrast(em, method = tests)

        contrasts_confint_df <- contrasts %>%
            confint() %>%
            as.data.frame() %>%
            select(contrast, estimate, asymp.LCL, asymp.UCL) 

        contrasts_pval_df <- contrasts %>%
            test() %>%
            as.data.frame() %>%
            select(contrast, p.value)

        res <- merge(contrasts_pval_df, contrasts_confint_df, by = "contrast")
        res$group <- group
        res$contrast <- as.character(res$contrast)

        return(res)
    }

    # given a *list* of equal-length vectors, computes the element-wise mean
    # (not used currently)
    vmean_list <- function(vec_list) {
    mat <- do.call(cbind, vec_list)
    return(rowMeans(mat))
    }

    # given a bunch of vectors of the same length (not in a list), returns an
    # element-wise mean vector
    # (not used currently)
    vmean <- function(...) {
    vec_list <- list(...)
    mat <- do.call(cbind, vec_list)
    return(rowMeans(mat))
    }

    to_rdata <- function(data) {
        output <- new.output()
        output_fs <- output$fileSystem()
        saveRDS(data, output_fs$get_path("data.rds", 'w'))
    }

    from_rdata <- function(data) {
        fs <- data$fileSystem()

        path <- fs$get_path("data.rds", 'r')
        rds <- readRDS(path)
        return(rds)
    }

    return(NULL)
}