--* OUTPUT_topic_descriptions:
--*   attr:
--*     fillcolor: '3'
--*   desc: Join nicer topic names to the topic descriptions. Additionally compute relevance
--*     metric for each term/topic pair and join in concept metadata.
--*   ext: sql
--*   inputs:
--*   - INPUT_persons_datapartners_concepts
--*   - concept_lda_pin
--*   - topic_description_decoded_unzipped
--* 

WITH topic_info AS (
    --- 3: from the join, pick these cols
    SELECT topic_table.topic_index, 
           topic_table.topic_name, 
           topic_table.term_weight,
           topic_table.concept_id,
           concept_lda_pin.concept_name, 
           concept_lda_pin.domain_id, 
           concept_lda_pin.concept_class_id, 
           concept_lda_pin.vocabulary_id
    FROM
    -- 1: unpack the input structs containing term weights and concept_ids
    (SELECT topic_index, topic_name,
        topicInfoStructs.termWeights AS term_weight,
        int(topicInfoStructs.termTerms) AS concept_id
    FROM topic_description_decoded_unzipped) AS topic_table
    -- 2: then join it on the concept_lda_pin table to get metadata
    LEFT JOIN concept_lda_pin ON topic_table.concept_id = concept_lda_pin.concept_id
), 

concept_global_prob AS (
        SELECT 
          group_counts.concept_id,
          -- 6: divide the concept_id counts by the total number of concepts to compute the percentage of data points that are that concept_lda_pin
          --   (its global probability)
          group_counts.count_concept_id / 
            (
                -- 4: compute the total number of rows and add it as a constant colum for later math
                SELECT count(*) FROM INPUT_persons_datapartners_concepts
            ) as concept_global_prob
        FROM (
          -- 5: count input concept_ids
          SELECT concept_id, count(*) as count_concept_id
          FROM INPUT_persons_datapartners_concepts
          GROUP BY concept_id) as group_counts
    ) 

-- 11: finally, we can compute topic relevance for the term within the topic, as a linear combination
--     (weighted by a gamma parameter) between basic term weight in the topic (but log'd) and 
--     the log of the probability of the term in the topic over the global probability of the term (aka lift)
-- 12: whew!
SELECT /*+ COALESCE(1) */ topic_index, 
       topic_name, 
       term_weight, 
       term_weight / term_sum_weight as term_weight_relative,
       0 * log2(term_weight) + (1.0 - 0) * log2(term_weight/concept_global_prob) as relevance,
       concept_id, 
       concept_name, 
       domain_id, 
       concept_class_id, 
       vocabulary_id
FROM (
    -- 10: from those keep the basic info about concepts for each topic, 
    --     as well as the sum of term weights across topics (same in multiple rows with the same concept_id)
    --     and the global probability of the concept_id (same in multiple rows with the same concept_id)
    SELECT topic_info.topic_index, 
           topic_info.topic_name,
           topic_info.term_weight,
           topic_info.concept_name, 
           topic_info.concept_id, 
           topic_info.domain_id, 
           topic_info.concept_class_id, 
           topic_info.vocabulary_id, 
           summed.term_sum_weight,
           concept_global_prob.concept_global_prob
    -- 8: join it with topic_info which contains a bunch of other info per topic and concept_id
    FROM topic_info
    JOIN (
        -- 7: from topic_info, for each concept_id, sum the weights given to it across all topics 
        --    (for use in later calculations of relevance)
        SELECT 
            concept_id, 
            SUM(term_weight) AS term_sum_weight
        FROM topic_info 
        GROUP BY concept_id) AS summed
    ON topic_info.concept_id = summed.concept_id 
    -- 9: also join in the global probability for concepts
    JOIN concept_global_prob ON topic_info.concept_id = concept_global_prob.concept_id)