--* top_1000_terms_per_topic:
--*   attr:
--*     fillcolor: '3'
--*   desc: For data i/o performance select the top 1000 terms per topic by weight for
--*     plotting.
--*   ext: sql
--*   inputs:
--*   - renamed_OUTPUT_topic_descriptions
--*

-- Purpose: keep only the 1000 highest-weighted terms of each topic so that the
-- downstream topic-cloud plots read a small dataset (loading even medium-size
-- datasets into R has been unreliable, and this is easier in SQL than SparkR).

-- Rank the terms within each topic, heaviest first. Only the columns that are
-- emitted below are carried through the window, instead of every upstream column.
WITH ranked_terms AS (
    SELECT
        topic_name,
        term_weight,
        relevance,
        concept_id,
        concept_name,
        ROW_NUMBER() OVER (
            PARTITION BY topic_name
            -- concept_id / concept_name only break ties between equal weights so
            -- that the 1000-row cutoff is repeatable from run to run.
            -- NOTE(review): assumes these identify a term within a topic - confirm.
            ORDER BY term_weight DESC, concept_id, concept_name
        ) AS term_rank
    FROM renamed_OUTPUT_topic_descriptions
)

-- The COALESCE(1) hint asks Spark to reduce the result to a single partition.
SELECT /*+ COALESCE(1) */
    topic_name,
    term_weight,
    relevance,
    concept_id,
    concept_name
FROM ranked_terms
WHERE term_rank <= 1000