#* person_topics_arrayify_epochs:
#*   attr:
#*     fillcolor: '4'
#*   desc: Shape model output of topic predictions for patient-epochs for use.
#*   ext: py
#*   inputs:
#*   - person_topic_assignment_epochs
#*

# The newly generated topicDistribution column contains, for each person, the
# probabilities that their concepts are drawn from each "topic". This is
# returned as a sparse vector, a very pyspark-specific column type (under the
# hood it uses Spark's "struct" data type plus custom functions that operate
# on structs of that shape). Fortunately there's a conversion function that
# turns such a sparse vector back into a regular array column.
from pyspark.ml.functions import vector_to_array


def person_topics_arrayify_epochs(person_topic_assignment_epochs):
    # While we're at it, let's drop the other columns - we just want the
    # distribution of topics per person_id.
    result = (
        person_topic_assignment_epochs
        .withColumn("topic_dist_array", vector_to_array("topicDistribution"))
        .drop("concept_vector")
        .drop("topicDistribution")
        .drop("concept_names")
    )
    # In this output, the topic_dist_array column contains an array of the
    # weights associated with each topic.
    return result


#################################################
## Global imports and functions included below ##
#################################################

import pickle


# Helpers for persisting arbitrary Python objects between transforms via
# pickle; `Transforms` is provided globally by the execution environment.
def to_pickle(data):
    output = Transforms.get_output()
    output_fs = output.filesystem()
    with output_fs.open('data.pickle', 'wb') as f:
        pickle.dump(data, f)


def from_pickle(transform_input):
    with transform_input.filesystem().open('data.pickle', 'rb') as f:
        data = pickle.load(f)
    return data
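
# ---------------------------------------------------------------------------
# Illustrative sketches (commented out so they don't run as part of the
# transform). These are hedged examples, not pipeline code: the `spark`
# session, the toy data, and the names `toy`, `trained_model`, and `upstream`
# are assumptions for demonstration only.
#
# 1) What vector_to_array does, shown on a toy DataFrame with a single
#    vector column:
#
#     from pyspark.ml.linalg import Vectors
#     from pyspark.ml.functions import vector_to_array
#
#     toy = spark.createDataFrame(
#         [(Vectors.sparse(3, [0, 2], [0.25, 0.75]),)],  # 3 slots, two nonzero
#         ["topicDistribution"],
#     )
#     toy.withColumn("topic_dist_array",
#                    vector_to_array("topicDistribution")).show(truncate=False)
#     # topic_dist_array -> [0.25, 0.0, 0.75]  (a plain array<double> column)
#
# 2) How the pickle helpers above would typically be used across transforms:
#
#     to_pickle(trained_model)               # in one transform: writes
#                                            # data.pickle to its output dataset
#     trained_model = from_pickle(upstream)  # in a downstream transform, where
#                                            # `upstream` is that dataset's
#                                            # input handle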