#* person_topics_arrayify_epochs:
#*   attr:
#*     fillcolor: '4'
#*   desc: Shape model output of topic predictions for patient-epochs for use.
#*   ext: py
#*   inputs:
#*   - person_topic_assignment_epochs
#*

# The newly generated topicDistribution column contains, for each person, the
# probabilities that their concepts are drawn from each "topic". This is
# returned as a sparse vector, a very pyspark-specific column type (under the
# hood it uses Spark's "struct" data type plus custom functions that operate
# on structs of that shape). Fortunately there's a conversion function that
# turns such a sparse vector back into a regular array column.
from pyspark.ml.functions import vector_to_array


def person_topics_arrayify_epochs(person_topic_assignment_epochs):
    # While we're at it, let's drop the other columns - we just want the
    # distribution of topics per person_id.
    result = (
        person_topic_assignment_epochs
        .withColumn("topic_dist_array", vector_to_array("topicDistribution"))
        .drop("concept_vector")
        .drop("topicDistribution")
        .drop("concept_names")
    )
    # In this output, the topic_dist_array column contains an array of the
    # weights associated with each topic.
    return result


#################################################
## Global imports and functions included below ##
#################################################

import pickle


# Helpers for persisting arbitrary Python objects between transforms via
# pickle; `Transforms` is provided globally by the execution environment.
def to_pickle(data):
    output = Transforms.get_output()
    output_fs = output.filesystem()
    with output_fs.open('data.pickle', 'wb') as f:
        pickle.dump(data, f)


def from_pickle(transform_input):
    with transform_input.filesystem().open('data.pickle', 'rb') as f:
        data = pickle.load(f)
    return data
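
# ---------------------------------------------------------------------------
# Illustrative sketches (commented out so they don't run as part of the
# transform). These are hedged examples, not pipeline code: the `spark`
# session, the toy data, and the names `toy`, `trained_model`, and `upstream`
# are assumptions for demonstration only.
#
# 1) What vector_to_array does, shown on a toy DataFrame with a single
#    vector column:
#
#     from pyspark.ml.linalg import Vectors
#     from pyspark.ml.functions import vector_to_array
#
#     toy = spark.createDataFrame(
#         [(Vectors.sparse(3, [0, 2], [0.25, 0.75]),)],  # 3 slots, two nonzero
#         ["topicDistribution"],
#     )
#     toy.withColumn("topic_dist_array",
#                    vector_to_array("topicDistribution")).show(truncate=False)
#     # topic_dist_array -> [0.25, 0.0, 0.75]  (a plain array<double> column)
#
# 2) How the pickle helpers above would typically be used across transforms:
#
#     to_pickle(trained_model)               # in one transform: writes
#                                            # data.pickle to its output dataset
#     trained_model = from_pickle(upstream)  # in a downstream transform, where
#                                            # `upstream` is that dataset's
#                                            # input handle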