#* vectorize_concepts_epochs:
#*   attr:
#*     fillcolor: '4'
#*   desc: Shaping for model prediction.
#*   ext: py
#*   inputs:
#*   - LDA_arrayify_input_epochs
#*   - gen_vocabulary
#*

# now we can do part 2 of the vectorization process, using the vocab list
# to generate the term usage vectors
from pyspark.ml.feature import CountVectorizerModel

def vectorize_concepts_epochs(LDA_arrayify_input_epochs, gen_vocabulary):
    # read in the python list of the vocab
    vocab_list = from_pickle(gen_vocabulary)

    # fortunately it's possible to build a vectorizer from a pre-computed vocab
    # list such as we have. I'm also using binary = True to tell it that when
    # counting occurrences we don't really need counts, just presence/absence
    # (better for EHR data? you decide...)
    cvmodel = CountVectorizerModel.from_vocabulary(
        vocab_list,
        inputCol="concept_ids",
        outputCol="concept_vector",
        binary=True)  # maxDF already incorporated...(?)

    # now we do the 'transform' (vectorization) and return it
    transformed_data = cvmodel.transform(LDA_arrayify_input_epochs)
    return transformed_data

#################################################
## Global imports and functions included below ##
#################################################

import pickle

def to_pickle(data):
    # write a python object as 'data.pickle' to this node's output filesystem
    # (Transforms is provided by the Code Workbook environment)
    output = Transforms.get_output()
    output_fs = output.filesystem()
    with output_fs.open('data.pickle', 'wb') as f:
        pickle.dump(data, f)

def from_pickle(transform_input):
    # read 'data.pickle' back from an upstream node's filesystem
    with transform_input.filesystem().open('data.pickle', 'rb') as f:
        data = pickle.load(f)
    return data
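
#################################################
## Illustrative sketch (not part of the graph) ##
#################################################

# A minimal sketch of what CountVectorizerModel.from_vocabulary produces,
# assuming a local SparkSession; the toy vocab and DataFrame are hypothetical
# and don't exist in the pipeline above. Wrapped in a function the workbook
# never calls, so it only runs if invoked manually.

def _vectorizer_usage_sketch():
    from pyspark.sql import SparkSession
    from pyspark.ml.feature import CountVectorizerModel

    spark = SparkSession.builder.getOrCreate()

    # hypothetical vocab: index 0 -> 'c1', 1 -> 'c2', 2 -> 'c3'
    toy_vocab = ['c1', 'c2', 'c3']
    toy_df = spark.createDataFrame(
        [(['c1', 'c3', 'c3'],), (['c2'],)],
        ['concept_ids'])

    toy_model = CountVectorizerModel.from_vocabulary(
        toy_vocab,
        inputCol='concept_ids',
        outputCol='concept_vector',
        binary=True)

    # with binary = True the repeated 'c3' still maps to 1.0, so the rows
    # come out as SparseVectors (3,[0,2],[1.0,1.0]) and (3,[1],[1.0])
    toy_model.transform(toy_df).show(truncate=False)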