#* lda_model:
#*   attr:
#*     fillcolor: '3'
#*   desc: Train the model! Don't use the hold_out (validation) data.
#*   ext: py
#*   inputs:
#*   - vectorize_concepts
#*

# Now that the data is in the format needed, we can run the clustering
# algorithm.
from pyspark.ml.clustering import LDA

# Much like the vectorizer, the LDA clustering model works in two steps:
#   1) fitting      - identify the "topics" that are represented in the data
#   2) transforming - given some input data, identify which topics each
#                     record is about
#
# We would again like two outputs (the description of the topics, and some
# transformed data clustered into topics) but a transform can only send one.
# So we return the model object itself and use it for both purposes later.
#
# The to_pickle() trick cannot be used to return it: pyspark.ml models are too
# fancy to serialize that way and would raise an error. Foundry provides
# dedicated wrappers for serializing pyspark.ml (and sklearn) models, called
# Model and Stage.
#
# Note: foundry_ml is not in the default package list. Use the ML-resources
# profile to pull it into the environment, or customize the environment to
# include it.
from foundry_ml import Model, Stage


def lda_model(vectorize_concepts):
    """Fit an LDA topic model on the non-hold-out rows and return it wrapped.

    Args:
        vectorize_concepts: input data frame; this function reads its
            ``hold_out`` column (to drop validation rows) and its
            ``concept_vector`` column (the LDA features).

    Returns:
        ``Model(Stage(fitted_lda))`` - the fitted LDA model inside the
        foundry_ml wrappers, so that it can be handed to later transforms.
    """
    # Train only on rows that are not reserved for validation.
    training_rows = vectorize_concepts.filter("hold_out = false")

    # Configure the estimator, including the number of topics (aka clusters).
    estimator = LDA(
        k=300,
        featuresCol="concept_vector",
        seed=42,
        maxIter=200,
    )

    # Fitting learns the topics; it really only pays attention to the column
    # named by featuresCol.
    fitted_lda = estimator.fit(training_rows)

    # Once fit, the learned topics are available as a data frame through
    # fitted_lda.describeTopics() (e.g. print(fitted_lda.describeTopics())).
    # That is not returned here because the model is needed for other things
    # as well.
    #
    # The model itself is returned through the Foundry model-wrapping classes.
    # The next transform shows why it is Model(Stage(...)) rather than just
    # Model(...) or something else. The return value is a platform-specific
    # data type (notice its icon in the workbook).
    return Model(Stage(fitted_lda))


#################################################
## Global imports and functions included below ##
#################################################

import pickle


def to_pickle(data):
    """Pickle ``data`` into the file 'data.pickle' on the transform output.

    NOTE(review): ``Transforms`` is not defined or imported in this file -
    presumably it is supplied by the workbook runtime; confirm.
    """
    target_fs = Transforms.get_output().filesystem()
    with target_fs.open('data.pickle', 'wb') as sink:
        pickle.dump(data, sink)


def from_pickle(transform_input):
    """Load and return the object stored in 'data.pickle' on the given input.

    ``transform_input`` must expose ``filesystem()``, whose result is opened
    for binary reading. Unpickling can execute arbitrary code, so only use
    this on inputs produced by trusted transforms.
    """
    source_fs = transform_input.filesystem()
    with source_fs.open('data.pickle', 'rb') as source:
        return pickle.load(source)