#* gen_vocabulary:
#*   attr:
#*     fillcolor: '3'
#*   desc: Generate the vocabulary over all the data covering all used concept_ids, for
#*     later use in model training and usage.
#*   ext: py
#*   inputs:
#*   - LDA_arrayify_input
#* 

# Here we fit a CountVectorizer to the input and keep only the Python list holding the
# vocabulary it learned; that list is saved to this transform's output as a pickle file.

from pyspark.ml.feature import CountVectorizer

def gen_vocabulary(LDA_arrayify_input):
    """Fit a CountVectorizer on the input and pickle the learned vocabulary.

    Args:
        LDA_arrayify_input: Input passed straight to ``CountVectorizer.fit``;
            the vectorizer reads its "concept_ids" column.
            NOTE(review): presumably a Spark DataFrame whose "concept_ids"
            column is an array of strings -- confirm against the upstream
            transform.

    Returns:
        Whatever ``to_pickle`` returns (it has no return statement, so None);
        the vocabulary itself is written to the transform output instead.
    """
    # Raw (non-binary) term counts. maxDF is a fraction just below 1.0;
    # NOTE(review): per the Spark docs this should drop only terms that occur
    # in (almost) every document -- confirm that is the intent.
    vectorizer = CountVectorizer(
        inputCol="concept_ids",
        outputCol="concept_vector",
        binary=False,
        maxDF=0.999999999,
    )

    # Fitting counts term usage over the data; the fitted model exposes the
    # resulting terms as a plain list (original author's note: ordered from
    # most-frequent to least).
    terms = vectorizer.fit(LDA_arrayify_input).vocabulary
    print(terms)

    # A plain list is not a dataset, so it is handed over via to_pickle().
    return to_pickle(terms)

#################################################
## Global imports and functions included below ##
#################################################

import pickle

def to_pickle(data):
    """Pickle ``data`` into the file 'data.pickle' on the transform output.

    Args:
        data: Object to serialize with ``pickle.dump``.

    Returns:
        None. The only effect is the file written through the filesystem
        obtained from ``Transforms.get_output()``.
    """
    # ``Transforms`` is not defined in this file.
    # NOTE(review): presumably injected by the execution environment -- confirm.
    with Transforms.get_output().filesystem().open('data.pickle', 'wb') as sink:
        pickle.dump(data, sink)

def from_pickle(transform_input):
    """Load the object stored in 'data.pickle' on the given transform input.

    Args:
        transform_input: Object exposing ``filesystem()``, whose result can
            ``open('data.pickle', 'rb')`` as a context-managed binary file.

    Returns:
        The object deserialized by ``pickle.load``.
    """
    input_fs = transform_input.filesystem()

    # NOTE(review): pickle.load can execute arbitrary code; this assumes the
    # file was produced by a trusted upstream step (e.g. to_pickle) -- confirm.
    with input_fs.open('data.pickle', 'rb') as handle:
        return pickle.load(handle)