--* person_table_sets: --* attr: --* fillcolor: '2' --* desc: Assignes patient IDs to train, validate, and test sets pseudorandomly. --* ext: sql --* inputs: --* - person_lda_pin --* -- Selects a pseudo-random sample of person_ids, by hashing -- the person_id an and keeping the given percent, -- keeping the given percent is done by taking the hash -- taking the hash modulo 100 (with a concatenated -- seed to get a different psuedo-random set), and then -- keeping the ones with modulo less than the percent value. -- great thing is that the process is deterministic -- by person_id: the same set of person ids will be kept -- given the same seed, so this can be re-used across -- datasets to keep the same random subset of patients. SELECT *, CASE -- 80% train/validate WHEN abs(hash(concat(person_id, 'seed'))) % 100 < 80 THEN CASE -- 20% of train/validate as validate (aka hold_out) WHEN abs(hash(hash(person_id) + 42)) % 100 < 20 THEN 'validate' ELSE 'train' END ELSE 'test' END as set, make_date(year_of_birth, month_of_birth, 15) AS birth_date FROM Person_lda_pin