diff --git a/scripts/westminster_gnomad_folds.py b/scripts/westminster_gnomad_folds.py new file mode 100644 index 0000000..00a1fe1 --- /dev/null +++ b/scripts/westminster_gnomad_folds.py @@ -0,0 +1,491 @@ +#!/usr/bin/env python +# Copyright 2023 Calico LLC + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# https://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ========================================================================= +from optparse import OptionParser, OptionGroup +import glob +import pickle +import pdb +import os + +import h5py +import numpy as np + +import slurm + +import westminster +from westminster.multi import collect_scores, nonzero_h5 + +""" +westminster_gnomad_folds.py + +Benchmark Baskerville model replicates on Gnomad common vs. rare classification task. +""" + +################################################################################ +# main +################################################################################ +def main(): + usage = 'usage: %prog [options] ' + parser = OptionParser(usage) + + # snp options + snp_options = OptionGroup(parser, 'hound_snp.py options') + parser.add_option("-c", dest="cluster_snps_pct", + default=0, type="float", + help="Cluster SNPs within a %% of the seq length to make a single ref pred [Default: %default]") + snp_options.add_option('-f', dest='genome_fasta', + default='/home/anya/workspace/genome/hg38.fa', + help='Genome FASTA for sequences [Default: %default]') + snp_options.add_option('-o',dest='out_dir', + default='gtex', + help='Output directory for tables and plots [Default: %default]') + snp_options.add_option('--rc', dest='rc', + default=False, action='store_true', + help='Average forward and reverse complement predictions [Default: %default]') + snp_options.add_option('--shifts', dest='shifts', + default='0', type='str', + help='Ensemble prediction shifts [Default: %default]') + snp_options.add_option("--stitch", dest="stitch", + default=False, action="store_true", + help=" [Default: %default]") + snp_options.add_option('--stats', dest='snp_stats', + default='logSAD', + help='Comma-separated list of stats to save. [Default: %default]') + snp_options.add_option('-t', dest='targets_file', + default='/scratch3/drk/seqnn/data/v9/hg38/targets.txt', type='str', + help='File specifying target indexes and labels in table format') + snp_options.add_option("-u", dest="untransform_old", + default=False, action="store_true", + help="Untransform old models [Default: %default]") + parser.add_option_group(snp_options) + + # classify + class_options = OptionGroup(parser, 'westminster_classify.py options') + class_options.add_option('--cn', dest='class_name', + default=None, + help='Classifier name extension [Default: %default]') + class_options.add_option('--ct', dest='class_targets_file', + default=None, + help='Targets slice for the classifier stage [Default: %default]') + class_options.add_option('--msl', dest='msl', + default=1, type='int', + help='Random forest min_samples_leaf [Default: %default]') + parser.add_option_group(class_options) + + # cross-fold + fold_options = OptionGroup(parser, 'cross-fold options') + fold_options.add_option('--cross', dest='crosses', + default=1, type='int', + help='Number of cross-fold rounds [Default:%default]') + fold_options.add_option('-d', dest='data_head', + default=None, type='int', + help='Index for dataset/head [Default: %default]') + fold_options.add_option('-e', dest='conda_env', + default='tf12', + help='Anaconda environment [Default: %default]') + fold_options.add_option('--folds', dest='num_folds', + default=None, type='int', + help='Number of folds to evaluate [Default: %default]') + fold_options.add_option('-g', dest='gtex_vcf_dir', + default='/home/drk/seqnn/data/gtex_fine/susie_pip90r') + fold_options.add_option('--name', dest='name', + default='gtex', help='SLURM name prefix [Default: %default]') + fold_options.add_option('--max_proc', dest='max_proc', + default=None, type='int', + help='Maximum concurrent processes [Default: %default]') + fold_options.add_option('-p', dest='processes', + default=None, type='int', + help='Number of processes, passed by multi script. \ + (Unused, but needs to appear as dummy.)') + fold_options.add_option('-q', dest='queue', + default='geforce', + help='SLURM queue on which to run the jobs [Default: %default]') + parser.add_option_group(fold_options) + + (options, args) = parser.parse_args() + + if len(args) != 2: + parser.error('Must provide parameters file and cross-fold directory') + else: + params_file = args[0] + exp_dir = args[1] + + ####################################################### + # prep work + + # count folds + if options.num_folds is None: + options.num_folds = 0 + fold0_dir = '%s/f%dc0' % (exp_dir, options.num_folds) + model_file = '%s/train/model_best.h5' % fold0_dir + if options.data_head is not None: + model_file = '%s/train/model%d_best.h5' % (fold0_dir, options.data_head) + while os.path.isfile(model_file): + options.num_folds += 1 + fold0_dir = '%s/f%dc0' % (exp_dir, options.num_folds) + model_file = '%s/train/model_best.h5' % fold0_dir + if options.data_head is not None: + model_file = '%s/train/model%d_best.h5' % (fold0_dir, options.data_head) + print('Found %d folds' % options.num_folds) + if options.num_folds == 0: + exit(1) + + # extract output subdirectory name + gtex_out_dir = options.out_dir + + # split SNP stats + snp_stats = options.snp_stats.split(',') + + # merge study/tissue variants + mpos_vcf_file = '%s/pos_merge.vcf' % options.gtex_vcf_dir + mneg_vcf_file = '%s/neg_merge.vcf' % options.gtex_vcf_dir + + ################################################################ + # SAD + + # SAD command base + cmd_base = '. /home/drk/anaconda3/etc/profile.d/conda.sh;' + cmd_base += ' conda activate %s;' % options.conda_env + cmd_base += ' echo $HOSTNAME;' + + jobs = [] + run_hound = True + + for ci in range(options.crosses): + for fi in range(options.num_folds): + + if not run_hound: + continue + + it_dir = '%s/f%dc%d' % (exp_dir, fi, ci) + name = '%s-f%dc%d' % (options.name, fi, ci) + + # update output directory + it_out_dir = '%s/%s' % (it_dir, gtex_out_dir) + os.makedirs(it_out_dir, exist_ok=True) + + # choose model + model_file = '%s/train/model_best.h5' % it_dir + if options.data_head is not None: + model_file = '%s/train/model%d_best.h5' % (it_dir, options.data_head) + + ######################################## + # negative jobs + + # pickle options + options.out_dir = '%s/merge_neg' % it_out_dir + os.makedirs(options.out_dir, exist_ok=True) + options_pkl_file = '%s/options.pkl' % options.out_dir + options_pkl = open(options_pkl_file, 'wb') + pickle.dump(options, options_pkl) + options_pkl.close() + + # create base fold command + cmd_fold = '%s time hound_snp.py %s %s %s' % ( + cmd_base, options_pkl_file, params_file, model_file) + + for pi in range(options.processes): + sad_file = '%s/job%d/scores.h5' % (options.out_dir, pi) + if not nonzero_h5(sad_file, snp_stats): + cmd_job = '%s %s %d' % (cmd_fold, mneg_vcf_file, pi) + j = slurm.Job(cmd_job, '%s_neg%d' % (name,pi), + '%s/job%d.out' % (options.out_dir,pi), + '%s/job%d.err' % (options.out_dir,pi), + '%s/job%d.sb' % (options.out_dir,pi), + queue=options.queue, gpu=1, cpu=2, + mem=30000, time='7-0:0:0') + jobs.append(j) + + ######################################## + # positive jobs + + # pickle options + options.out_dir = '%s/merge_pos' % it_out_dir + os.makedirs(options.out_dir, exist_ok=True) + options_pkl_file = '%s/options.pkl' % options.out_dir + options_pkl = open(options_pkl_file, 'wb') + pickle.dump(options, options_pkl) + options_pkl.close() + + # create base fold command + cmd_fold = '%s time hound_snp.py %s %s %s' % ( + cmd_base, options_pkl_file, params_file, model_file) + + for pi in range(options.processes): + sad_file = '%s/job%d/scores.h5' % (options.out_dir, pi) + if not nonzero_h5(sad_file, snp_stats): + cmd_job = '%s %s %d' % (cmd_fold, mpos_vcf_file, pi) + j = slurm.Job(cmd_job, '%s_pos%d' % (name,pi), + '%s/job%d.out' % (options.out_dir,pi), + '%s/job%d.err' % (options.out_dir,pi), + '%s/job%d.sb' % (options.out_dir,pi), + queue=options.queue, gpu=1, cpu=2, + mem=30000, time='7-0:0:0') + jobs.append(j) + + if run_hound: + slurm.multi_run(jobs, max_proc=options.max_proc, verbose=True, + launch_sleep=10, update_sleep=60) + + ####################################################### + # collect output + + for ci in range(options.crosses): + for fi in range(options.num_folds): + it_out_dir = '%s/f%dc%d/%s' % (exp_dir, fi, ci, gtex_out_dir) + + # collect negatives + neg_out_dir = '%s/merge_neg' % it_out_dir + if not os.path.isfile('%s/scores.h5' % neg_out_dir): + collect_scores(neg_out_dir, options.processes) + + # collect positives + pos_out_dir = '%s/merge_pos' % it_out_dir + if not os.path.isfile('%s/scores.h5' % pos_out_dir): + collect_scores(pos_out_dir, options.processes) + + ################################################################ + # ensemble + + ensemble_dir = '%s/ensemble' % exp_dir + if not os.path.isdir(ensemble_dir): + os.mkdir(ensemble_dir) + + gtex_dir = '%s/%s' % (ensemble_dir, gtex_out_dir) + if not os.path.isdir(gtex_dir): + os.mkdir(gtex_dir) + + gtex_pos_vcf = '%s/merge_pos.vcf' % options.gtex_vcf_dir + gtex_neg_vcf = gtex_pos_vcf.replace('_pos.','_neg.') + pos_base = os.path.splitext(os.path.split(gtex_pos_vcf)[1])[0] + neg_base = os.path.splitext(os.path.split(gtex_neg_vcf)[1])[0] + + # collect SAD files + sad_pos_files = [] + sad_neg_files = [] + for ci in range(options.crosses): + for fi in range(options.num_folds): + it_dir = '%s/f%dc%d' % (exp_dir, fi, ci) + it_out_dir = '%s/%s' % (it_dir, gtex_out_dir) + + sad_pos_file = '%s/%s/scores.h5' % (it_out_dir, pos_base) + sad_pos_files.append(sad_pos_file) + + sad_neg_file = '%s/%s/scores.h5' % (it_out_dir, neg_base) + sad_neg_files.append(sad_neg_file) + + # ensemble + ens_pos_dir = '%s/%s' % (gtex_dir, pos_base) + os.makedirs(ens_pos_dir, exist_ok=True) + ens_pos_file = '%s/scores.h5' % (ens_pos_dir) + if not os.path.isfile(ens_pos_file): + ensemble_sad_h5(ens_pos_file, sad_pos_files) + + ens_neg_dir = '%s/%s' % (gtex_dir, neg_base) + os.makedirs(ens_neg_dir, exist_ok=True) + ens_neg_file = '%s/scores.h5' % (ens_neg_dir) + if not os.path.isfile(ens_neg_file): + ensemble_sad_h5(ens_neg_file, sad_neg_files) + + + ################################################################ + # fit classifiers + + cmd_base = 'westminster_classify.py -i 100 -r 44 -s' + cmd_base += ' --msl %d' % options.msl + + if options.class_targets_file is not None: + cmd_base += ' -t %s' % options.class_targets_file + + jobs = [] + for ci in range(options.crosses): + for fi in range(options.num_folds): + it_dir = '%s/f%dc%d' % (exp_dir, fi, ci) + it_out_dir = '%s/%s' % (it_dir, gtex_out_dir) + + tissue = "merge" + sad_pos = '%s/%s_pos/scores.h5' % (it_out_dir, tissue) + sad_neg = '%s/%s_neg/scores.h5' % (it_out_dir, tissue) + for snp_stat in snp_stats: + class_out_dir = '%s/%s_class-%s' % (it_out_dir, tissue, snp_stat) + if options.class_name is not None: + class_out_dir += '-%s' % options.class_name + if not os.path.isfile('%s/stats.txt' % class_out_dir): + cmd_class = '%s -o %s --stat %s' % (cmd_base, class_out_dir, snp_stat) + cmd_class += ' %s %s' % (sad_pos, sad_neg) + j = slurm.Job(cmd_class, tissue, + '%s.out'%class_out_dir, '%s.err'%class_out_dir, + queue='standard', cpu=2, + mem=22000, time='1-0:0:0') + jobs.append(j) + + # ensemble + tissue = "merge" + sad_pos = '%s/%s_pos/scores.h5' % (gtex_dir, tissue) + sad_neg = '%s/%s_neg/scores.h5' % (gtex_dir, tissue) + for snp_stat in snp_stats: + class_out_dir = '%s/%s_class-%s' % (gtex_dir, tissue, snp_stat) + if options.class_name is not None: + class_out_dir += '-%s' % options.class_name + if not os.path.isfile('%s/stats.txt' % class_out_dir): + cmd_class = '%s -o %s --stat %s' % (cmd_base, class_out_dir, snp_stat) + cmd_class += ' %s %s' % (sad_pos, sad_neg) + j = slurm.Job(cmd_class, tissue, + '%s.out'%class_out_dir, '%s.err'%class_out_dir, + queue='standard', cpu=2, + mem=22000, time='1-0:0:0') + jobs.append(j) + + slurm.multi_run(jobs, verbose=True) + + +def ensemble_sad_h5(ensemble_h5_file: str, scores_files): + """Ensemble SAD scores from multiple files into a single file. + + Args: + ensemble_h5_file (str): ensemble score HDF5. + scores_files ([str]): list of replicate score HDFs. + """ + # open ensemble + ensemble_h5 = h5py.File(ensemble_h5_file, 'w') + + # transfer base + base_keys = ['alt_allele','chr','pos','ref_allele','snp','target_ids','target_labels'] + snp_stats = [] + sad_shapes = [] + scores0_h5 = h5py.File(scores_files[0], 'r') + for key in scores0_h5.keys(): + if key in base_keys: + ensemble_h5.create_dataset(key, data=scores0_h5[key]) + else: + snp_stats.append(key) + sad_shapes.append(scores0_h5[key].shape) + scores0_h5.close() + + # average stats + num_folds = len(scores_files) + for si, snp_stat in enumerate(snp_stats): + # initialize ensemble array + sad_values = np.zeros(shape=sad_shapes[si], dtype='float32') + + # read and add folds + for scores_file in scores_files: + with h5py.File(scores_file, 'r') as scores_h5: + sad_values += scores_h5[snp_stat][:].astype('float32') + + # normalize and downcast + sad_values /= num_folds + sad_values = sad_values.astype('float16') + + # save + ensemble_h5.create_dataset(snp_stat, data=sad_values) + + ensemble_h5.close() + + +def split_sad(it_out_dir: str, posneg: str, vcf_dir: str, snp_stats): + """Split merged VCF predictions in HDF5 into tissue-specific + predictions in HDF5. + + Args: + it_out_dir (str): output directory for iteration. + posneg (str): 'pos' or 'neg'. + vcf_dir (str): directory containing tissue-specific VCFs. + snp_stats ([str]]): list of SAD stats. + """ + merge_h5_file = '%s/merge_%s/scores.h5' % (it_out_dir, posneg) + merge_h5 = h5py.File(merge_h5_file, 'r') + + # read merged data + snps = [snp.decode('UTF-8') for snp in merge_h5['snp']] + merge_scores = {} + for ss in snp_stats: + merge_scores[ss] = merge_h5[ss][:] + + # hash snp indexes + snp_si = dict(zip(snps, np.arange(len(snps)))) + + # for each tissue VCF + vcf_glob = '%s/*_%s.vcf' % (vcf_dir, posneg) + for tissue_vcf_file in glob.glob(vcf_glob): + tissue_label = tissue_vcf_file.split('/')[-1] + tissue_label = tissue_label.replace('_pos.vcf','') + tissue_label = tissue_label.replace('_neg.vcf','') + + # initialize HDF5 arrays + sad_snp = [] + sad_chr = [] + sad_pos = [] + sad_ref = [] + sad_alt = [] + sad_scores = {} + for ss in snp_stats: + sad_scores[ss] = [] + + # fill HDF5 arrays with ordered SNPs + for line in open(tissue_vcf_file): + if not line.startswith('#'): + a = line.split() + chrm, pos, snp, ref, alt = a[:5] + sad_snp.append(snp) + sad_chr.append(chrm) + sad_pos.append(int(pos)) + sad_ref.append(ref) + sad_alt.append(alt) + + for ss in snp_stats: + si = snp_si[snp] + sad_scores[ss].append(merge_scores[ss][si]) + + # write tissue HDF5 + tissue_dir = '%s/%s_%s' % (it_out_dir, tissue_label, posneg) + os.makedirs(tissue_dir, exist_ok=True) + with h5py.File('%s/scores.h5' % tissue_dir, 'w') as tissue_h5: + # write SNPs + tissue_h5.create_dataset('snp', + data=np.array(sad_snp, 'S')) + + # write SNP chr + tissue_h5.create_dataset('chr', + data=np.array(sad_chr, 'S')) + + # write SNP pos + tissue_h5.create_dataset('pos', + data=np.array(sad_pos, dtype='uint32')) + + # write ref allele + tissue_h5.create_dataset('ref_allele', + data=np.array(sad_ref, dtype='S')) + + # write alt allele + tissue_h5.create_dataset('alt_allele', + data=np.array(sad_alt, dtype='S')) + + # write targets + tissue_h5.create_dataset('target_ids', data=merge_h5['target_ids']) + tissue_h5.create_dataset('target_labels', data=merge_h5['target_labels']) + + # write sed stats + for ss in snp_stats: + tissue_h5.create_dataset(ss, + data=np.array(sad_scores[ss], dtype='float16')) + + merge_h5.close() + +################################################################################ +# __main__ +################################################################################ +if __name__ == '__main__': + main() diff --git a/scripts/westminster_gtex_folds.py b/scripts/westminster_gtex_folds.py index 9cc302c..0e03d79 100755 --- a/scripts/westminster_gtex_folds.py +++ b/scripts/westminster_gtex_folds.py @@ -56,6 +56,9 @@ def main(): snp_options.add_option('--shifts', dest='shifts', default='0', type='str', help='Ensemble prediction shifts [Default: %default]') + snp_options.add_option("--stitch", dest="stitch", + default=False, action="store_true", + help=" [Default: %default]") snp_options.add_option('--stats', dest='snp_stats', default='logSAD', help='Comma-separated list of stats to save. [Default: %default]') diff --git a/scripts/westminster_gtexg_folds.py b/scripts/westminster_gtexg_folds.py index 269b305..194b750 100755 --- a/scripts/westminster_gtexg_folds.py +++ b/scripts/westminster_gtexg_folds.py @@ -59,6 +59,9 @@ def main(): sed_options.add_option('--shifts', dest='shifts', default='0', type='str', help='Ensemble prediction shifts [Default: %default]') + snp_options.add_option("--stitch", dest="stitch", + default=False, action="store_true", + help=" [Default: %default]") sed_options.add_option('--span', dest='span', default=False, action='store_true', help='Aggregate entire gene span [Default: %default]')