outfactored mechanistic inference to its own module

baliga-lab · Aug 30, 2024 · f99a291 · f99a291
1 parent e06b8d6
commit f99a291
Show file tree

Hide file tree

Showing 2 changed files with 115 additions and 95 deletions.
diff --git a/bin/miner3-mechinf b/bin/miner3-mechinf
@@ -15,15 +15,13 @@ from pkg_resources import Requirement, resource_filename
 
 from miner import miner, util, GIT_SHA
 from miner import __version__ as MINER_VERSION
+from miner import mechinf
 
 
 DESCRIPTION = """miner3-mechinf - MINER compute mechanistic inference
 MINER Version %s (Git SHA %s)""" % (str(MINER_VERSION).replace('miner3 ', ''),
                                     GIT_SHA.replace('$Id: ', '').replace(' $', ''))
 
-NUM_CORES = 5
-MIN_REGULON_GENES = 5
-
 if __name__ == '__main__':
     logging.getLogger('matplotlib.font_manager').disabled = True
     LOG_FORMAT = '%(asctime)s %(message)s'
@@ -67,8 +65,6 @@ if __name__ == '__main__':
     with open(args.coexprdict) as infile:
         revised_clusters = json.load(infile)
 
-    # get first principal component axes of clusters
-    t1 = time.time()
     if args.tfs2genes is not None:
         database_path = args.tfs2genes
     else:
@@ -79,93 +75,8 @@ if __name__ == '__main__':
             # running from source
             database_path = os.path.join('miner', 'data', 'network_dictionaries', 'tfbsdb_tf_to_genes.pkl')
 
-    axes = miner.principal_df(revised_clusters, exp_data,
-                              subkey=None, minNumberGenes=1)
-
-    # analyze revised clusters for enrichment in relational database
-    # (default: transcription factor binding site database)
-    mechanistic_output = miner.mechanisticInference(axes, revised_clusters, exp_data,
-                                                    correlationThreshold=args.mincorr,
-                                                    numCores=NUM_CORES,
-                                                    database_path=database_path)
-
-    # write mechanistic output to .json file
-    with open(os.path.join(args.outdir, "mechanisticOutput.json"), 'w') as outfile:
-        json.dump(mechanistic_output, outfile)
-
-    # order mechanisticOutput as {tf:{coexpressionModule:genes}}
-    coregulation_modules = miner.getCoregulationModules(mechanistic_output)
-
-    # write coregulation modules to .json file
-    with open(os.path.join(args.outdir, "coregulationModules.json"), 'w') as outfile:
-        json.dump(coregulation_modules, outfile)
-
-    # get final regulons by keeping genes that requently appear coexpressed and associated
-    # to a common regulator
-    regulons = miner.getRegulons(coregulation_modules,
-                                 minNumberGenes=MIN_REGULON_GENES,
-                                 freqThreshold=0.333)
-
-    # reformat regulon dictionary for consistency with revisedClusters and coexpressionModules
-    regulon_modules, regulon_df = miner.regulonDictionary(regulons)
-
-    # FIRM export: note that we do not check whether we have RefSeq or Entrez, maybe this should be
-    # checked in the glue
-    entrez_map = miner.make_entrez_map(args.mapfile)
-    with open(os.path.join(args.outdir, args.firmout), 'w') as outfile:
-        outfile.write('Gene\tGroup\n')
-        for regulon, genes in regulon_modules.items():
-            for gene in genes:
-                if gene in entrez_map:
-                    outfile.write('%s\t%s\n' % (entrez_map[gene], regulon))
-
-    # OpenTargets export
-    with open(os.path.join(args.outdir, args.genelist), 'w') as outfile:
-        all_genes = set()
-        for regulon, genes in regulon_modules.items():
-            all_genes.update(genes)
-        for gene in sorted(all_genes):
-            outfile.write('%s\n' % gene)
-
-    # write regulons to json file
-    with open(os.path.join(args.outdir, "regulons.json"), 'w') as outfile:
-        json.dump(regulon_modules, outfile)
-    regulon_df.to_csv(os.path.join(args.outdir, "regulonDf.csv"))
-
-    # define coexpression modules as composite of coexpressed regulons
-    coexpression_modules = miner.getCoexpressionModules(mechanistic_output)
-
-    # write coexpression modules to .json file
-    with open(os.path.join(args.outdir, "coexpressionModules.json"), 'w') as outfile:
-        json.dump(coexpression_modules, outfile)
-
-    # write annotated coexpression clusters to .json file
-    with open(os.path.join(args.outdir, "coexpressionDictionary_annotated.json"), 'w') as outfile:
-        json.dump(revised_clusters, outfile)
-
-    """
-    # write annotated regulons to .json file
-    with open(os.path.join(args.outdir, "regulons_annotated.json"), 'w') as outfile:
-        json.dump(regulons, outfile)
-
-    # reconvert coexpression modules
-    annotated_coexpression_modules = mechanistic_inference.convert_dictionary(coexpression_modules, conv_table)
-
-    # write annotated coexpression modules to .json file
-    with open(os.path.join(args.outdir, "coexpressionModules_annotated.json"), 'w') as outfile:
-        json.dump(annotated_coexpression_modules, outfile)"""
-
-
-    # Get eigengenes for all modules
-    eigengenes = miner.getEigengenes(regulon_modules, exp_data, regulon_dict=None,
-                                     saveFolder=None)
-    eigen_scale = np.percentile(exp_data, 95) / np.percentile(eigengenes, 95)
-    eigengenes = eigen_scale * eigengenes
-    eigengenes.index = np.array(eigengenes.index).astype(str)
-
-    # write eigengenes to .csv
-    eigengenes.to_csv(os.path.join(args.outdir, "eigengenes.csv"))
-
-    t2 = time.time()
-    logging.info("Completed mechanistic inference in {:.2f} minutes".format((t2 - t1) / 60.))
-    logging.info("Inferred network with {:d} regulons, {:d} regulators, and {:d} co-regulated genes".format(len(regulon_df.Regulon_ID.unique()), len(regulon_df.Regulator.unique()),len(regulon_df.Gene.unique())))
+    mechinf.mechanistic_inference(exp_data, args.mapfile, revised_clusters,
+                                  database_path,
+                                  args.outdir,
+                                  args.mincorr, args.skip_tpm, args.firmout,
+                                  args.genelist)
diff --git a/miner/mechinf.py b/miner/mechinf.py
@@ -0,0 +1,109 @@
+"""mechinf.py - Mechanistic inference module"""
+
+import os
+import json
+import logging
+import time
+import numpy as np
+
+from miner import miner, util
+
+NUM_CORES = 5
+MIN_REGULON_GENES = 5
+
+
+def mechanistic_inference(exp_data, mapfile, revised_clusters, database_path,
+                          outdir, mincorr, skip_tpm, firmout, genelist):
+    # get first principal component axes of clusters
+    t1 = time.time()
+
+    axes = miner.principal_df(revised_clusters, exp_data,
+                              subkey=None, minNumberGenes=1)
+
+    # analyze revised clusters for enrichment in relational database
+    # (default: transcription factor binding site database)
+    mechanistic_output = miner.mechanisticInference(axes, revised_clusters, exp_data,
+                                                    correlationThreshold=mincorr,
+                                                    numCores=NUM_CORES,
+                                                    database_path=database_path)
+
+    # write mechanistic output to .json file
+    with open(os.path.join(outdir, "mechanisticOutput.json"), 'w') as outfile:
+        json.dump(mechanistic_output, outfile)
+
+    # order mechanisticOutput as {tf:{coexpressionModule:genes}}
+    coregulation_modules = miner.getCoregulationModules(mechanistic_output)
+
+    # write coregulation modules to .json file
+    with open(os.path.join(outdir, "coregulationModules.json"), 'w') as outfile:
+        json.dump(coregulation_modules, outfile)
+
+    # get final regulons by keeping genes that requently appear coexpressed and associated
+    # to a common regulator
+    regulons = miner.getRegulons(coregulation_modules,
+                                 minNumberGenes=MIN_REGULON_GENES,
+                                 freqThreshold=0.333)
+
+    # reformat regulon dictionary for consistency with revisedClusters and coexpressionModules
+    regulon_modules, regulon_df = miner.regulonDictionary(regulons)
+
+    # FIRM export: note that we do not check whether we have RefSeq or Entrez, maybe this should be
+    # checked in the glue
+    entrez_map = miner.make_entrez_map(mapfile)
+    with open(os.path.join(outdir, firmout), 'w') as outfile:
+        outfile.write('Gene\tGroup\n')
+        for regulon, genes in regulon_modules.items():
+            for gene in genes:
+                if gene in entrez_map:
+                    outfile.write('%s\t%s\n' % (entrez_map[gene], regulon))
+
+    # OpenTargets export
+    with open(os.path.join(outdir, genelist), 'w') as outfile:
+        all_genes = set()
+        for regulon, genes in regulon_modules.items():
+            all_genes.update(genes)
+        for gene in sorted(all_genes):
+            outfile.write('%s\n' % gene)
+
+    # write regulons to json file
+    with open(os.path.join(outdir, "regulons.json"), 'w') as outfile:
+        json.dump(regulon_modules, outfile)
+    regulon_df.to_csv(os.path.join(outdir, "regulonDf.csv"))
+
+    # define coexpression modules as composite of coexpressed regulons
+    coexpression_modules = miner.getCoexpressionModules(mechanistic_output)
+
+    # write coexpression modules to .json file
+    with open(os.path.join(outdir, "coexpressionModules.json"), 'w') as outfile:
+        json.dump(coexpression_modules, outfile)
+
+    # write annotated coexpression clusters to .json file
+    with open(os.path.join(outdir, "coexpressionDictionary_annotated.json"), 'w') as outfile:
+        json.dump(revised_clusters, outfile)
+
+    """
+    # write annotated regulons to .json file
+    with open(os.path.join(outdir, "regulons_annotated.json"), 'w') as outfile:
+        json.dump(regulons, outfile)
+
+    # reconvert coexpression modules
+    annotated_coexpression_modules = mechanistic_inference.convert_dictionary(coexpression_modules, conv_table)
+
+    # write annotated coexpression modules to .json file
+    with open(os.path.join(outdir, "coexpressionModules_annotated.json"), 'w') as outfile:
+        json.dump(annotated_coexpression_modules, outfile)"""
+
+
+    # Get eigengenes for all modules
+    eigengenes = miner.getEigengenes(regulon_modules, exp_data, regulon_dict=None,
+                                     saveFolder=None)
+    eigen_scale = np.percentile(exp_data, 95) / np.percentile(eigengenes, 95)
+    eigengenes = eigen_scale * eigengenes
+    eigengenes.index = np.array(eigengenes.index).astype(str)
+
+    # write eigengenes to .csv
+    eigengenes.to_csv(os.path.join(outdir, "eigengenes.csv"))
+
+    t2 = time.time()
+    logging.info("Completed mechanistic inference in {:.2f} minutes".format((t2 - t1) / 60.))
+    logging.info("Inferred network with {:d} regulons, {:d} regulators, and {:d} co-regulated genes".format(len(regulon_df.Regulon_ID.unique()), len(regulon_df.Regulator.unique()),len(regulon_df.Gene.unique())))