Skip to content

Commit

Permalink
outfactored subtypes workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
weiju committed Aug 30, 2024
1 parent c4260d5 commit 318568e
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 76 deletions.
79 changes: 3 additions & 76 deletions bin/miner3-subtypes
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ import os
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import logging

from miner import miner, util
from miner import GIT_SHA, __version__ as pkg_version

import logging
from miner import subtypes


DESCRIPTION = """miner3-subtypes - MINER compute sample subtypes
Expand Down Expand Up @@ -53,80 +53,7 @@ if __name__ == '__main__':

LOGGER.info('load and setup data')
exp_data, conv_table = miner.preprocess(args.expfile, args.mapfile, do_preprocess_tpm=(not args.skip_tpm))
bkgd = miner.backgroundDf(exp_data)

with open(args.regulons) as infile:
regulon_modules = json.load(infile)

overexpressed_members = miner.biclusterMembershipDictionary(regulon_modules, bkgd, label=2,
p=0.05)
overexpressed_members_matrix = miner.membershipToIncidence(overexpressed_members,
exp_data)
underexpressed_members = miner.biclusterMembershipDictionary(regulon_modules, bkgd, label=0,
p=0.05)
underexpressed_members_matrix = miner.membershipToIncidence(underexpressed_members,
exp_data)

sample_dictionary = overexpressed_members
sample_matrix = overexpressed_members_matrix

# Matt's new way of subtype discovery (START)
# TODO: fold this into the code above
reference_matrix = overexpressed_members_matrix - underexpressed_members_matrix
primary_matrix = overexpressed_members_matrix
primary_dictionary = overexpressed_members
secondary_matrix = underexpressed_members_matrix
secondary_dictionary = underexpressed_members

states, centroid_clusters = miner.inferSubtypes(reference_matrix, primary_matrix,
secondary_matrix,
primary_dictionary,
secondary_dictionary,
minClusterSize=int(np.ceil(0.01*exp_data.shape[1])),restricted_index=None)
states_dictionary = {str(i):states[i] for i in range(len(states))}
with open(os.path.join(args.outdir, "transcriptional_states.json"), 'w') as outfile:
json.dump(states_dictionary, outfile)

eigengenes = miner.getEigengenes(regulon_modules, exp_data,
regulon_dict=None,saveFolder=None)
eigenScale = np.percentile(exp_data, 95) / np.percentile(eigengenes, 95)
eigengenes = eigenScale * eigengenes
eigengenes.index = np.array(eigengenes.index).astype(str)
eigengenes.to_csv(os.path.join(args.outdir, "eigengenes.csv"))
reference_df = eigengenes.copy()
programs, _ = miner.mosaic(dfr=reference_df, clusterList=centroid_clusters,
minClusterSize_x=int(np.ceil(0.01*exp_data.shape[1])),
minClusterSize_y=5,
allow_singletons=False,
max_groups=50,
saveFile=os.path.join(args.outdir,"regulon_activity_heatmap.pdf"),
random_state=12)
transcriptional_programs, program_regulons = miner.transcriptionalPrograms(programs,
regulon_modules)
program_list = [program_regulons[("").join(["TP",str(i)])] for i in range(len(program_regulons))]
programs_dictionary = {str(i):program_list[i] for i in range(len(program_list))}

with open(os.path.join(args.outdir, "transcriptional_programs.json"), 'w') as outfile:
json.dump(programs_dictionary, outfile)

mosaic_df = reference_df.loc[np.hstack(program_list), np.hstack(states)]
mosaic_df.to_csv(os.path.join(args.outdir, "regulons_activity_heatmap.csv"))

dfr = overexpressed_members_matrix - underexpressed_members_matrix
mtrx = dfr.loc[np.hstack(program_list),np.hstack(states)]
plt.figure(figsize=(8,8))
plt.imshow(mtrx, cmap="bwr", vmin=-1, vmax=1, aspect=float(mtrx.shape[1]) / float(mtrx.shape[0]))
plt.grid(False)
plt.savefig(os.path.join(args.outdir, "mosaic_all.pdf"), bbox_inches="tight")

# Determine activity of transcriptional programs in each sample
states_df = miner.reduceModules(df=dfr.loc[np.hstack(program_list),
np.hstack(states)],programs=program_list,
states=states, stateThreshold=0.50,
saveFile=os.path.join(args.outdir, "transcriptional_programs.pdf"))

# Cluster patients into subtypes and give the activity of each program in each subtype
programsVsStates = miner.programsVsStates(states_df, states,
filename=os.path.join(args.outdir, "programs_vs_states.pdf"),
csvpath=os.path.join(args.outdir, "programs_vs_states.csv"),
showplot=True)
subtypes.subtypes(exp_data, regulon_modules, args.outdir)
84 changes: 84 additions & 0 deletions miner/subtypes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""subtypes.py - subtypes computation module"""

import os
import json
import numpy as np
import matplotlib.pyplot as plt

from miner import miner

def subtypes(exp_data, regulon_modules, outdir):
bkgd = miner.background_df(exp_data)

overexpressed_members = miner.biclusterMembershipDictionary(regulon_modules, bkgd, label=2,
p=0.05)
overexpressed_members_matrix = miner.membershipToIncidence(overexpressed_members,
exp_data)
underexpressed_members = miner.biclusterMembershipDictionary(regulon_modules, bkgd, label=0,
p=0.05)
underexpressed_members_matrix = miner.membershipToIncidence(underexpressed_members,
exp_data)

sample_dictionary = overexpressed_members
sample_matrix = overexpressed_members_matrix

# Matt's new way of subtype discovery (START)
# TODO: fold this into the code above
reference_matrix = overexpressed_members_matrix - underexpressed_members_matrix
primary_matrix = overexpressed_members_matrix
primary_dictionary = overexpressed_members
secondary_matrix = underexpressed_members_matrix
secondary_dictionary = underexpressed_members

states, centroid_clusters = miner.inferSubtypes(reference_matrix, primary_matrix,
secondary_matrix,
primary_dictionary,
secondary_dictionary,
minClusterSize=int(np.ceil(0.01*exp_data.shape[1])),restricted_index=None)
states_dictionary = {str(i):states[i] for i in range(len(states))}
with open(os.path.join(outdir, "transcriptional_states.json"), 'w') as outfile:
json.dump(states_dictionary, outfile)

eigengenes = miner.getEigengenes(regulon_modules, exp_data,
regulon_dict=None,saveFolder=None)
eigenScale = np.percentile(exp_data, 95) / np.percentile(eigengenes, 95)
eigengenes = eigenScale * eigengenes
eigengenes.index = np.array(eigengenes.index).astype(str)
eigengenes.to_csv(os.path.join(outdir, "eigengenes.csv"))
reference_df = eigengenes.copy()
programs, _ = miner.mosaic(dfr=reference_df, clusterList=centroid_clusters,
minClusterSize_x=int(np.ceil(0.01*exp_data.shape[1])),
minClusterSize_y=5,
allow_singletons=False,
max_groups=50,
saveFile=os.path.join(outdir,"regulon_activity_heatmap.pdf"),
random_state=12)
transcriptional_programs, program_regulons = miner.transcriptionalPrograms(programs,
regulon_modules)
program_list = [program_regulons[("").join(["TP",str(i)])] for i in range(len(program_regulons))]
programs_dictionary = {str(i):program_list[i] for i in range(len(program_list))}

with open(os.path.join(outdir, "transcriptional_programs.json"), 'w') as outfile:
json.dump(programs_dictionary, outfile)

mosaic_df = reference_df.loc[np.hstack(program_list), np.hstack(states)]
mosaic_df.to_csv(os.path.join(outdir, "regulons_activity_heatmap.csv"))

dfr = overexpressed_members_matrix - underexpressed_members_matrix
mtrx = dfr.loc[np.hstack(program_list),np.hstack(states)]
plt.figure(figsize=(8,8))
plt.imshow(mtrx, cmap="bwr", vmin=-1, vmax=1, aspect=float(mtrx.shape[1]) / float(mtrx.shape[0]))
plt.grid(False)
plt.savefig(os.path.join(outdir, "mosaic_all.pdf"), bbox_inches="tight")

# Determine activity of transcriptional programs in each sample
states_df = miner.reduceModules(df=dfr.loc[np.hstack(program_list),
np.hstack(states)],programs=program_list,
states=states, stateThreshold=0.50,
saveFile=os.path.join(outdir, "transcriptional_programs.pdf"))

# Cluster patients into subtypes and give the activity of each program in each subtype
programsVsStates = miner.programsVsStates(states_df, states,
filename=os.path.join(outdir, "programs_vs_states.pdf"),
csvpath=os.path.join(outdir, "programs_vs_states.csv"),
showplot=True)

0 comments on commit 318568e

Please sign in to comment.