snakefile update
grexor committed Dec 16, 2024
1 parent b4048c7 commit d398ed4
Showing 1 changed file with 171 additions and 93 deletions.
264 changes: 171 additions & 93 deletions Snakefile
@@ -1,5 +1,6 @@
import os
import pandas as pd
import splicekit

container: "docker://ghcr.io/bedapub/splicekit:main"
available_threads = workflow.cores
@@ -17,90 +18,12 @@ if os.path.exists("splicekit.config"):

genome_version = f"--genome_version {genome_version}" if genome_version else ""

def make_short_names(text):
result = text
try:
short_names
except:
return result
for from_text, to_text, replace_type in short_names:
if replace_type=="complete":
if text==from_text:
result = to_text
else:
result = text.replace(from_text, to_text)
return result
if not os.path.exists("annotation/comparisons.tab"):
splicekit.annotation()

def make_comparisons():
comparisons = []
comparisons_ids = []
treatments = {}
samples = set()
f = open("samples.tab")
r = f.readline()
while r.startswith("#"):
r = f.readline()
header = r.replace("\r", "").replace("\n", "").split("\t")
r = f.readline()
separates = set()
while r:
if r.startswith("#"):
r = f.readline()
continue
r = r.replace("\r", "").replace("\n", "").split("\t")
data = dict(zip(header, r))
sample_id = data[sample_column]
samples.add(sample_id)
treatment_id = data[treatment_column]
separates.add(data.get(separate_column, ""))
treatments.setdefault(treatment_id, []).append((sample_id, treatment_id, data.get(separate_column, ""), data.get(group_column, "")))
r = f.readline()
f.close()
for treatment, data in treatments.items(): # sort by sample ID
try:
treatments[treatment] = sort_readout_id(data)
except:
pass
dmso_hash = {}
dmso_letter = "A"
separates = sorted(separates) # sometimes the separates set was reshuffled, which was very difficult to pinpoint/debug
for sep in separates:
for test_sample, test_data in treatments.items():
if test_sample == control_name:
continue
temp_test = [(a,b,c,d) for (a,b,c,d) in test_data if c==sep]
group_test = set([d for (a,b,c,d) in temp_test])
if len(temp_test)==0:
continue
for control_sample, control_data in treatments.items():
if control_sample != control_name:
continue
temp_control = [(a,b,c,d) for (a,b,c,d) in control_data if c==sep]
if group_column!="":
temp_control = [(a,b,c,d) for (a,b,c,d) in temp_control if d in group_test]
if len(temp_control)==0:
continue
# add comparison to the list
# also add a unique control_group_id name, not only with _sep
# this is then useful for the contrast and design matrix for DGE analysis
if dmso_hash.get(tuple(temp_control), None)==None:
dmso_hash[tuple(temp_control)] = dmso_letter
dmso_letter = chr(ord(dmso_letter) + 1)
sep_temp = sep.replace(" ", "_").lower()
if sep_temp!="":
test_group_id = f"{test_sample}_{sep_temp}"
control_group_id = f"{control_sample}_{dmso_hash[tuple(temp_control)]}_{sep_temp}"
comparison_name = f"{test_sample}_{control_sample}{dmso_hash[tuple(temp_control)]}_{sep_temp}"
else:
test_group_id = f"{test_sample}"
control_group_id = f"{control_sample}{dmso_hash[tuple(temp_control)]}"
comparison_name = f"{test_sample}_{control_sample}{dmso_hash[tuple(temp_control)]}"
comparisons.append((make_short_names(comparison_name), temp_control, temp_test, make_short_names(control_group_id), make_short_names(test_group_id)))
comparisons_ids.append(make_short_names(comparison_name))
return comparisons, comparisons_ids
comparisons_df = pd.read_csv("annotation/comparisons.tab", sep="\t", comment="#")
COMPARISONS = comparisons_df["comparison"].tolist()

comparisons, comparisons_ids = make_comparisons()
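For reference, a minimal standalone sketch of the new comparison-loading path introduced above (editorial example, not part of the commit); the only assumption is that annotation/comparisons.tab is tab-separated with a "comparison" column, which splicekit.annotation() is expected to produce:

import os
import pandas as pd
import splicekit

# build the annotation tables (expected to include comparisons.tab) on first run
if not os.path.exists("annotation/comparisons.tab"):
    splicekit.annotation()

# the comparison names later drive the {comparison} wildcard of the edgeR rules
comparisons_df = pd.read_csv("annotation/comparisons.tab", sep="\t", comment="#")
COMPARISONS = comparisons_df["comparison"].tolist()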

rule all:
input:
# annotation
@@ -139,8 +62,8 @@ rule all:
"data/samples_genes_counts.tab.gz",

# edgeR
"results/edgeR/junctions_results_complete.tab.gz",
"results/edgeR/junctions_results_fdr005.tab.gz"
expand("results/edgeR/{feature_type}_results_complete.tab.gz", feature_type=["junctions", "exons", "donor_anchors", "acceptor_anchors", "genes"]),
expand("results/edgeR/{feature_type}_results_fdr005.tab.gz", feature_type=["junctions", "exons", "donor_anchors", "acceptor_anchors", "genes"])

rule setup:
input:
@@ -358,19 +281,174 @@ rule genes_count:

rule edgeR:
input:
"annotation/comparison_{comparison}.tab"
"data/samples_{feature_type}_counts.tab.gz"
output:
"results/edgeR/junctions/{comparison}_altsplice.tab.gz",
"results/edgeR/junctions/{comparison}_difffeature.tab.gz"
"results/edgeR/{feature_type}/{comparison}_altsplice.tab.gz",
"results/edgeR/{feature_type}/{comparison}_difffeature.tab.gz",
wildcard_constraints:
feature_type="junctions"
run:
print("ok1")
job_sh_edgeR="""R --no-save --args {input_folder} {atype} {control_name} {test_name} {comparison_name} {sample_membership} {filter_low} < {core_path}/comps_edgeR.R"""
comparison = comps[wildcards.comparison]
comparison_name, control_set, test_set, control_group_id, test_group_id = comps[wildcards.comparison]
control_name = control_group_id
test_name = test_group_id
control_ids = []
test_ids = []
for (sample_id, compound, rep, _) in control_set:
control_ids.append(sample_id)
for (sample_id, compound, rep, _) in test_set:
test_ids.append(sample_id)
try:
control_ids = sort_readout_id(control_ids)
except:
pass
try:
test_ids = sort_readout_id(test_ids)
except:
pass

try:
filter_low
except:
filter_low = "filter_low"

job_sh_junctions = job_sh_edgeR.format(filter_low=filter_low, core_path=os.path.dirname(splicekit.core.__file__), comparison_name=comparison_name, input_folder=os.getcwd(), atype=wildcards.feature_type, control_name=control_name, test_name=test_name, sample_membership=",".join(str(el) for el in sample_membership))
if workflow.use_singularity:
job_sh_junctions = f"{container} {job_sh_junctions}"
shell(job_sh_junctions)
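To make the run block above concrete, this editorial sketch shows the command it assembles for one hypothetical comparison; only the template string is taken from the rule, every value below is made up:

# hypothetical values standing in for the wildcards and config used by the rule above
job_sh_edgeR = "R --no-save --args {input_folder} {atype} {control_name} {test_name} {comparison_name} {sample_membership} {filter_low} < {core_path}/comps_edgeR.R"
cmd = job_sh_edgeR.format(
    input_folder="/projects/example",        # os.getcwd() in the rule
    atype="junctions",                       # wildcards.feature_type
    control_name="DMSOA",                    # control_group_id
    test_name="compoundX",                   # test_group_id
    comparison_name="compoundX_DMSOA",       # wildcards.comparison
    sample_membership="DMSOA,DMSOA,compoundX,compoundX",  # one group label per sample
    filter_low="filter_low",
    core_path="/path/to/splicekit/core",     # os.path.dirname(splicekit.core.__file__)
)
print(cmd)
# R --no-save --args /projects/example junctions DMSOA compoundX compoundX_DMSOA DMSOA,DMSOA,compoundX,compoundX filter_low < /path/to/splicekit/core/comps_edgeR.R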

rule edgeR_compile:
input:
expand("results/edgeR/junctions/{comparison}_altsplice.tab.gz", comparison=comparisons_ids),
expand("results/edgeR/junctions/{comparison}_difffeature.tab.gz", comparison=comparisons_ids)
expand("results/edgeR/{feature_type}/{comparison}_altsplice.tab.gz", comparison=COMPARISONS, feature_type=["junctions", "exons", "donor_anchors", "acceptor_anchors", "genes"]),
expand("results/edgeR/{feature_type}/{comparison}_difffeature.tab.gz", comparison=COMPARISONS, feature_type=["junctions", "exons", "donor_anchors", "acceptor_anchors", "genes"])
output:
"results/edgeR/junctions_results_complete.tab.gz",
"results/edgeR/junctions_results_fdr005.tab.gz",
"results/edgeR/{feature_type}_results_complete.tab.gz",
"results/edgeR/{feature_type}_results_fdr005.tab.gz"
wildcard_constraints:
feature_type="junctions"
run:
print("ok2")
import splicekit
splicekit.core.report.edgeR_feature(wildcards.feature_type)

"""
# alpha numeric sort
def break_readout_id(item):
if type(item) is tuple:
item_process = item[0]
else:
item_process = item
for splitter in [" ", "_", "-"]:
item_temp = item_process.split(splitter)
# separates the string by splitter
# find first element that casts to int and return (int, str_without_element)
if len(item_temp)>1:
for item_index_test in range(0, len(item_temp)):
if item_temp[item_index_test].isdigit():
return (int(item_temp[item_index_test]), splitter.join(item_temp[:item_index_test]+item_temp[item_index_test+1:]))
if item_process.isdigit():
return int(item_process)
else:
return item_process
def sort_readout_id(data):
return sorted(data, key=break_readout_id)
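# Editorial illustration (not from the original file): break_readout_id splits an ID on " ", "_" or "-",
# keys on the first purely numeric token, so sample IDs sort alpha-numerically, e.g.
#   sort_readout_id(["sample_10", "sample_2", "sample_1"]) -> ["sample_1", "sample_2", "sample_10"]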
def make_short_names(text):
result = text
try:
short_names
except:
return result
for from_text, to_text, replace_type in short_names:
if replace_type=="complete":
if text==from_text:
result = to_text
else:
result = text.replace(from_text, to_text)
return result
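# Editorial illustration with a hypothetical short_names entry (not from the original file):
#   short_names = [("dimethyl_sulfoxide", "DMSO", "complete")]
#   make_short_names("dimethyl_sulfoxide")      -> "DMSO"    ("complete" requires an exact match)
#   make_short_names("dimethyl_sulfoxide_10uM") -> unchanged (no exact match)
# Any other replace_type substitutes from_text wherever it appears inside the text.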
def make_comparisons():
comparisons = []
comps = {}
comparisons_ids = []
treatments = {}
samples = set()
f = open("samples.tab")
r = f.readline()
while r.startswith("#"):
r = f.readline()
header = r.replace("\r", "").replace("\n", "").split("\t")
r = f.readline()
separates = set()
while r:
if r.startswith("#"):
r = f.readline()
continue
r = r.replace("\r", "").replace("\n", "").split("\t")
data = dict(zip(header, r))
sample_id = data[sample_column]
samples.add(sample_id)
treatment_id = data[treatment_column]
separates.add(data.get(separate_column, ""))
treatments.setdefault(treatment_id, []).append((sample_id, treatment_id, data.get(separate_column, ""), data.get(group_column, "")))
r = f.readline()
f.close()
for treatment, data in treatments.items(): # sort by sample ID
try:
treatments[treatment] = sort_readout_id(data)
except:
pass
dmso_hash = {}
dmso_letter = "A"
separates = sorted(separates) # sometimes the separates set was reshuffled, which was very difficult to pinpoint/debug
for sep in separates:
for test_sample, test_data in treatments.items():
if test_sample == control_name:
continue
temp_test = [(a,b,c,d) for (a,b,c,d) in test_data if c==sep]
group_test = set([d for (a,b,c,d) in temp_test])
if len(temp_test)==0:
continue
for control_sample, control_data in treatments.items():
if control_sample != control_name:
continue
temp_control = [(a,b,c,d) for (a,b,c,d) in control_data if c==sep]
if group_column!="":
temp_control = [(a,b,c,d) for (a,b,c,d) in temp_control if d in group_test]
if len(temp_control)==0:
continue
# add comparison to the list
# also add a unique control_group_id name, not only with _sep
# this is then useful for the contrast and design matrix for DGE analysis
if dmso_hash.get(tuple(temp_control), None)==None:
dmso_hash[tuple(temp_control)] = dmso_letter
dmso_letter = chr(ord(dmso_letter) + 1)
sep_temp = sep.replace(" ", "_").lower()
if sep_temp!="":
test_group_id = f"{test_sample}_{sep_temp}"
control_group_id = f"{control_sample}_{dmso_hash[tuple(temp_control)]}_{sep_temp}"
comparison_name = f"{test_sample}_{control_sample}{dmso_hash[tuple(temp_control)]}_{sep_temp}"
else:
test_group_id = f"{test_sample}"
control_group_id = f"{control_sample}{dmso_hash[tuple(temp_control)]}"
comparison_name = f"{test_sample}_{control_sample}{dmso_hash[tuple(temp_control)]}"
comparisons.append((make_short_names(comparison_name), temp_control, temp_test, make_short_names(control_group_id), make_short_names(test_group_id)))
comps[comparisons[-1][0]] = comparisons[-1]
comparisons_ids.append(make_short_names(comparison_name))
return comps, comparisons, comparisons_ids
comps, comparisons, comparisons_ids = make_comparisons()
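# Editorial illustration with hypothetical samples (not from the original file): each entry of
# comparisons is a 5-tuple (comparison_name, control_set, test_set, control_group_id, test_group_id), e.g.
#   ("compoundX_DMSOA",
#    [("s1", "DMSO", "", ""), ("s2", "DMSO", "", "")],
#    [("s3", "compoundX", "", ""), ("s4", "compoundX", "", "")],
#    "DMSOA", "compoundX")
# and comps indexes these same tuples by comparison_name.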
sample_membership = {}
for (comparison_name, control_set, test_set, control_group_id, test_group_id) in comparisons:
for (sample_id, _, _, _) in control_set:
# in some rare cases, the same sample can be part of different control groups
sample_membership[sample_id] = control_group_id
for (sample_id, _, _, _) in test_set:
# in some rare cases, the same sample can be part of different test groups
sample_membership[sample_id] = test_group_id
sample_membership = [sample_membership[sample_id] for sample_id in SAMPLES]
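# Editorial illustration (same hypothetical samples as above): with SAMPLES = ["s1", "s2", "s3", "s4"],
# the final list becomes sample_membership = ["DMSOA", "DMSOA", "compoundX", "compoundX"], i.e. one
# group label per sample in SAMPLES order, joined with "," and handed to comps_edgeR.R by the edgeR rule.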
"""
