Skip to content

Commit

Permalink
comps generated in Snakefile
Browse files Browse the repository at this point in the history
  • Loading branch information
grexor committed Nov 29, 2024
1 parent c7bd12c commit 896a92d
Show file tree
Hide file tree
Showing 2 changed files with 275 additions and 5 deletions.
279 changes: 274 additions & 5 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import os
import pandas as pd

container: "docker://ghcr.io/bedapub/splicekit:main"
threads = config.get("threads", 1)
available_threads = workflow.cores

samples_df = pd.read_csv("samples.tab", sep="\t", comment="#")
SAMPLES = samples_df["sample_id"].tolist()
Expand All @@ -17,23 +17,144 @@ if os.path.exists("splicekit.config"):

genome_version = f"--genome_version {genome_version}" if genome_version else ""

def make_short_names(text):
result = text
try:
short_names
except:
return result
for from_text, to_text, replace_type in short_names:
if replace_type=="complete":
if text==from_text:
result = to_text
else:
result = text.replace(from_text, to_text)
return result

def make_comparisons():
comparisons = []
comparisons_ids = []
treatments = {}
samples = set()
f = open("samples.tab")
r = f.readline()
while r.startswith("#"):
r = f.readline()
header = r.replace("\r", "").replace("\n", "").split("\t")
r = f.readline()
separates = set()
while r:
if r.startswith("#"):
r = f.readline()
continue
r = r.replace("\r", "").replace("\n", "").split("\t")
data = dict(zip(header, r))
sample_id = data[sample_column]
samples.add(sample_id)
treatment_id = data[treatment_column]
separates.add(data.get(separate_column, ""))
treatments.setdefault(treatment_id, []).append((sample_id, treatment_id, data.get(separate_column, ""), data.get(group_column, "")))
r = f.readline()
f.close()
for treatment, data in treatments.items(): # sort by sample ID
try:
treatments[treatment] = sort_readout_id(data)
except:
pass
dmso_hash = {}
dmso_letter = "A"
separates = sorted(separates) # sometimes the separates set was reshufled, very difficult to pinpoint/debug this
for sep in separates:
for test_sample, test_data in treatments.items():
if test_sample == control_name:
continue
temp_test = [(a,b,c,d) for (a,b,c,d) in test_data if c==sep]
group_test = set([d for (a,b,c,d) in temp_test])
if len(temp_test)==0:
continue
for control_sample, control_data in treatments.items():
if control_sample != control_name:
continue
temp_control = [(a,b,c,d) for (a,b,c,d) in control_data if c==sep]
if group_column!="":
temp_control = [(a,b,c,d) for (a,b,c,d) in temp_control if d in group_test]
if len(temp_control)==0:
continue
# add comparison to the list
# also add a unique control_group_id name, not only with _sep
# this is then useful for the contrast and design matrix for DGE analysis
if dmso_hash.get(tuple(temp_control), None)==None:
dmso_hash[tuple(temp_control)] = dmso_letter
dmso_letter = chr(ord(dmso_letter) + 1)
sep_temp = sep.replace(" ", "_").lower()
if sep_temp!="":
test_group_id = f"{test_sample}_{sep_temp}"
control_group_id = f"{control_sample}_{dmso_hash[tuple(temp_control)]}_{sep_temp}"
comparison_name = f"{test_sample}_{control_sample}{dmso_hash[tuple(temp_control)]}_{sep_temp}"
else:
test_group_id = f"{test_sample}"
control_group_id = f"{control_sample}{dmso_hash[tuple(temp_control)]}"
comparison_name = f"{test_sample}_{control_sample}{dmso_hash[tuple(temp_control)]}"
comparisons.append((make_short_names(comparison_name), temp_control, temp_test, make_short_names(control_group_id), make_short_names(test_group_id)))
comparisons_ids.append(make_short_names(comparison_name))
return comparisons, comparisons_ids

comparisons, comparisons_ids = make_comparisons()

rule all:
input:
# annotation
"annotation/comparisons.tab",

# bam files
expand("bam/{sample}.bam", sample=SAMPLES),

# sample junction counts from bam files
expand("data/sample_junctions_data/sample_{sample}_raw.tab.gz", sample=SAMPLES),

# GTF files
"reference/junctions.tab.gz",
"reference/acceptor_anchors.gtf.gz",
"reference/donor_anchors.gtf.gz",
"reference/exons.gtf.gz",
"reference/genes.gtf.gz",

# anchors
expand("data/sample_acceptor_anchors_data/sample_{sample}.tab.gz", sample=SAMPLES),
expand("logs/count_acceptor_anchors/sample_{sample}.tab.summary", sample=SAMPLES),
expand("data/sample_donor_anchors_data/sample_{sample}.tab.gz", sample=SAMPLES),
expand("logs/count_donor_anchors/sample_{sample}.tab.summary", sample=SAMPLES)
# anchor counts
"data/samples_acceptor_anchors_counts.tab.gz",
"data/samples_donor_anchors_counts.tab.gz",

# exons counts
expand("data/sample_exons_data/sample_{sample}.tab.gz", sample=SAMPLES),
"data/samples_exons_counts.tab.gz",

# junctions counts
expand("data/sample_junctions_data/sample_{sample}.tab.gz", sample=SAMPLES),
"data/samples_junctions_counts.tab.gz",

# genes counts
expand("data/sample_genes_data/sample_{sample}.tab.gz", sample=SAMPLES),
"data/samples_genes_counts.tab.gz",

# edgeR
"results/edgeR/junctions_results_complete.tab.gz",
"results/edgeR/junctions_results_fdr005.tab.gz"

rule setup:
input:
"samples.tab"
output:
"annotation/comparisons.tab"
shell:
"""
splicekit setup
splicekit annotation
"""

rule map_fastq:
threads: available_threads
input:
fastq1="fastq/{sample}_1.fastq.gz",
fastq2="fastq/{sample}_2.fastq.gz"
Expand All @@ -44,8 +165,8 @@ rule map_fastq:
echo mapping {{input.fastq1}} {{input.fastq2}} {{output}}
echo species = {species}
echo genome_version = {genome_version}
echo pybio star {species} {{input.fastq1}} {{input.fastq2}} {{output}} -t {threads} {genome_version}
pybio star {species} {{input.fastq1}} {{input.fastq2}} {{output}} -t {threads} {genome_version}
echo pybio star {species} {{input.fastq1}} {{input.fastq2}} {{output}} -t {{threads}} {genome_version}
pybio star {species} {{input.fastq1}} {{input.fastq2}} {{output}} -t {{threads}} {genome_version}
"""

rule junctions:
Expand All @@ -60,6 +181,17 @@ rule junctions:
python /usr/splicekit/splicekit/core/junctions.py {input} {params.fname}
"""

rule junctions_per_sample:
input:
expand("data/sample_junctions_data/sample_{sample}_raw.tab.gz", sample=SAMPLES),
"reference/junctions.tab.gz"
output:
expand("data/sample_junctions_data/sample_{sample}.tab.gz", sample=SAMPLES),
run:
import splicekit
splicekit.core.annotation.make_comparisons()
splicekit.core.junctions.junctions_per_sample()

rule junctions_make_master:
input:
expand("data/sample_junctions_data/sample_{sample}_raw.tab.gz", sample=SAMPLES)
Expand All @@ -82,6 +214,22 @@ rule anchors_gtf:
python -c 'import splicekit; splicekit.core.anchors.write_anchor_gtf();'
"""

rule exons_gtf:
output:
"reference/exons.gtf.gz",
shell:
"""
python -c 'import splicekit; splicekit.core.exons.write_exons_gtf()'
"""

rule genes_gtf:
output:
"reference/genes.gtf.gz",
shell:
"""
python -c 'import splicekit; splicekit.core.genes.write_genes_gtf()'
"""

rule anchors:
params:
library_type_insert = {"single-end":"", "paired-end":"-p "}[library_type],
Expand All @@ -105,3 +253,124 @@ rule anchors:
mv {params.tab_fname}.summary {params.logs_dir}
gzip -f {params.tab_fname}
"""

rule exons:
params:
library_type_insert = {"single-end":"", "paired-end":"-p "}[library_type],
library_strand_insert = {"FIRST_READ_TRANSCRIPTION_STRAND":1, "SINGLE_STRAND":1, "SINGLE_REVERSE":1, "SECOND_READ_TRANSCRIPTION_STRAND":2, "NONE":0}[library_strand],
tab_fname = lambda wildcards: f"data/sample_exons_data/sample_{wildcards.sample}.tab",
logs_dir = lambda wildcards: f'logs/count_exons'
input:
gtf_fname = "reference/exons.gtf.gz",
bam_fname = "bam/{sample}.bam"
output:
"data/sample_exons_data/sample_{sample}.tab.gz"
shell:
"""
featureCounts {params.library_type_insert}-s {params.library_strand_insert} -M -O -T {threads} -F GTF -f -t exon -g exon_id -a {input.gtf_fname} -o {params.tab_fname} {input.bam_fname}
cp {params.tab_fname} {params.tab_fname}_temp
echo "anchor_id\tcount" >| {params.tab_fname}
tail -n +3 {params.tab_fname}_temp| cut -f1,7 >> {params.tab_fname}
rm {params.tab_fname}_temp
mv {params.tab_fname}.summary {params.logs_dir}
gzip -f {params.tab_fname}
"""

rule genes:
params:
library_type_insert = {"single-end":"", "paired-end":"-p "}[library_type],
library_strand_insert = {"FIRST_READ_TRANSCRIPTION_STRAND":1, "SINGLE_STRAND":1, "SINGLE_REVERSE":1, "SECOND_READ_TRANSCRIPTION_STRAND":2, "NONE":0}[library_strand],
tab_fname = lambda wildcards: f"data/sample_genes_data/sample_{wildcards.sample}.tab",
logs_dir = lambda wildcards: f'logs/count_genes'
input:
gtf_fname = "reference/genes.gtf.gz",
bam_fname = "bam/{sample}.bam"
output:
"data/sample_genes_data/sample_{sample}.tab.gz"
shell:
"""
featureCounts {params.library_type_insert}-s {params.library_strand_insert} -M -O -T {threads} -F GTF -f -t exon -g gene_id -a {input.gtf_fname} -o {params.tab_fname} {input.bam_fname}
cp {params.tab_fname} {params.tab_fname}_temp
echo "anchor_id\tcount" >| {params.tab_fname}
tail -n +3 {params.tab_fname}_temp| cut -f1,7 >> {params.tab_fname}
rm {params.tab_fname}_temp
mv {params.tab_fname}.summary {params.logs_dir}
gzip -f {params.tab_fname}
"""

rule anchors_counts:
input:
expand("data/sample_donor_anchors_data/sample_{sample}.tab.gz", sample=SAMPLES),
expand("data/sample_acceptor_anchors_data/sample_{sample}.tab.gz", sample=SAMPLES),
"reference/junctions.tab.gz"
output:
"data/samples_acceptor_anchors_counts.tab.gz",
"data/samples_donor_anchors_counts.tab.gz"
run:
import splicekit
splicekit.core.annotation.make_comparisons()
splicekit.core.features.load_genes()
splicekit.core.features.read_anchors("donor")
splicekit.core.features.read_anchors("acceptor")
os.system("rm -f data/comparison_donor_anchors_data/*.tab.gz > /dev/null 2>&1")
os.system("rm -f data/comparison_acceptor_anchors_data/*.tab.gz > /dev/null 2>&1")
splicekit.core.features.make_counts_table("donor_anchors")
splicekit.core.features.make_counts_table("acceptor_anchors")

rule junctions_count:
input:
expand("data/sample_junctions_data/sample_{sample}.tab.gz", sample=SAMPLES),
"reference/junctions.tab.gz"
output:
"data/samples_junctions_counts.tab.gz"
run:
import splicekit
splicekit.core.annotation.make_comparisons()
splicekit.core.junctions.junctions_per_sample()
splicekit.core.features.load_genes()
splicekit.core.features.read_junctions()
splicekit.core.features.make_counts_table("junctions")

rule exons_count:
input:
expand("data/sample_exons_data/sample_{sample}.tab.gz", sample=SAMPLES),
"reference/junctions.tab.gz"
output:
"data/samples_exons_counts.tab.gz"
run:
import splicekit
splicekit.core.annotation.make_comparisons()
splicekit.core.features.load_genes()
splicekit.core.features.read_exons()
splicekit.core.features.make_counts_table("exons")

rule genes_count:
input:
expand("data/sample_genes_data/sample_{sample}.tab.gz", sample=SAMPLES),
output:
"data/samples_genes_counts.tab.gz"
run:
import splicekit
splicekit.core.annotation.make_comparisons()
splicekit.core.features.load_genes()
splicekit.core.features.read_genes()
splicekit.core.features.make_counts_table("genes")

rule edgeR:
input:
"annotation/comparison_{comparison}.tab"
output:
"results/edgeR/junctions/{comparison}_altsplice.tab.gz",
"results/edgeR/junctions/{comparison}_difffeature.tab.gz"
run:
print("ok1")

rule edgeR_compile:
input:
expand("results/edgeR/junctions/{comparison}_altsplice.tab.gz", comparison=comparisons_ids),
expand("results/edgeR/junctions/{comparison}_difffeature.tab.gz", comparison=comparisons_ids)
output:
"results/edgeR/junctions_results_complete.tab.gz",
"results/edgeR/junctions_results_fdr005.tab.gz",
run:
print("ok2")
1 change: 1 addition & 0 deletions splicekit/core/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ def write_comparisons():
job_rmats_instance = job_rmats.format(container=splicekit.config.container, core_path=os.path.dirname(core.__file__), comparison_name=comparison_name, job_name="rmats_"+comparison_name, gtf_path=splicekit.config.gtf_path[:-3])
f_rmats.write(job_rmats_instance)
f_rmats.close()

row = [comparison_name, ",".join(str(el) for el in control_ids), ",".join(str(el) for el in test_ids), control_group_id, test_group_id]
comps_table.write("\t".join(row) + "\n")
comps_single = open(f"annotation/comparison_{comparison_name}.tab", "wt")
Expand Down

0 comments on commit 896a92d

Please sign in to comment.