Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove LSF-specific args, fix rule naming, add support for Ultima Genomics #75

Merged
merged 13 commits into from
Jan 30, 2024
Merged
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
.Rproj*
*.Rproj
.RData
.Rhistory
.Rproj.user
*.DS_Store
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ Notes: `total_impact` is set to 5 for each sample, change this to control how ma
| Platform | Library (BC+UMI+A) | Setting | Test data |
| :--------|:------------| :------------| :---------|
| 10x Chromium V3 | [16 + 12 + 30](https://teichlab.github.io/scg_lib_structs/methods_html/10xChromium3.html) | chromiumV3 | ✓ |
| 10x V3 - Ultima Genomics | [adapter + 16 + 9 + 3 ignored + 8](https://www.nature.com/articles/s41587-022-01452-6/figures/1) | chromiumV3UG | |
| 10x Chromium V2 | [16 + 10 + 30](https://teichlab.github.io/scg_lib_structs/methods_html/10xChromium3.html) | chromiumV2 | ✓ |
| 10x Chromium Visium | [16 + 10 + 30](https://teichlab.github.io/scg_lib_structs/methods_html/10xChromium3.html) | visium | |
| Drop-seq | [12 + 8 + 30](https://teichlab.github.io/scg_lib_structs/methods_html/Drop-seq.html) | dropseq | ✓ |
Expand Down
17 changes: 11 additions & 6 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ configfile: "config.yaml"

DATA = config["DATA"]
RESULTS = config["RESULTS"]
PAIRED_SAMPLES = config["PAIRED_SAMPLES"]
R1_SAMPLES = config["R1_SAMPLES"]
R2_SAMPLES = config["R2_SAMPLES"]
STAR_INDEX = config["STAR_INDEX"]
Expand All @@ -14,12 +15,18 @@ FASTQS = config["FASTQS"]
WHITELIST_V2 = config["WHITELIST_V2"]
WHITELIST_V3 = config["WHITELIST_V3"]
STAR = config["STAR"]
READS = ["R1", "R2"]
READS = ["paired", "R1", "R2"]

import json
with open('chemistry.json') as fp:
chemistry = json.load(fp)

# paired positional approach
PAIRs = []
if PAIRED_SAMPLES:
PAIRs.extend(expand("{results}/counts/{sample}_{read}_counts.tsv.gz", results = RESULTS, sample = PAIRED_SAMPLES, read = "paired"))
PAIRs.extend(expand("{results}/{sample}/{sample}_{read}_Aligned.sortedByCoord.out.bam", results = RESULTS, sample = PAIRED_SAMPLES, read = "paired"))
PAIRs.extend(expand("{results}/bed/{sample}_{read}.bed.gz", results = RESULTS, sample = PAIRED_SAMPLES, read = "paired"))
# read 1 positional approach
R1s = []
if R1_SAMPLES:
Expand All @@ -33,16 +40,14 @@ if R2_SAMPLES:
R2s.extend(expand("{results}/{sample}/{sample}_{read}_Aligned.sortedByCoord.out.bam", results = RESULTS, sample = R2_SAMPLES, read = "R2"))
R2s.extend(expand("{results}/bed/{sample}_{read}.bed.gz", results = RESULTS, sample = R2_SAMPLES, read = "R2"))
# combine
R12s = R1s + R2s # +expand("{results}/report/multiqc_report.html", results = RESULTS)
R12s = R12s + expand("{data}/{sample}_R1.fastq.gz", data = DATA, sample = R1_SAMPLES) + expand("{data}/{sample}_R2.fastq.gz", data = DATA, sample = R1_SAMPLES)
print(R12s)
all_outputs = PAIRs + R1s + R2s #+ expand("{results}/report/multiqc_report.html", results = RESULTS)
print(all_outputs)

rule all:
input:
R12s = R12s
all_outputs = all_outputs

include: "rules/check_versions.snake"
include: "rules/download.snake"
include: "rules/cutadapt_star.snake"
include: "rules/count.snake"
include: "rules/qc.snake"
25 changes: 18 additions & 7 deletions chemistry.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,36 +3,47 @@
"length": "58",
"bc_cut": "",
"R1": "[WHITELIST_V3,\"--soloUMIlen 12 --clip5pNbases 58 0 --soloCBstart 1 --soloCBlen 16 --soloUMIstart 17\"]",
"R2": "[WHITELIST_V3,\"--soloUMIlen 12\"]"
"R2": "[WHITELIST_V3,\"--soloUMIlen 12\"]",
"paired": "[\"--alignEndsProtrude 58 ConcordantPair\"]"
},
"chromiumV3UG": {
"length": "58",
"bc_cut": "",
"R1": "[WHITELIST_V3,\"--soloUMIlen 9 --clip5pNbases 58 --soloCBstart 23 --soloCBlen 16 --soloUMIstart 39\"]"
},
"chromiumV2": {
"length": "56",
"bc_cut": "",
"R1": "[WHITELIST_V2,\"--soloUMIlen 10 --clip5pNbases 56 0 --soloCBstart 1 --soloCBlen 16 --soloUMIstart 17\"]",
"R2": "[WHITELIST_V2,\"--soloUMIlen 10\"]"
"R2": "[WHITELIST_V2,\"--soloUMIlen 10\"]",
"paired": "[\"--alignEndsProtrude 56 ConcordantPair\"]"
},
"dropseq": {
"length": "50",
"bc_cut": "",
"R1": "[\"None --soloUMIlen 8 --clip5pNbases 50 0 --soloCBstart 1 --soloCBlen 12 --soloUMIstart 13\"]",
"R2": "[\"None --soloUMIlen 8 --soloCBstart 1 --soloCBlen 12 --soloUMIstart 13\"]"
"R2": "[\"None --soloUMIlen 8 --soloCBstart 1 --soloCBlen 12 --soloUMIstart 13\"]",
"paired": "[\"--alignEndsProtrude 50 ConcordantPair\"]"
},
"microwellseq": {
"length": "54",
"bc_cut": "CGACTCACTACAGGG...TCGGTGACACGATCG",
"R1": "[\"None --soloUMIlen 6 --clip5pNbases 54 0 --soloCBstart 1 --soloCBlen 18 --soloUMIstart 19\"]",
"R2": "[\"None --soloUMIlen 6 --soloCBstart 1 --soloCBlen 18 --soloUMIstart 19\"]"
"R2": "[\"None --soloUMIlen 6 --soloCBstart 1 --soloCBlen 18 --soloUMIstart 19\"]",
"paired": "[\"--alignEndsProtrude 54 ConcordantPair\"]"
},
"bd": {
"length": "53",
"bc_cut": "ACTGGCCTGCGA...GGTAGCGGTGACA",
"R1": "[\"None --soloUMIlen 8 --clip5pNbases 53 0 --soloCBstart 1 --soloCBlen 27 --soloUMIstart 28\"]",
"R2": "[\"None --soloUMIlen 8 --soloCBstart 1 --soloCBlen 27 --soloUMIstart 28\"]"
"R2": "[\"None --soloUMIlen 8 --soloCBstart 1 --soloCBlen 27 --soloUMIstart 28\"]",
"paired": "[\"--alignEndsProtrude 53 ConcordantPair\"]"
},
"indrop": {
"length": "32",
"bc_cut": "",
"R1": "[\"None --soloUMIlen 6 --clip5pNbases 32 0 --soloCBstart 1 --soloCBlen 8 --soloUMIstart 9\"]",
"R2": "[\"None --soloUMIlen 6 --soloCBstart 1 --soloCBlen 8 --soloUMIstart 9\"]"
"R2": "[\"None --soloUMIlen 6 --soloCBstart 1 --soloCBlen 8 --soloUMIstart 9\"]",
"paired": "[\"--alignEndsProtrude 32 ConcordantPair\"]"
}
}
}
2 changes: 2 additions & 0 deletions chemistry_to_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import pandas
chromiumV3 = {"length" : "58", "bc_cut" : "", "R1" : '[WHITELIST_V3,"--soloUMIlen 12 --clip5pNbases 58 0 --soloCBstart 1 --soloCBlen 16 --soloUMIstart 17"]', "R2" : '[WHITELIST_V3,"--soloUMIlen 12"]'}

chromiumV3UG = {"length": "58", "bc_cut": "", "R1": '[WHITELIST_V3,"--soloUMIlen 9 --clip5pNbases 58 --soloCBstart 23 --soloCBlen 16 --soloUMIstart 39"]', "R2": '[WHITELIST_V3,"--soloUMIlen 9"]'}

chromiumV2 = {"length" : "56", "bc_cut" : "", "R1" : '[WHITELIST_V2,"--soloUMIlen 10 --clip5pNbases 56 0 --soloCBstart 1 --soloCBlen 16 --soloUMIstart 17"]', "R2" : '[WHITELIST_V2,"--soloUMIlen 10"]'}

dropseq = {"length" : "50", "bc_cut" : "", "R1" : '["None --soloUMIlen 8 --clip5pNbases 50 0 --soloCBstart 1 --soloCBlen 12 --soloUMIstart 13"]', "R2" : '["None --soloUMIlen 8 --soloCBstart 1 --soloCBlen 12 --soloUMIstart 13"]'}
Expand Down
5 changes: 4 additions & 1 deletion config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,15 @@ POLYA_SITES:
FASTQS:
"sample_fastqs.tsv"

R1_SAMPLES:
PAIRED_SAMPLES:
# Sample basenames
- test
- test2
- test3

R1_SAMPLES:
# Sample basenames

R2_SAMPLES:
- test
- test2
Expand Down
24 changes: 15 additions & 9 deletions inst/scripts/filter_bam_correct.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
suitable for cellranger and starsolo output
"""

def correct_bam_read1(bam, outbam, target_len, filter_cut):
def correct_bam_read1(bam, outbam, target_len, filter_cut, single_end):

samfile = pysam.AlignmentFile(bam, "rb")
outfile = pysam.AlignmentFile(outbam, "wb", template = samfile)
Expand All @@ -28,14 +28,16 @@ def correct_bam_read1(bam, outbam, target_len, filter_cut):
if read.is_unmapped:
# not mapped, toss
continue

if not read.is_proper_pair:

if not single_end:
if not read.is_proper_pair:
# not properly paired, toss
continue
continue

if not read.is_read1:
if not single_end:
if not read.is_read1:
# not read1, toss
continue
continue

j += 1

Expand Down Expand Up @@ -115,16 +117,20 @@ def main():
help ="""maximum nts off of target_len to keep""",
default = 10,
required = False)

parser.add_argument('-s',
'--single',
action="store_true",
required = False)
args=parser.parse_args()

in_bam = args.inbam
out_bam = args.outbam
target_len = int(args.targetlen)
filter_cut = float(args.filtercut)

single_end = args.single
print(single_end)
print("settings- target_len: ", target_len, "; filter_cut: ", filter_cut)
k,i,j = correct_bam_read1(in_bam, out_bam, target_len = target_len, filter_cut = filter_cut)
k,i,j = correct_bam_read1(in_bam, out_bam, target_len = target_len, filter_cut = filter_cut, single_end = single_end)
print("fraction of reads kept: ", i/j)
print("fraction of reads corrected: ", k/i)

Expand Down
3 changes: 2 additions & 1 deletion rules/check_versions.snake
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@ rule check:
checked = "{data}/check_versions.txt"
params:
job_name = "check",
memory = "select[mem>5] rusage[mem=5]", # LSF format; change as needed
log:
"{data}/logs/check_versions.txt"
resources:
mem_mb = 5000
threads:
1
shell:
Expand Down
Loading
Loading