Merge pull request #551 from broadinstitute/dp-demux
demux_deplete: make depletion optional, load biosample metadata to tables
dpark01 authored Aug 21, 2024
2 parents bb3cbe6 + 1d747a4 commit 5623df0
Showing 6 changed files with 247 additions and 221 deletions.
95 changes: 91 additions & 4 deletions pipes/WDL/tasks/tasks_ncbi.wdl
@@ -479,12 +479,15 @@ task sra_meta_prep {
 lib_to_bams = {}
 sample_to_biosample = {}
 for bam in bam_uris:
-    # filename must be <libraryname>.<flowcell>.<lane>.cleaned.bam
-    assert bam.endswith('.cleaned.bam'), "filename does not end in .cleaned.bam: {}".format(bam)
+    # filename must be <libraryname>.<flowcell>.<lane>.cleaned.bam or <libraryname>.<flowcell>.<lane>.bam
     bam_base = os.path.basename(bam)
     bam_parts = bam_base.split('.')
-    assert len(bam_parts) >= 5, "filename does not conform to <libraryname>.<flowcell>.<lane>.cleaned.bam -- {}".format(bam_base)
-    lib = '.'.join(bam_parts[:-4])
+    assert bam_parts[-1] == 'bam', "filename does not end in .bam -- {}".format(bam)
+    bam_parts = bam_parts[:-1]
+    if bam_parts[-1] == 'cleaned':
+        bam_parts = bam_parts[:-1]
+    assert len(bam_parts) >= 3, "filename does not conform to <libraryname>.<flowcell>.<lane>.cleaned.bam -- {}".format(bam_base)
+    lib = '.'.join(bam_parts[:-2]) # drop flowcell and lane
     lib_to_bams.setdefault(lib, [])
     lib_to_bams[lib].append(bam_base)
     print("debug: registering lib={} bam={}".format(lib, bam_base))
@@ -554,6 +557,90 @@ task sra_meta_prep {
}
}
task biosample_to_table {
meta {
description: "Reformats a BioSample registration attributes table (attributes.tsv) for ingest into a Terra table."
}
input {
File biosample_attributes_tsv
Array[File] cleaned_bam_filepaths
File demux_meta_json
String sample_table_name = "sample"
String docker = "python:slim"
}
String sanitized_id_col = "entity:~{sample_table_name}_id"
String base = basename(basename(biosample_attributes_tsv, ".txt"), ".tsv")
parameter_meta {
cleaned_bam_filepaths: {
description: "Unaligned bam files containing cleaned (submittable) reads.",
localization_optional: true,
stream: true,
patterns: ["*.bam"]
}
}
command <<<
set -ex -o pipefail
python3 << CODE
import os.path
import csv
import json
# load demux metadata
with open("~{demux_meta_json}", 'rt') as inf:
    demux_meta_by_file = json.load(inf)
# load list of bams surviving filters
bam_fnames = list(os.path.basename(x) for x in '~{sep="*" cleaned_bam_filepaths}'.split('*'))
bam_fnames = list(x[:-len('.bam')] if x.endswith('.bam') else x for x in bam_fnames)
bam_fnames = list(x[:-len('.cleaned')] if x.endswith('.cleaned') else x for x in bam_fnames)
print("bam basenames ({}): {}".format(len(bam_fnames), bam_fnames))
sample_to_sanitized = {demux_meta_by_file.get(x, {}).get('sample_original'): demux_meta_by_file.get(x, {}).get('sample') for x in bam_fnames}
if None in sample_to_sanitized:
    del sample_to_sanitized[None]
sample_names_seen = sample_to_sanitized.keys()
print("samples seen ({}): {}".format(len(sample_names_seen), sorted(sample_names_seen)))
# load biosample metadata
biosample_attributes = []
biosample_headers = ['biosample_accession']
with open('~{biosample_attributes_tsv}', 'rt') as inf:
    for row in csv.DictReader(inf, delimiter='\t'):
        if row['sample_name'] in sample_names_seen and row['message'] == "Successfully loaded":
            row['biosample_accession'] = row.get('accession')
            biosample_attributes.append(row)
            for k,v in row.items():
                if v.strip().lower() in ('missing', 'na', 'not applicable', 'not collected', ''):
                    v = None
                if v and (k not in biosample_headers) and k not in ('message', 'accession'):
                    biosample_headers.append(k)
print("biosample headers ({}): {}".format(len(biosample_headers), biosample_headers))
print("biosample rows ({})".format(len(biosample_attributes)))
samples_seen_without_biosample = set(sample_names_seen) - set(row['sample_name'] for row in biosample_attributes)
print("samples seen in bams without biosample entries ({}): {}".format(len(samples_seen_without_biosample), sorted(samples_seen_without_biosample)))
# write reformatted table
with open('~{base}.entities.tsv', 'wt') as outf:
    writer = csv.DictWriter(outf, delimiter='\t', fieldnames=["~{sanitized_id_col}"]+biosample_headers, quoting=csv.QUOTE_MINIMAL)
    writer.writeheader()
    for row in biosample_attributes:
        outrow = {h: row[h] for h in biosample_headers}
        outrow["~{sanitized_id_col}"] = sample_to_sanitized[row['sample_name']]
        writer.writerow(outrow)
CODE
>>>
output {
File sample_meta_tsv = "~{base}.entities.tsv"
}
runtime {
docker: docker
memory: "1 GB"
cpu: 1
dx_instance_type: "mem1_ssd1_v2_x2"
maxRetries: 2
}
}
task biosample_to_genbank {
meta {
description: "Prepares two input metadata files for Genbank submission based on a BioSample registration attributes table (attributes.tsv) since all of the necessary values are there. This produces both a Genbank Source Modifier Table and a BioSample ID map file that can be fed into the prepare_genbank task."
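In short, the new biosample_to_table task keeps only attributes.tsv rows that were "Successfully loaded" and that correspond to a sample present in the cleaned bams, then writes them keyed by the sanitized sample ID for Terra ingest. A small self-contained sketch of that reshaping on hypothetical data (the sample names, accessions, and columns below are made up for illustration):

import csv, io

# hypothetical demux metadata: original sample name -> sanitized Terra sample id
demux_meta = {
    "SAMPLE-01.l1.AHXYZ7.1": {"sample_original": "Sample 01", "sample": "Sample-01"},
}
sample_to_sanitized = {v["sample_original"]: v["sample"] for v in demux_meta.values()}

# hypothetical BioSample attributes.tsv content
attributes_tsv = (
    "accession\tsample_name\tmessage\tcollection_date\tgeo_loc_name\n"
    "SAMN00000001\tSample 01\tSuccessfully loaded\t2024-01-15\tUSA\n"
    "SAMN00000002\tSample 99\tSuccessfully loaded\t2024-01-16\tUSA\n"  # no bam for this sample -> dropped
)

biosample_headers = ["biosample_accession"]
biosample_attributes = []
for row in csv.DictReader(io.StringIO(attributes_tsv), delimiter='\t'):
    if row["sample_name"] in sample_to_sanitized and row["message"] == "Successfully loaded":
        row["biosample_accession"] = row["accession"]
        biosample_attributes.append(row)
        for k, v in row.items():
            if v and k not in biosample_headers and k not in ("message", "accession"):
                biosample_headers.append(k)

out = io.StringIO()
writer = csv.DictWriter(out, delimiter='\t',
                        fieldnames=["entity:sample_id"] + biosample_headers,
                        quoting=csv.QUOTE_MINIMAL)
writer.writeheader()
for row in biosample_attributes:
    outrow = {h: row[h] for h in biosample_headers}
    outrow["entity:sample_id"] = sample_to_sanitized[row["sample_name"]]
    writer.writerow(outrow)
print(out.getvalue())
# tab-separated output:
# entity:sample_id  biosample_accession  sample_name  collection_date  geo_loc_name
# Sample-01         SAMN00000001         Sample 01    2024-01-15       USA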
11 changes: 9 additions & 2 deletions pipes/WDL/tasks/tasks_reports.wdl
@@ -393,7 +393,7 @@ task align_and_count {
     File ref_db
     Int topNHits = 3
-    Boolean filter_bam_to_proper_primary_mapped_reads = false
+    Boolean filter_bam_to_proper_primary_mapped_reads = true
     Boolean do_not_require_proper_mapped_pairs_when_filtering = false
     Boolean keep_singletons_when_filtering = false
     Boolean keep_duplicates_when_filtering = false
@@ -456,6 +458,8 @@ task align_and_count {
TOTAL_COUNT_OF_TOP_HIT=$(grep -E "^($TOP_HIT)" "~{reads_basename}.count.~{ref_basename}.txt" | cut -f3 | tee TOTAL_COUNT_OF_TOP_HIT)
TOTAL_COUNT_OF_LESSER_HITS=$((grep -vE "^(\*|$TOP_HIT)" "~{reads_basename}.count.~{ref_basename}.txt" || echo "0" ) | cut -f3 | paste -sd+ - | bc -l | tee TOTAL_COUNT_OF_LESSER_HITS)
echo $TOTAL_COUNT_OF_TOP_HIT | tee TOTAL_COUNT_OF_TOP_HIT
echo $TOTAL_COUNT_OF_LESSER_HITS | tee TOTAL_COUNT_OF_LESSER_HITS
if [ $TOTAL_COUNT_OF_LESSER_HITS -ne 0 -o $TOTAL_COUNT_OF_TOP_HIT -ne 0 ]; then
PCT_MAPPING_TO_LESSER_HITS=$( echo "scale=3; 100 * $TOTAL_COUNT_OF_LESSER_HITS / ($TOTAL_COUNT_OF_LESSER_HITS + $TOTAL_COUNT_OF_TOP_HIT)" | \
@@ -466,6 +468,7 @@ task align_and_count {
fi
TOTAL_READS_IN_INPUT=$(samtools view -c "~{reads_basename}.bam")
echo $TOTAL_READS_IN_INPUT | tee TOTAL_READS_IN_INPUT
if [ $TOTAL_READS_IN_INPUT -eq 0 ]; then
echo "no reads in input bam"
PCT_OF_INPUT_READS_MAPPED=$(echo "0" | tee "~{reads_basename}.count.~{ref_basename}.pct_total_reads_mapped.txt")
@@ -480,7 +483,11 @@
File report_top_hits = "~{reads_basename}.count.~{ref_basename}.top_~{topNHits}_hits.txt"
String top_hit_id = read_string("~{reads_basename}.count.~{ref_basename}.top.txt")
Int reads_total = read_int("TOTAL_READS_IN_INPUT")
Int reads_mapped_top_hit = read_int("TOTAL_COUNT_OF_TOP_HIT")
Int reads_mapped = read_int("TOTAL_COUNT_OF_LESSER_HITS") + read_int("TOTAL_COUNT_OF_TOP_HIT")
String pct_total_reads_mapped = read_string('~{reads_basename}.count.~{ref_basename}.pct_total_reads_mapped.txt')
String pct_lesser_hits_of_mapped = read_string('~{reads_basename}.count.~{ref_basename}.pct_lesser_hits_of_mapped.txt')
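The output block above exposes the top-hit read count, a combined mapped-read count, and two percentages derived from the same totals. A hedged sketch with made-up counts showing how those numbers relate to one another; the task itself does this arithmetic in bash, and the formulas here are inferred from the visible bc call and the output names:

# Made-up counts, illustrating how the summary numbers relate.
top_hit_reads = 9800       # TOTAL_COUNT_OF_TOP_HIT
lesser_hit_reads = 200     # TOTAL_COUNT_OF_LESSER_HITS
total_input_reads = 20000  # TOTAL_READS_IN_INPUT

reads_mapped = top_hit_reads + lesser_hit_reads
pct_lesser_hits_of_mapped = 100.0 * lesser_hit_reads / reads_mapped
pct_total_reads_mapped = 100.0 * reads_mapped / total_input_reads

print(reads_mapped)                         # 10000
print(round(pct_lesser_hits_of_mapped, 3))  # 2.0
print(round(pct_total_reads_mapped, 3))     # 50.0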