Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

demux_deplete: make depletion optional, load biosample metadata to tables #551

Merged
merged 30 commits into from
Aug 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
9417803
make depletion databases optional for demux_deplete
dpark01 Aug 19, 2024
664c788
re-order independent code blocks
dpark01 Aug 19, 2024
7b447fd
initial attempt at biosample to sample table percolation
dpark01 Aug 19, 2024
246136c
oops bugfix input to multiqc cleaned
dpark01 Aug 20, 2024
e0b4f99
less noisy outputs
dpark01 Aug 20, 2024
76d560f
make sra_meta_prep tolerant of bam files without ".cleaned" in the fi…
dpark01 Aug 20, 2024
e0f2fe8
syntax error
dpark01 Aug 20, 2024
40fbbc4
restore output needed for other workflow
dpark01 Aug 20, 2024
cd14cce
syntax error again
dpark01 Aug 20, 2024
05ac4cf
debug output
dpark01 Aug 20, 2024
27bc946
bugfix string handling
dpark01 Aug 20, 2024
05e43e2
fix
dpark01 Aug 20, 2024
5bbbc75
bug fix
dpark01 Aug 20, 2024
0e9fe90
more debug
dpark01 Aug 20, 2024
26e67f3
some code simplification of terra.create_or_update_sample_tables
dpark01 Aug 20, 2024
7b51e63
more refactor/clean
dpark01 Aug 20, 2024
2a118d7
update default
dpark01 Aug 20, 2024
24ac092
take sample original name as single value not array
dpark01 Aug 20, 2024
a94f5dc
cleanup, bugfix, debug
dpark01 Aug 20, 2024
9f7aa70
remove vestigial input
dpark01 Aug 20, 2024
1722a0c
bugfix
dpark01 Aug 20, 2024
2447f38
bugfix
dpark01 Aug 20, 2024
7e933e5
debug
dpark01 Aug 20, 2024
74d6d57
try to fix edge case
dpark01 Aug 20, 2024
95a5171
code cleanups
dpark01 Aug 20, 2024
6eb1940
debug
dpark01 Aug 21, 2024
87b568a
properly handle NA/NaN values in pandas dataframe
dpark01 Aug 21, 2024
7ccffdb
move code blocks closer together
dpark01 Aug 21, 2024
f588bbd
code cleanup and reporting lines
dpark01 Aug 21, 2024
1d747a4
parameterize table names
dpark01 Aug 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 91 additions & 4 deletions pipes/WDL/tasks/tasks_ncbi.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -479,12 +479,15 @@ task sra_meta_prep {
lib_to_bams = {}
sample_to_biosample = {}
for bam in bam_uris:
# filename must be <libraryname>.<flowcell>.<lane>.cleaned.bam
assert bam.endswith('.cleaned.bam'), "filename does not end in .cleaned.bam: {}".format(bam)
# filename must be <libraryname>.<flowcell>.<lane>.cleaned.bam or <libraryname>.<flowcell>.<lane>.bam
bam_base = os.path.basename(bam)
bam_parts = bam_base.split('.')
assert len(bam_parts) >= 5, "filename does not conform to <libraryname>.<flowcell>.<lane>.cleaned.bam -- {}".format(bam_base)
lib = '.'.join(bam_parts[:-4])
assert bam_parts[-1] == 'bam', "filename does not end in .bam -- {}".format(bam)
bam_parts = bam_parts[:-1]
if bam_parts[-1] == 'cleaned':
bam_parts = bam_parts[:-1]
assert len(bam_parts) >= 3, "filename does not conform to <libraryname>.<flowcell>.<lane>.cleaned.bam -- {}".format(bam_base)
lib = '.'.join(bam_parts[:-2]) # drop flowcell and lane
lib_to_bams.setdefault(lib, [])
lib_to_bams[lib].append(bam_base)
print("debug: registering lib={} bam={}".format(lib, bam_base))
Expand Down Expand Up @@ -554,6 +557,90 @@ task sra_meta_prep {
}
}

# Ingests NCBI BioSample registration results into a Terra-loadable sample table:
# joins attributes.tsv rows against the samples actually present among the cleaned
# bam files, and emits a TSV whose first column is the Terra entity id column
# ("entity:<sample_table_name>_id").
task biosample_to_table {
  meta {
    description: "Reformats a BioSample registration attributes table (attributes.tsv) for ingest into a Terra table."
  }
  input {
    # NCBI BioSample registration output (attributes.tsv); rows are read via
    # csv.DictReader and must provide sample_name, accession, and message columns.
    File biosample_attributes_tsv
    # Cleaned (submittable) unaligned bams; only their basenames are inspected.
    Array[File] cleaned_bam_filepaths
    # Demux metadata JSON keyed by bam basename; each entry is expected to carry
    # "sample" (sanitized) and "sample_original" keys.
    File demux_meta_json

    # Name of the target Terra table; the id column becomes "entity:~{sample_table_name}_id".
    String sample_table_name = "sample"
    String docker = "python:slim"
  }
  String sanitized_id_col = "entity:~{sample_table_name}_id"
  # Output basename: input filename with a trailing .tsv or .txt extension removed.
  String base = basename(basename(biosample_attributes_tsv, ".txt"), ".tsv")
  parameter_meta {
    cleaned_bam_filepaths: {
      description: "Unaligned bam files containing cleaned (submittable) reads.",
      localization_optional: true,
      stream: true,
      patterns: ["*.bam"]
    }
  }
  command <<<
    set -ex -o pipefail
    python3 << CODE
    import os.path
    import csv
    import json

    # load demux metadata
    with open("~{demux_meta_json}", 'rt') as inf:
      demux_meta_by_file = json.load(inf)

    # load list of bams surviving filters
    # NOTE(review): '*' is the join delimiter here — assumes no bam path contains '*'; confirm upstream naming
    bam_fnames = list(os.path.basename(x) for x in '~{sep="*" cleaned_bam_filepaths}'.split('*'))
    # strip a trailing ".bam" and then an optional ".cleaned" suffix to recover the demux key
    bam_fnames = list(x[:-len('.bam')] if x.endswith('.bam') else x for x in bam_fnames)
    bam_fnames = list(x[:-len('.cleaned')] if x.endswith('.cleaned') else x for x in bam_fnames)
    print("bam basenames ({}): {}".format(len(bam_fnames), bam_fnames))
    # map original sample names -> sanitized sample names via demux metadata;
    # bams missing from demux_meta_json collapse onto a single None key, removed below
    sample_to_sanitized = {demux_meta_by_file.get(x, {}).get('sample_original'): demux_meta_by_file.get(x, {}).get('sample') for x in bam_fnames}
    if None in sample_to_sanitized:
      del sample_to_sanitized[None]
    sample_names_seen = sample_to_sanitized.keys()
    print("samples seen ({}): {}".format(len(sample_names_seen), sorted(sample_names_seen)))

    # load biosample metadata, keeping only successfully-registered rows for samples we have bams for
    biosample_attributes = []
    biosample_headers = ['biosample_accession']
    with open('~{biosample_attributes_tsv}', 'rt') as inf:
      for row in csv.DictReader(inf, delimiter='\t'):
        if row['sample_name'] in sample_names_seen and row['message'] == "Successfully loaded":
          row['biosample_accession'] = row.get('accession')
          biosample_attributes.append(row)
          # collect output headers: any column holding a real value in at least one kept row
          for k,v in row.items():
            # treat common NCBI missing-value markers as empty for header selection.
            # note: rebinding v to None only affects which headers are emitted; the
            # original cell value is still written unchanged in the output loop below
            if v.strip().lower() in ('missing', 'na', 'not applicable', 'not collected', ''):
              v = None
            if v and (k not in biosample_headers) and k not in ('message', 'accession'):
              biosample_headers.append(k)
    print("biosample headers ({}): {}".format(len(biosample_headers), biosample_headers))
    print("biosample rows ({})".format(len(biosample_attributes)))
    # report samples that have bams but no successful biosample registration
    samples_seen_without_biosample = set(sample_names_seen) - set(row['sample_name'] for row in biosample_attributes)
    print("samples seen in bams without biosample entries ({}): {}".format(len(samples_seen_without_biosample), sorted(samples_seen_without_biosample)))

    # write reformatted table, keyed on the sanitized Terra entity id
    with open('~{base}.entities.tsv', 'wt') as outf:
      writer = csv.DictWriter(outf, delimiter='\t', fieldnames=["~{sanitized_id_col}"]+biosample_headers, quoting=csv.QUOTE_MINIMAL)
      writer.writeheader()
      for row in biosample_attributes:
        outrow = {h: row[h] for h in biosample_headers}
        outrow["~{sanitized_id_col}"] = sample_to_sanitized[row['sample_name']]
        writer.writerow(outrow)
    CODE
  >>>
  output {
    File sample_meta_tsv = "~{base}.entities.tsv"
  }
  runtime {
    docker: docker
    memory: "1 GB"
    cpu: 1
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}

task biosample_to_genbank {
meta {
description: "Prepares two input metadata files for Genbank submission based on a BioSample registration attributes table (attributes.tsv) since all of the necessary values are there. This produces both a Genbank Source Modifier Table and a BioSample ID map file that can be fed into the prepare_genbank task."
Expand Down
11 changes: 9 additions & 2 deletions pipes/WDL/tasks/tasks_reports.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,7 @@ task align_and_count {
File ref_db
Int topNHits = 3

Boolean filter_bam_to_proper_primary_mapped_reads = false
Boolean filter_bam_to_proper_primary_mapped_reads = true
Boolean do_not_require_proper_mapped_pairs_when_filtering = false
Boolean keep_singletons_when_filtering = false
Boolean keep_duplicates_when_filtering = false
Expand Down Expand Up @@ -456,6 +456,8 @@ task align_and_count {

TOTAL_COUNT_OF_TOP_HIT=$(grep -E "^($TOP_HIT)" "~{reads_basename}.count.~{ref_basename}.txt" | cut -f3 | tee TOTAL_COUNT_OF_TOP_HIT)
TOTAL_COUNT_OF_LESSER_HITS=$((grep -vE "^(\*|$TOP_HIT)" "~{reads_basename}.count.~{ref_basename}.txt" || echo "0" ) | cut -f3 | paste -sd+ - | bc -l | tee TOTAL_COUNT_OF_LESSER_HITS)
echo $TOTAL_COUNT_OF_TOP_HIT | tee TOTAL_COUNT_OF_TOP_HIT
echo $TOTAL_COUNT_OF_LESSER_HITS | tee TOTAL_COUNT_OF_LESSER_HITS

if [ $TOTAL_COUNT_OF_LESSER_HITS -ne 0 -o $TOTAL_COUNT_OF_TOP_HIT -ne 0 ]; then
PCT_MAPPING_TO_LESSER_HITS=$( echo "scale=3; 100 * $TOTAL_COUNT_OF_LESSER_HITS / ($TOTAL_COUNT_OF_LESSER_HITS + $TOTAL_COUNT_OF_TOP_HIT)" | \
Expand All @@ -466,6 +468,7 @@ task align_and_count {
fi

TOTAL_READS_IN_INPUT=$(samtools view -c "~{reads_basename}.bam")
echo $TOTAL_READS_IN_INPUT | tee TOTAL_READS_IN_INPUT
if [ $TOTAL_READS_IN_INPUT -eq 0 ]; then
echo "no reads in input bam"
PCT_OF_INPUT_READS_MAPPED=$(echo "0" | tee "~{reads_basename}.count.~{ref_basename}.pct_total_reads_mapped.txt")
Expand All @@ -480,7 +483,11 @@ task align_and_count {

File report_top_hits = "~{reads_basename}.count.~{ref_basename}.top_~{topNHits}_hits.txt"
String top_hit_id = read_string("~{reads_basename}.count.~{ref_basename}.top.txt")


Int reads_total = read_int("TOTAL_READS_IN_INPUT")
Int reads_mapped_top_hit = read_int("TOTAL_COUNT_OF_TOP_HIT")
Int reads_mapped = read_int("TOTAL_COUNT_OF_LESSER_HITS") + read_int("TOTAL_COUNT_OF_TOP_HIT")

String pct_total_reads_mapped = read_string('~{reads_basename}.count.~{ref_basename}.pct_total_reads_mapped.txt')
String pct_lesser_hits_of_mapped = read_string('~{reads_basename}.count.~{ref_basename}.pct_lesser_hits_of_mapped.txt')

Expand Down
Loading