Merge pull request #551 from broadinstitute/dp-demux
demux_deplete: make depletion optional, load biosample metadata to tables
dpark01 authored Aug 21, 2024
2 parents bb3cbe6 + 1d747a4 commit 5623df0
Showing 6 changed files with 247 additions and 221 deletions.
95 changes: 91 additions & 4 deletions pipes/WDL/tasks/tasks_ncbi.wdl
@@ -479,12 +479,15 @@ task sra_meta_prep {
 lib_to_bams = {}
 sample_to_biosample = {}
 for bam in bam_uris:
-    # filename must be <libraryname>.<flowcell>.<lane>.cleaned.bam
-    assert bam.endswith('.cleaned.bam'), "filename does not end in .cleaned.bam: {}".format(bam)
+    # filename must be <libraryname>.<flowcell>.<lane>.cleaned.bam or <libraryname>.<flowcell>.<lane>.bam
     bam_base = os.path.basename(bam)
     bam_parts = bam_base.split('.')
-    assert len(bam_parts) >= 5, "filename does not conform to <libraryname>.<flowcell>.<lane>.cleaned.bam -- {}".format(bam_base)
-    lib = '.'.join(bam_parts[:-4])
+    assert bam_parts[-1] == 'bam', "filename does not end in .bam -- {}".format(bam)
+    bam_parts = bam_parts[:-1]
+    if bam_parts[-1] == 'cleaned':
+        bam_parts = bam_parts[:-1]
+    assert len(bam_parts) >= 3, "filename does not conform to <libraryname>.<flowcell>.<lane>.cleaned.bam -- {}".format(bam_base)
+    lib = '.'.join(bam_parts[:-2]) # drop flowcell and lane
     lib_to_bams.setdefault(lib, [])
     lib_to_bams[lib].append(bam_base)
     print("debug: registering lib={} bam={}".format(lib, bam_base))
@@ -554,6 +557,90 @@ task sra_meta_prep {
}
}
task biosample_to_table {
meta {
description: "Reformats a BioSample registration attributes table (attributes.tsv) for ingest into a Terra table."
}
input {
File biosample_attributes_tsv
Array[File] cleaned_bam_filepaths
File demux_meta_json
String sample_table_name = "sample"
String docker = "python:slim"
}
String sanitized_id_col = "entity:~{sample_table_name}_id"
String base = basename(basename(biosample_attributes_tsv, ".txt"), ".tsv")
parameter_meta {
cleaned_bam_filepaths: {
description: "Unaligned bam files containing cleaned (submittable) reads.",
localization_optional: true,
stream: true,
patterns: ["*.bam"]
}
}
command <<<
set -ex -o pipefail
python3 << CODE
import os.path
import csv
import json
# load demux metadata
with open("~{demux_meta_json}", 'rt') as inf:
    demux_meta_by_file = json.load(inf)
# load list of bams surviving filters
bam_fnames = list(os.path.basename(x) for x in '~{sep="*" cleaned_bam_filepaths}'.split('*'))
bam_fnames = list(x[:-len('.bam')] if x.endswith('.bam') else x for x in bam_fnames)
bam_fnames = list(x[:-len('.cleaned')] if x.endswith('.cleaned') else x for x in bam_fnames)
print("bam basenames ({}): {}".format(len(bam_fnames), bam_fnames))
sample_to_sanitized = {demux_meta_by_file.get(x, {}).get('sample_original'): demux_meta_by_file.get(x, {}).get('sample') for x in bam_fnames}
if None in sample_to_sanitized:
    del sample_to_sanitized[None]
sample_names_seen = sample_to_sanitized.keys()
print("samples seen ({}): {}".format(len(sample_names_seen), sorted(sample_names_seen)))
# load biosample metadata
biosample_attributes = []
biosample_headers = ['biosample_accession']
with open('~{biosample_attributes_tsv}', 'rt') as inf:
    for row in csv.DictReader(inf, delimiter='\t'):
        if row['sample_name'] in sample_names_seen and row['message'] == "Successfully loaded":
            row['biosample_accession'] = row.get('accession')
            biosample_attributes.append(row)
            for k,v in row.items():
                if v.strip().lower() in ('missing', 'na', 'not applicable', 'not collected', ''):
                    v = None
                if v and (k not in biosample_headers) and k not in ('message', 'accession'):
                    biosample_headers.append(k)
print("biosample headers ({}): {}".format(len(biosample_headers), biosample_headers))
print("biosample rows ({})".format(len(biosample_attributes)))
samples_seen_without_biosample = set(sample_names_seen) - set(row['sample_name'] for row in biosample_attributes)
print("samples seen in bams without biosample entries ({}): {}".format(len(samples_seen_without_biosample), sorted(samples_seen_without_biosample)))
# write reformatted table
with open('~{base}.entities.tsv', 'wt') as outf:
    writer = csv.DictWriter(outf, delimiter='\t', fieldnames=["~{sanitized_id_col}"]+biosample_headers, quoting=csv.QUOTE_MINIMAL)
    writer.writeheader()
    for row in biosample_attributes:
        outrow = {h: row[h] for h in biosample_headers}
        outrow["~{sanitized_id_col}"] = sample_to_sanitized[row['sample_name']]
        writer.writerow(outrow)
CODE
>>>
output {
File sample_meta_tsv = "~{base}.entities.tsv"
}
runtime {
docker: docker
memory: "1 GB"
cpu: 1
dx_instance_type: "mem1_ssd1_v2_x2"
maxRetries: 2
}
}
task biosample_to_genbank {
meta {
description: "Prepares two input metadata files for Genbank submission based on a BioSample registration attributes table (attributes.tsv) since all of the necessary values are there. This produces both a Genbank Source Modifier Table and a BioSample ID map file that can be fed into the prepare_genbank task."
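In short, the new biosample_to_table task keeps only attributes.tsv rows that were "Successfully loaded" and that correspond to a sample present in the cleaned bams, then writes them keyed by the sanitized sample ID for Terra ingest. A small self-contained sketch of that reshaping on hypothetical data (the sample names, accessions, and columns below are made up for illustration):

import csv, io

# hypothetical demux metadata: original sample name -> sanitized Terra sample id
demux_meta = {
    "SAMPLE-01.l1.AHXYZ7.1": {"sample_original": "Sample 01", "sample": "Sample-01"},
}
sample_to_sanitized = {v["sample_original"]: v["sample"] for v in demux_meta.values()}

# hypothetical BioSample attributes.tsv content
attributes_tsv = (
    "accession\tsample_name\tmessage\tcollection_date\tgeo_loc_name\n"
    "SAMN00000001\tSample 01\tSuccessfully loaded\t2024-01-15\tUSA\n"
    "SAMN00000002\tSample 99\tSuccessfully loaded\t2024-01-16\tUSA\n"  # no bam for this sample -> dropped
)

biosample_headers = ["biosample_accession"]
biosample_attributes = []
for row in csv.DictReader(io.StringIO(attributes_tsv), delimiter='\t'):
    if row["sample_name"] in sample_to_sanitized and row["message"] == "Successfully loaded":
        row["biosample_accession"] = row["accession"]
        biosample_attributes.append(row)
        for k, v in row.items():
            if v and k not in biosample_headers and k not in ("message", "accession"):
                biosample_headers.append(k)

out = io.StringIO()
writer = csv.DictWriter(out, delimiter='\t',
                        fieldnames=["entity:sample_id"] + biosample_headers,
                        quoting=csv.QUOTE_MINIMAL)
writer.writeheader()
for row in biosample_attributes:
    outrow = {h: row[h] for h in biosample_headers}
    outrow["entity:sample_id"] = sample_to_sanitized[row["sample_name"]]
    writer.writerow(outrow)
print(out.getvalue())
# tab-separated output:
# entity:sample_id  biosample_accession  sample_name  collection_date  geo_loc_name
# Sample-01         SAMN00000001         Sample 01    2024-01-15       USA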
11 changes: 9 additions & 2 deletions pipes/WDL/tasks/tasks_reports.wdl
@@ -393,7 +393,7 @@ task align_and_count {
     File ref_db
     Int topNHits = 3
-    Boolean filter_bam_to_proper_primary_mapped_reads = false
+    Boolean filter_bam_to_proper_primary_mapped_reads = true
     Boolean do_not_require_proper_mapped_pairs_when_filtering = false
     Boolean keep_singletons_when_filtering = false
     Boolean keep_duplicates_when_filtering = false
@@ -456,6 +458,8 @@ task align_and_count {
TOTAL_COUNT_OF_TOP_HIT=$(grep -E "^($TOP_HIT)" "~{reads_basename}.count.~{ref_basename}.txt" | cut -f3 | tee TOTAL_COUNT_OF_TOP_HIT)
TOTAL_COUNT_OF_LESSER_HITS=$((grep -vE "^(\*|$TOP_HIT)" "~{reads_basename}.count.~{ref_basename}.txt" || echo "0" ) | cut -f3 | paste -sd+ - | bc -l | tee TOTAL_COUNT_OF_LESSER_HITS)
echo $TOTAL_COUNT_OF_TOP_HIT | tee TOTAL_COUNT_OF_TOP_HIT
echo $TOTAL_COUNT_OF_LESSER_HITS | tee TOTAL_COUNT_OF_LESSER_HITS
if [ $TOTAL_COUNT_OF_LESSER_HITS -ne 0 -o $TOTAL_COUNT_OF_TOP_HIT -ne 0 ]; then
PCT_MAPPING_TO_LESSER_HITS=$( echo "scale=3; 100 * $TOTAL_COUNT_OF_LESSER_HITS / ($TOTAL_COUNT_OF_LESSER_HITS + $TOTAL_COUNT_OF_TOP_HIT)" | \
@@ -466,6 +468,7 @@ task align_and_count {
fi
TOTAL_READS_IN_INPUT=$(samtools view -c "~{reads_basename}.bam")
echo $TOTAL_READS_IN_INPUT | tee TOTAL_READS_IN_INPUT
if [ $TOTAL_READS_IN_INPUT -eq 0 ]; then
echo "no reads in input bam"
PCT_OF_INPUT_READS_MAPPED=$(echo "0" | tee "~{reads_basename}.count.~{ref_basename}.pct_total_reads_mapped.txt")
@@ -480,7 +483,11 @@
File report_top_hits = "~{reads_basename}.count.~{ref_basename}.top_~{topNHits}_hits.txt"
String top_hit_id = read_string("~{reads_basename}.count.~{ref_basename}.top.txt")
Int reads_total = read_int("TOTAL_READS_IN_INPUT")
Int reads_mapped_top_hit = read_int("TOTAL_COUNT_OF_TOP_HIT")
Int reads_mapped = read_int("TOTAL_COUNT_OF_LESSER_HITS") + read_int("TOTAL_COUNT_OF_TOP_HIT")
String pct_total_reads_mapped = read_string('~{reads_basename}.count.~{ref_basename}.pct_total_reads_mapped.txt')
String pct_lesser_hits_of_mapped = read_string('~{reads_basename}.count.~{ref_basename}.pct_lesser_hits_of_mapped.txt')
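The output block above exposes the top-hit read count, a combined mapped-read count, and two percentages derived from the same totals. A hedged sketch with made-up counts showing how those numbers relate to one another; the task itself does this arithmetic in bash, and the formulas here are inferred from the visible bc call and the output names:

# Made-up counts, illustrating how the summary numbers relate.
top_hit_reads = 9800       # TOTAL_COUNT_OF_TOP_HIT
lesser_hit_reads = 200     # TOTAL_COUNT_OF_LESSER_HITS
total_input_reads = 20000  # TOTAL_READS_IN_INPUT

reads_mapped = top_hit_reads + lesser_hit_reads
pct_lesser_hits_of_mapped = 100.0 * lesser_hit_reads / reads_mapped
pct_total_reads_mapped = 100.0 * reads_mapped / total_input_reads

print(reads_mapped)                         # 10000
print(round(pct_lesser_hits_of_mapped, 3))  # 2.0
print(round(pct_total_reads_mapped, 3))     # 50.0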