Merge pull request #56 from getzlab/cram-fixes

Cram fixes
getzlab · Mar 4, 2021 · d34c139 · d34c139
2 parents a458d7c + 254e2ae
commit d34c139
Show file tree

Hide file tree

Showing 14 changed files with 403 additions and 139 deletions.
diff --git a/Makefile b/Makefile
@@ -40,7 +40,7 @@ clean:
 
 .PHONY: test
 
-test: test-version test-single test-chr1 test-downsampled test-legacy test-expected-failures
+test: test-version test-single test-chr1 test-downsampled test-legacy test-crams test-expected-failures
 	echo Tests Complete
 
 .PHONY: test-version
@@ -112,12 +112,14 @@ test-legacy: rnaseqc
 .PHONY: test-crams
 
 test-crams: rnaseqc
+	touch test_data/chr1.fasta.fai
 	./rnaseqc test_data/chr1.gtf test_data/chr1.cram .test_output --coverage --fasta test_data/chr1.fasta
-	python3 test_data/approx_diff.py .test_output/chr1.cram.metrics.tsv test_data/chr1.output/chr1.bam.metrics.tsv -m metrics -c chr1.cram chr1.bam
+	python3 test_data/approx_diff.py .test_output/chr1.cram.metrics.tsv test_data/chr1.output/chr1.cram.metrics.tsv -m metrics -c chr1.cram chr1.cram_
 	python3 test_data/approx_diff.py .test_output/chr1.cram.gene_reads.gct test_data/chr1.output/chr1.bam.gene_reads.gct -m tables -c Counts Counts_
 	python3 test_data/approx_diff.py .test_output/chr1.cram.gene_tpm.gct test_data/chr1.output/chr1.bam.gene_tpm.gct -m tables -c TPM TPM_
 	python3 test_data/approx_diff.py .test_output/chr1.cram.exon_reads.gct test_data/chr1.output/chr1.bam.exon_reads.gct -m tables -c Counts Counts_
 	python3 test_data/approx_diff.py .test_output/chr1.cram.gene_fragments.gct test_data/chr1.output/chr1.bam.gene_fragments.gct -m tables -c Fragments Fragments_
+	python3 test_data/approx_diff.py .test_output/chr1.cram.gc_content.tsv test_data/chr1.output/chr1.cram.gc_content.tsv -m metrics -c Count Count_
 	sed s/-nan/nan/g .test_output/chr1.cram.coverage.tsv > .test_output/coverage.tsv
 	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_mean coverage_mean_
 	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_std coverage_std_

diff --git a/python/rnaseqc/__init__.py b/python/rnaseqc/__init__.py
@@ -1 +1 @@
-__version__ = '0.0.2'
+__version__ = '0.0.3'
diff --git a/python/rnaseqc/legacy_exon_remap.py b/python/rnaseqc/legacy_exon_remap.py
@@ -19,45 +19,43 @@ def run(args):
     writer.writeheader()
     current = None
     features = []
-    with status_bar(numRows) as bar:
-        for line in reader:
-            bar.update(bar.current + 1)
-            gene = '_'.join(line['Name'].split('_')[:-1])
-            if gene != current:
-                if current is not None:
-                    ref = gtf.get_gene(current)
+    for line in status_bar.iter(reader, maximum=numRows):
+        gene = '_'.join(line['Name'].split('_')[:-1])
+        if gene != current:
+            if current is not None:
+                ref = gtf.get_gene(current)
+                try:
+                    if len(ref):
+                        ref = ref[0]
+                except:
+                    pass
+                exons = {exon.id:exon for transcript in ref.transcripts for exon in transcript.exons}
+                raw_size = len(exons)
+                for exon in [exon for exon in exons]:
                     try:
-                        if len(ref):
-                            ref = ref[0]
+                        if exon.isdigit() and int(exon) <= raw_size:
+                            exons[current+'_'+exon] = exons[exon]
                     except:
                         pass
-                    exons = {exon.id:exon for transcript in ref.transcripts for exon in transcript.exons}
-                    raw_size = len(exons)
-                    for exon in [exon for exon in exons]:
-                        try:
-                            if exon.isdigit() and int(exon) <= raw_size:
-                                exons[current+'_'+exon] = exons[exon]
-                        except:
-                            pass
-                    features.sort(
-                        key=lambda feat:(
-                            1 if exons[feat['Name']].length == 1 else 0,
-                            exons[feat['Name']].start_pos,
-                            exons[feat['Name']].end_pos
-                        )
+                features.sort(
+                    key=lambda feat:(
+                        1 if exons[feat['Name']].length == 1 else 0,
+                        exons[feat['Name']].start_pos,
+                        exons[feat['Name']].end_pos
                     )
-                    for i in range(len(features)):
-                        parts = features[i]['Name'].split('_')
-                        prefix = '_'.join(parts[:-1])
-                        suffix = parts[-1]
-                        if exons[features[i]['Name']].length == 1:
-                            features[i][reader.fieldnames[-1]] = 0
-                        suffix = str(i)
-                        features[i]['Name'] = prefix+'_'+suffix
-                    writer.writerows(features)
-                current = gene
-                features = []
-            features.append({k:v for k,v in line.items()})
+                )
+                for i in range(len(features)):
+                    parts = features[i]['Name'].split('_')
+                    prefix = '_'.join(parts[:-1])
+                    suffix = parts[-1]
+                    if exons[features[i]['Name']].length == 1:
+                        features[i][reader.fieldnames[-1]] = 0
+                    suffix = str(i)
+                    features[i]['Name'] = prefix+'_'+suffix
+                writer.writerows(features)
+            current = gene
+            features = []
+        features.append({k:v for k,v in line.items()})
     if len(features):
         ref = gtf.get_gene(current)
         try:

diff --git a/src/BamReader.cpp b/src/BamReader.cpp
@@ -13,8 +13,16 @@ namespace rnaseqc {
     {
         // Must uncomment before adding multithreading
         //    std::lock_guard<SeqlibReader> guard(*this);
-        bool ok = this->bam.GetNextRecord(read);
-        if (ok) this->read_count++;
-        return ok;
+        try {
+            bool ok = this->bam.GetNextRecord(read);
+            if (ok) this->read_count++;
+            return ok;
+        }
+        catch (std::runtime_error &e) {
+            if (this->user_cram_reference) throw referenceHTSMismatch(std::string("HTSLib was unable to find a suitable reference while decoding a cram: ")+e.what());
+            throw;
+        }
+        return false; // No way to get here
+
     }
 }
diff --git a/src/BamReader.h b/src/BamReader.h
@@ -9,14 +9,24 @@
 #ifndef BamReader_h
 #define BamReader_h
 
+#include "Fasta.h"
 #include <stdio.h>
 #include <mutex>
 #include <string>
+#include <set>
 #include <SeqLib/BamReader.h>
 #include <SeqLib/BamHeader.h>
 #include <SeqLib/BamRecord.h>
+#include <htslib/cram/cram.h> // I really don't like using unofficial APIs, but not much choice here.
 
 namespace rnaseqc {
+
+    struct referenceHTSMismatch : public std::exception {
+        std::string error;
+        referenceHTSMismatch(std::string msg) : error(msg) {};
+    };
+
+
     class SynchronizedReader {
         std::mutex mtx;
     protected:
@@ -44,9 +54,12 @@ namespace rnaseqc {
 
     class SeqlibReader : public SynchronizedReader {
         SeqLib::BamReader bam;
+        std::string reference_path;
+        std::set<chrom> valid_chroms;
+        bool user_cram_reference;
     public:
-        SeqlibReader() : bam() {
-        }
+
+        SeqlibReader() : reference_path(), valid_chroms(), user_cram_reference(false) {}
 
         bool next(SeqLib::BamRecord&);
 
@@ -55,12 +68,36 @@ namespace rnaseqc {
         }
 
         bool open(std::string filepath) {
+            if (this->reference_path.length()) {
+                auto htsfile = hts_open(filepath.c_str(), "r");
+                hts_set_fai_filename(htsfile, this->reference_path.c_str());
+                if (htsfile->format.format == htsExactFormat::cram) {
+                    this->user_cram_reference = true;
+                    // Cram handling is very dumb. All of this nonsense is just because htslib is incredibly opaque about reference handling
+                    // Even with a user-provided reference, htslib only uses it if the MD5 matches
+                    // So here we load up the file, check if it's a cram, then get a list of chromosomes that htslib decides to use
+                    cram_fd *cram = static_cast<cram_fd*>(htsfile->fp.cram);
+                    if (cram->refs && cram->refs->nref > 0)
+                        for (unsigned int i = 0; i < cram->refs->nref; ++i)
+                            if (this->reference_path == std::string(cram->refs->ref_id[i]->fn))
+                                this->valid_chroms.insert(
+                                    chromosomeMap(cram->refs->ref_id[i]->name)
+                                );
+                    this->bam.SetCramReference(this->reference_path); // Consider moving out of if statement, if there's any meaningful use to having a reference set on a non-cram
+                }
+                hts_close(htsfile);
+            }
             this->bam.Open(filepath);
             return this->bam.IsOpen();
         }
 
         void addReference(std::string filepath) {
-            this->bam.SetCramReference(filepath);
+            this->reference_path = filepath;
+        }
+
+        inline bool validateChromosome(const chrom c) {
+            // For crams, we only validate chromosomes which matched our reference. Otherwise yes!
+            return this->user_cram_reference ? this->valid_chroms.count(c) > 0 : true;
         }
 
     };

diff --git a/src/Expression.cpp b/src/Expression.cpp
@@ -305,7 +305,7 @@ namespace rnaseqc {
 
     // New version of exon metrics
     // More efficient and less buggy
-    void exonAlignmentMetrics(map<chrom, list<Feature>> &features, Metrics &counter, vector<Feature> &blocks, Alignment &alignment, SeqLib::HeaderSequenceVector &sequenceTable, unsigned int length, Strand orientation, BaseCoverage &baseCoverage, const bool highQuality, const bool singleEnd)
+    double exonAlignmentMetrics(map<chrom, list<Feature>> &features, Metrics &counter, vector<Feature> &blocks, Alignment &alignment, SeqLib::HeaderSequenceVector &sequenceTable, unsigned int length, Strand orientation, BaseCoverage &baseCoverage, const bool highQuality, const bool singleEnd, map<string, FragmentMateEntry> &fragments, Fasta &fastaReader)
     {
         string chrName = sequenceTable[alignment.ChrID()].Name;
         chrom chr = chromosomeMap(chrName); //generate the chromosome shorthand name
@@ -316,6 +316,7 @@ namespace rnaseqc {
         current.end = alignment.PositionEnd(); //0-based, open == 1-based, closed
 
         vector<set<string> > genes; //each set is the set of genes intersected by the current block (one set per block)
+        set<string> alignedExons; // Record of all aligned exons (make sure all blocks align to same exon for gc content)
         Collector exonCoverageCollector(&exonCounts); //Collects coverage counts for later (counts may be discarded)
         bool intragenic = false, transcriptPlus = false, transcriptMinus = false, ribosomal = false, doExonMetrics = false, exonic = false; //various booleans for keeping track of the alignment
 
@@ -344,7 +345,7 @@ namespace rnaseqc {
                         double tmp = static_cast<double>(intersectionSize) / length;
                         exonCoverageCollector.add(result->gene_id, result->feature_id, tmp);
                         baseCoverage.add(*result, block->start, block->end); //provisionally add per-base coverage to this gene
-
+                        alignedExons.insert(result->feature_id);
                     }
 
                 }
@@ -455,10 +456,30 @@ namespace rnaseqc {
             }
         }
         baseCoverage.reset();
+        if (fastaReader.hasContig(chr) && highQuality && exonic && doExonMetrics && alignedExons.size() == 1 && blocks.size() == 1 && fabs(alignment.InsertSize()) > 100 && fabs(alignment.InsertSize()) < 1000) {
+            string exonName = *(alignedExons.begin());
+            auto fragment = fragments.find(alignment.Qname());
+            if (fragment == fragments.end()) //first time we've encountered a read in this pair
+            {
+                // Record the exon we aligned to and the actual end of the read
+                fragments[alignment.Qname()] = std::make_tuple(exonName, alignment.PositionEnd());
+            }
+            else if (exonName == std::get<EXON>(fragment->second)) //second time we've encountered a read in this pair
+            {
+
+                //Check that we end after the mate ends, and that we aren't aligned to the same start position
+                if (alignment.PositionEnd() <= std::get<ENDPOS>(fragment->second) || alignment.Position() == alignment.MatePosition()) return -1;
+                //This pair is useable for fragment statistics:  both pairs fully aligned to the same exon
+                string seq = fastaReader.getSeq(chr, std::get<ENDPOS>(fragment->second) - alignment.Length(), alignment.PositionEnd());
+                fragments.erase(alignment.Qname());
+                return seq.length() > 0 ? gc(seq) : -1;
+            }
+        }
+        return -1;
     }
-    
+
     // Estimate fragment size in a read pair
-    unsigned int fragmentSizeMetrics(unsigned int doFragmentSize, map<chrom, list<Feature>> *bedFeatures, map<string, FragmentMateEntry> &fragments, map<long long, unsigned long> &fragmentSizes, vector<Feature> &blocks, Alignment &alignment, SeqLib::HeaderSequenceVector &sequenceTable)
+    void fragmentSizeMetrics(unsigned int &doFragmentSize, map<chrom, list<Feature>> *bedFeatures, map<string, FragmentMateEntry> &fragments, map<long long, unsigned long> &fragmentSizes, vector<Feature> &blocks, Alignment &alignment, SeqLib::HeaderSequenceVector &sequenceTable)
     {
         string chrName = sequenceTable[alignment.ChrID()].Name;
         chrom chr = chromosomeMap(chrName); //generate the chromosome shorthand referemce
@@ -502,7 +523,9 @@ namespace rnaseqc {
                 // 2) The mate must always be +. If both reads are on the - strand, there has been a mapping error or genomic inversion
                 // 3) The this read ends after the mate does. If this read is contained by the mate, there has been some weird clipping errors with the cDNA adapters
                 // 4) This read must not start at the same point as the mate. If so, without this check, the pair may be arbitrarily discarded or kept depending on sort order
-                if (alignment.MateReverseFlag() || !alignment.ReverseFlag() || alignment.PositionEnd() <= std::get<ENDPOS>(fragment->second)  || alignment.Position() == alignment.MatePosition()) return doFragmentSize;
+
+                //FIXME: Is the above test actually accurate? Cant a + read appear after a - read for reverse strand alignments?
+                if (alignment.MateReverseFlag() || !alignment.ReverseFlag() || alignment.PositionEnd() <= std::get<ENDPOS>(fragment->second)  || alignment.Position() == alignment.MatePosition()) return;
                 //This pair is useable for fragment statistics:  both pairs fully aligned to the same exon
                 fragmentSizes[abs(alignment.InsertSize())] += 1;
                 fragments.erase(fragment);
@@ -514,7 +537,41 @@ namespace rnaseqc {
                 }
             }
         }
-        //return the remaining count of fragment samples to take
-        return doFragmentSize;
     }
+
+
+    /*double gcContent(unsigned int &doFragmentSize, map<chrom, list<Feature>> *bedFeatures, map<string, FragmentMateEntry> &fragments, map<long long, unsigned long> &fragmentSizes, vector<Feature> &blocks, Alignment &alignment, SeqLib::HeaderSequenceVector &sequenceTable, Fasta &fastaReader)
+    {
+        string chrName = sequenceTable[alignment.ChrID()].Name;
+        chrom chr = chromosomeMap(chrName); //generate the chromosome shorthand referemce
+        bool firstBlock = true, sameExon = true; //for keeping track of the alignment state
+        string exonName = ""; // the name of the intersected exon from the bed
+        
+        trimFeatures(alignment, (*bedFeatures)[chr]); //trim out the features to speed up intersections
+        for (auto block = blocks.begin(); sameExon && block != blocks.end(); ++block)
+        {
+            //for each block, intersect it with the bed file features
+            list<Feature> *results = intersectBlock(*block, (*bedFeatures)[chr]);
+            if (results->size() == 1 && (partialIntersect(results->front(), *block) == (block->end - block->start))) //if the block intersected more than one exon, it's immediately disqualified
+            {
+                if (firstBlock) exonName = results->begin()->feature_id; //record the exon name on the first pass
+                else if (exonName != results->begin()->feature_id) //ensure the same exon name on subsequent passes
+                {
+                    sameExon = false;
+                    delete results;
+                    break;
+                }
+            }
+            else sameExon = false;
+            delete results; //clean up dynamic allocation
+            firstBlock = false;
+        }
+        if (sameExon && exonName.size()) //if all blocks intersected the same exon, take a fragment size sample
+        {
+            //both mates in a pair have to intersected the same exon in order for the pair to qualify for the sample
+            
+        }
+        //return the remaining count of fragment samples to take
+        return -1;
+    }*/
 }
diff --git a/src/Expression.h b/src/Expression.h
@@ -11,7 +11,6 @@
 
 #include "Metrics.h"
 #include "BamReader.h"
-#include <vector>
 #include <set>
 #include <iostream>
 
@@ -25,13 +24,13 @@ namespace rnaseqc {
     void dropFeatures(std::list<Feature>&, BaseCoverage&);
 
     // Definitions for fragment tracking
-    typedef std::tuple<std::string, coord> FragmentMateEntry; // Used to record mate end point
+    typedef std::tuple<std::string, coord> FragmentMateEntry; // Used to record mate end point (exon name, read end position)
     const std::size_t EXON = 0, ENDPOS = 1;
 
     //Metrics functions
-    unsigned int fragmentSizeMetrics(unsigned int, std::map<chrom, std::list<Feature>>*, std::map<std::string, FragmentMateEntry>&, std::map<long long, unsigned long>&,std::vector<Feature>&, Alignment&, SeqLib::HeaderSequenceVector&);
+    void fragmentSizeMetrics(unsigned int&, std::map<chrom, std::list<Feature>>*, std::map<std::string, FragmentMateEntry>&, std::map<long long, unsigned long>&,std::vector<Feature>&, Alignment&, SeqLib::HeaderSequenceVector&);
 
-    void exonAlignmentMetrics(std::map<chrom, std::list<Feature>>&, Metrics&, std::vector<Feature>&, Alignment&, SeqLib::HeaderSequenceVector&, unsigned int, Strand, BaseCoverage&, const bool, const bool);
+    double exonAlignmentMetrics(std::map<chrom, std::list<Feature>>&, Metrics&, std::vector<Feature>&, Alignment&, SeqLib::HeaderSequenceVector&, unsigned int, Strand, BaseCoverage&, const bool, const bool, std::map<std::string, FragmentMateEntry>&, Fasta&);
 
     void legacyExonAlignmentMetrics(unsigned int, std::map<chrom, std::list<Feature>>&, Metrics&, std::vector<Feature>&, Alignment&, SeqLib::HeaderSequenceVector&, unsigned int, Strand, BaseCoverage&, const bool, const bool);