Skip to content

Commit

Permalink
Merge pull request #56 from getzlab/cram-fixes
Browse files Browse the repository at this point in the history
Cram fixes
  • Loading branch information
agraubert authored Mar 4, 2021
2 parents a458d7c + 254e2ae commit d34c139
Show file tree
Hide file tree
Showing 14 changed files with 403 additions and 139 deletions.
6 changes: 4 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ clean:

.PHONY: test

test: test-version test-single test-chr1 test-downsampled test-legacy test-expected-failures
test: test-version test-single test-chr1 test-downsampled test-legacy test-crams test-expected-failures
echo Tests Complete

.PHONY: test-version
Expand Down Expand Up @@ -112,12 +112,14 @@ test-legacy: rnaseqc
.PHONY: test-crams

test-crams: rnaseqc
touch test_data/chr1.fasta.fai
./rnaseqc test_data/chr1.gtf test_data/chr1.cram .test_output --coverage --fasta test_data/chr1.fasta
python3 test_data/approx_diff.py .test_output/chr1.cram.metrics.tsv test_data/chr1.output/chr1.bam.metrics.tsv -m metrics -c chr1.cram chr1.bam
python3 test_data/approx_diff.py .test_output/chr1.cram.metrics.tsv test_data/chr1.output/chr1.cram.metrics.tsv -m metrics -c chr1.cram chr1.cram_
python3 test_data/approx_diff.py .test_output/chr1.cram.gene_reads.gct test_data/chr1.output/chr1.bam.gene_reads.gct -m tables -c Counts Counts_
python3 test_data/approx_diff.py .test_output/chr1.cram.gene_tpm.gct test_data/chr1.output/chr1.bam.gene_tpm.gct -m tables -c TPM TPM_
python3 test_data/approx_diff.py .test_output/chr1.cram.exon_reads.gct test_data/chr1.output/chr1.bam.exon_reads.gct -m tables -c Counts Counts_
python3 test_data/approx_diff.py .test_output/chr1.cram.gene_fragments.gct test_data/chr1.output/chr1.bam.gene_fragments.gct -m tables -c Fragments Fragments_
python3 test_data/approx_diff.py .test_output/chr1.cram.gc_content.tsv test_data/chr1.output/chr1.cram.gc_content.tsv -m metrics -c Count Count_
sed s/-nan/nan/g .test_output/chr1.cram.coverage.tsv > .test_output/coverage.tsv
python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_mean coverage_mean_
python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_std coverage_std_
Expand Down
2 changes: 1 addition & 1 deletion python/rnaseqc/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.0.2'
__version__ = '0.0.3'
68 changes: 33 additions & 35 deletions python/rnaseqc/legacy_exon_remap.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,45 +19,43 @@ def run(args):
writer.writeheader()
current = None
features = []
with status_bar(numRows) as bar:
for line in reader:
bar.update(bar.current + 1)
gene = '_'.join(line['Name'].split('_')[:-1])
if gene != current:
if current is not None:
ref = gtf.get_gene(current)
for line in status_bar.iter(reader, maximum=numRows):
gene = '_'.join(line['Name'].split('_')[:-1])
if gene != current:
if current is not None:
ref = gtf.get_gene(current)
try:
if len(ref):
ref = ref[0]
except:
pass
exons = {exon.id:exon for transcript in ref.transcripts for exon in transcript.exons}
raw_size = len(exons)
for exon in [exon for exon in exons]:
try:
if len(ref):
ref = ref[0]
if exon.isdigit() and int(exon) <= raw_size:
exons[current+'_'+exon] = exons[exon]
except:
pass
exons = {exon.id:exon for transcript in ref.transcripts for exon in transcript.exons}
raw_size = len(exons)
for exon in [exon for exon in exons]:
try:
if exon.isdigit() and int(exon) <= raw_size:
exons[current+'_'+exon] = exons[exon]
except:
pass
features.sort(
key=lambda feat:(
1 if exons[feat['Name']].length == 1 else 0,
exons[feat['Name']].start_pos,
exons[feat['Name']].end_pos
)
features.sort(
key=lambda feat:(
1 if exons[feat['Name']].length == 1 else 0,
exons[feat['Name']].start_pos,
exons[feat['Name']].end_pos
)
for i in range(len(features)):
parts = features[i]['Name'].split('_')
prefix = '_'.join(parts[:-1])
suffix = parts[-1]
if exons[features[i]['Name']].length == 1:
features[i][reader.fieldnames[-1]] = 0
suffix = str(i)
features[i]['Name'] = prefix+'_'+suffix
writer.writerows(features)
current = gene
features = []
features.append({k:v for k,v in line.items()})
)
for i in range(len(features)):
parts = features[i]['Name'].split('_')
prefix = '_'.join(parts[:-1])
suffix = parts[-1]
if exons[features[i]['Name']].length == 1:
features[i][reader.fieldnames[-1]] = 0
suffix = str(i)
features[i]['Name'] = prefix+'_'+suffix
writer.writerows(features)
current = gene
features = []
features.append({k:v for k,v in line.items()})
if len(features):
ref = gtf.get_gene(current)
try:
Expand Down
14 changes: 11 additions & 3 deletions src/BamReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,16 @@ namespace rnaseqc {
{
// Must uncomment before adding multithreading
// std::lock_guard<SeqlibReader> guard(*this);
bool ok = this->bam.GetNextRecord(read);
if (ok) this->read_count++;
return ok;
try {
bool ok = this->bam.GetNextRecord(read);
if (ok) this->read_count++;
return ok;
}
catch (std::runtime_error &e) {
if (this->user_cram_reference) throw referenceHTSMismatch(std::string("HTSLib was unable to find a suitable reference while decoding a cram: ")+e.what());
throw;
}
return false; // No way to get here

}
}
43 changes: 40 additions & 3 deletions src/BamReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,24 @@
#ifndef BamReader_h
#define BamReader_h

#include "Fasta.h"
#include <stdio.h>
#include <mutex>
#include <string>
#include <set>
#include <SeqLib/BamReader.h>
#include <SeqLib/BamHeader.h>
#include <SeqLib/BamRecord.h>
#include <htslib/cram/cram.h> // I really don't like using unofficial APIs, but not much choice here.

namespace rnaseqc {

struct referenceHTSMismatch : public std::exception {
std::string error;
referenceHTSMismatch(std::string msg) : error(msg) {};
};


class SynchronizedReader {
std::mutex mtx;
protected:
Expand Down Expand Up @@ -44,9 +54,12 @@ namespace rnaseqc {

class SeqlibReader : public SynchronizedReader {
SeqLib::BamReader bam;
std::string reference_path;
std::set<chrom> valid_chroms;
bool user_cram_reference;
public:
SeqlibReader() : bam() {
}

SeqlibReader() : reference_path(), valid_chroms(), user_cram_reference(false) {}

bool next(SeqLib::BamRecord&);

Expand All @@ -55,12 +68,36 @@ namespace rnaseqc {
}

bool open(std::string filepath) {
if (this->reference_path.length()) {
auto htsfile = hts_open(filepath.c_str(), "r");
hts_set_fai_filename(htsfile, this->reference_path.c_str());
if (htsfile->format.format == htsExactFormat::cram) {
this->user_cram_reference = true;
// Cram handling is very dumb. All of this nonsense is just because htslib is incredibly opaque about reference handling
// Even with a user-provided reference, htslib only uses it if the MD5 matches
// So here we load up the file, check if it's a cram, then get a list of chromosomes that htslib decides to use
cram_fd *cram = static_cast<cram_fd*>(htsfile->fp.cram);
if (cram->refs && cram->refs->nref > 0)
for (unsigned int i = 0; i < cram->refs->nref; ++i)
if (this->reference_path == std::string(cram->refs->ref_id[i]->fn))
this->valid_chroms.insert(
chromosomeMap(cram->refs->ref_id[i]->name)
);
this->bam.SetCramReference(this->reference_path); // Consider moving out of if statement, if there's any meaningful use to having a reference set on a non-cram
}
hts_close(htsfile);
}
this->bam.Open(filepath);
return this->bam.IsOpen();
}

void addReference(std::string filepath) {
this->bam.SetCramReference(filepath);
this->reference_path = filepath;
}

inline bool validateChromosome(const chrom c) {
// For crams, we only validate chromosomes which matched our reference. Otherwise yes!
return this->user_cram_reference ? this->valid_chroms.count(c) > 0 : true;
}

};
Expand Down
71 changes: 64 additions & 7 deletions src/Expression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ namespace rnaseqc {

// New version of exon metrics
// More efficient and less buggy
void exonAlignmentMetrics(map<chrom, list<Feature>> &features, Metrics &counter, vector<Feature> &blocks, Alignment &alignment, SeqLib::HeaderSequenceVector &sequenceTable, unsigned int length, Strand orientation, BaseCoverage &baseCoverage, const bool highQuality, const bool singleEnd)
double exonAlignmentMetrics(map<chrom, list<Feature>> &features, Metrics &counter, vector<Feature> &blocks, Alignment &alignment, SeqLib::HeaderSequenceVector &sequenceTable, unsigned int length, Strand orientation, BaseCoverage &baseCoverage, const bool highQuality, const bool singleEnd, map<string, FragmentMateEntry> &fragments, Fasta &fastaReader)
{
string chrName = sequenceTable[alignment.ChrID()].Name;
chrom chr = chromosomeMap(chrName); //generate the chromosome shorthand name
Expand All @@ -316,6 +316,7 @@ namespace rnaseqc {
current.end = alignment.PositionEnd(); //0-based, open == 1-based, closed

vector<set<string> > genes; //each set is the set of genes intersected by the current block (one set per block)
set<string> alignedExons; // Record of all aligned exons (make sure all blocks align to same exon for gc content)
Collector exonCoverageCollector(&exonCounts); //Collects coverage counts for later (counts may be discarded)
bool intragenic = false, transcriptPlus = false, transcriptMinus = false, ribosomal = false, doExonMetrics = false, exonic = false; //various booleans for keeping track of the alignment

Expand Down Expand Up @@ -344,7 +345,7 @@ namespace rnaseqc {
double tmp = static_cast<double>(intersectionSize) / length;
exonCoverageCollector.add(result->gene_id, result->feature_id, tmp);
baseCoverage.add(*result, block->start, block->end); //provisionally add per-base coverage to this gene

alignedExons.insert(result->feature_id);
}

}
Expand Down Expand Up @@ -455,10 +456,30 @@ namespace rnaseqc {
}
}
baseCoverage.reset();
if (fastaReader.hasContig(chr) && highQuality && exonic && doExonMetrics && alignedExons.size() == 1 && blocks.size() == 1 && fabs(alignment.InsertSize()) > 100 && fabs(alignment.InsertSize()) < 1000) {
string exonName = *(alignedExons.begin());
auto fragment = fragments.find(alignment.Qname());
if (fragment == fragments.end()) //first time we've encountered a read in this pair
{
// Record the exon we aligned to and the actual end of the read
fragments[alignment.Qname()] = std::make_tuple(exonName, alignment.PositionEnd());
}
else if (exonName == std::get<EXON>(fragment->second)) //second time we've encountered a read in this pair
{

//Check that we end after the mate ends, and that we aren't aligned to the same start position
if (alignment.PositionEnd() <= std::get<ENDPOS>(fragment->second) || alignment.Position() == alignment.MatePosition()) return -1;
//This pair is useable for fragment statistics: both pairs fully aligned to the same exon
string seq = fastaReader.getSeq(chr, std::get<ENDPOS>(fragment->second) - alignment.Length(), alignment.PositionEnd());
fragments.erase(alignment.Qname());
return seq.length() > 0 ? gc(seq) : -1;
}
}
return -1;
}

// Estimate fragment size in a read pair
unsigned int fragmentSizeMetrics(unsigned int doFragmentSize, map<chrom, list<Feature>> *bedFeatures, map<string, FragmentMateEntry> &fragments, map<long long, unsigned long> &fragmentSizes, vector<Feature> &blocks, Alignment &alignment, SeqLib::HeaderSequenceVector &sequenceTable)
void fragmentSizeMetrics(unsigned int &doFragmentSize, map<chrom, list<Feature>> *bedFeatures, map<string, FragmentMateEntry> &fragments, map<long long, unsigned long> &fragmentSizes, vector<Feature> &blocks, Alignment &alignment, SeqLib::HeaderSequenceVector &sequenceTable)
{
string chrName = sequenceTable[alignment.ChrID()].Name;
chrom chr = chromosomeMap(chrName); //generate the chromosome shorthand referemce
Expand Down Expand Up @@ -502,7 +523,9 @@ namespace rnaseqc {
// 2) The mate must always be +. If both reads are on the - strand, there has been a mapping error or genomic inversion
// 3) The this read ends after the mate does. If this read is contained by the mate, there has been some weird clipping errors with the cDNA adapters
// 4) This read must not start at the same point as the mate. If so, without this check, the pair may be arbitrarily discarded or kept depending on sort order
if (alignment.MateReverseFlag() || !alignment.ReverseFlag() || alignment.PositionEnd() <= std::get<ENDPOS>(fragment->second) || alignment.Position() == alignment.MatePosition()) return doFragmentSize;

//FIXME: Is the above test actually accurate? Cant a + read appear after a - read for reverse strand alignments?
if (alignment.MateReverseFlag() || !alignment.ReverseFlag() || alignment.PositionEnd() <= std::get<ENDPOS>(fragment->second) || alignment.Position() == alignment.MatePosition()) return;
//This pair is useable for fragment statistics: both pairs fully aligned to the same exon
fragmentSizes[abs(alignment.InsertSize())] += 1;
fragments.erase(fragment);
Expand All @@ -514,7 +537,41 @@ namespace rnaseqc {
}
}
}
//return the remaining count of fragment samples to take
return doFragmentSize;
}


/*double gcContent(unsigned int &doFragmentSize, map<chrom, list<Feature>> *bedFeatures, map<string, FragmentMateEntry> &fragments, map<long long, unsigned long> &fragmentSizes, vector<Feature> &blocks, Alignment &alignment, SeqLib::HeaderSequenceVector &sequenceTable, Fasta &fastaReader)
{
string chrName = sequenceTable[alignment.ChrID()].Name;
chrom chr = chromosomeMap(chrName); //generate the chromosome shorthand referemce
bool firstBlock = true, sameExon = true; //for keeping track of the alignment state
string exonName = ""; // the name of the intersected exon from the bed
trimFeatures(alignment, (*bedFeatures)[chr]); //trim out the features to speed up intersections
for (auto block = blocks.begin(); sameExon && block != blocks.end(); ++block)
{
//for each block, intersect it with the bed file features
list<Feature> *results = intersectBlock(*block, (*bedFeatures)[chr]);
if (results->size() == 1 && (partialIntersect(results->front(), *block) == (block->end - block->start))) //if the block intersected more than one exon, it's immediately disqualified
{
if (firstBlock) exonName = results->begin()->feature_id; //record the exon name on the first pass
else if (exonName != results->begin()->feature_id) //ensure the same exon name on subsequent passes
{
sameExon = false;
delete results;
break;
}
}
else sameExon = false;
delete results; //clean up dynamic allocation
firstBlock = false;
}
if (sameExon && exonName.size()) //if all blocks intersected the same exon, take a fragment size sample
{
//both mates in a pair have to intersected the same exon in order for the pair to qualify for the sample
}
//return the remaining count of fragment samples to take
return -1;
}*/
}
7 changes: 3 additions & 4 deletions src/Expression.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

#include "Metrics.h"
#include "BamReader.h"
#include <vector>
#include <set>
#include <iostream>

Expand All @@ -25,13 +24,13 @@ namespace rnaseqc {
void dropFeatures(std::list<Feature>&, BaseCoverage&);

// Definitions for fragment tracking
typedef std::tuple<std::string, coord> FragmentMateEntry; // Used to record mate end point
typedef std::tuple<std::string, coord> FragmentMateEntry; // Used to record mate end point (exon name, read end position)
const std::size_t EXON = 0, ENDPOS = 1;

//Metrics functions
unsigned int fragmentSizeMetrics(unsigned int, std::map<chrom, std::list<Feature>>*, std::map<std::string, FragmentMateEntry>&, std::map<long long, unsigned long>&,std::vector<Feature>&, Alignment&, SeqLib::HeaderSequenceVector&);
void fragmentSizeMetrics(unsigned int&, std::map<chrom, std::list<Feature>>*, std::map<std::string, FragmentMateEntry>&, std::map<long long, unsigned long>&,std::vector<Feature>&, Alignment&, SeqLib::HeaderSequenceVector&);

void exonAlignmentMetrics(std::map<chrom, std::list<Feature>>&, Metrics&, std::vector<Feature>&, Alignment&, SeqLib::HeaderSequenceVector&, unsigned int, Strand, BaseCoverage&, const bool, const bool);
double exonAlignmentMetrics(std::map<chrom, std::list<Feature>>&, Metrics&, std::vector<Feature>&, Alignment&, SeqLib::HeaderSequenceVector&, unsigned int, Strand, BaseCoverage&, const bool, const bool, std::map<std::string, FragmentMateEntry>&, Fasta&);

void legacyExonAlignmentMetrics(unsigned int, std::map<chrom, std::list<Feature>>&, Metrics&, std::vector<Feature>&, Alignment&, SeqLib::HeaderSequenceVector&, unsigned int, Strand, BaseCoverage&, const bool, const bool);

Expand Down
Loading

0 comments on commit d34c139

Please sign in to comment.