Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixing calculation of GC content to be correct. #4608

Merged
merged 1 commit into from
Mar 29, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1037,7 +1037,7 @@ private GencodeFuncotation createUtrFuncotation(final VariantContext variant,
}

// Set GC Content:
gencodeFuncotationBuilder.setGcContent( calculateGcContent( reference, gcContentWindowSizeBases ) );
gencodeFuncotationBuilder.setGcContent( calculateGcContent( variant.getReference(), altAllele, reference, gcContentWindowSizeBases ) );

// Get the strand:
final Strand strand = transcript.getGenomicStrand();
Expand Down Expand Up @@ -1158,7 +1158,7 @@ private GencodeFuncotation createIntronFuncotation(final VariantContext variant,
}

// Set GC Content:
gencodeFuncotationBuilder.setGcContent( calculateGcContent( reference, gcContentWindowSizeBases ) );
gencodeFuncotationBuilder.setGcContent( calculateGcContent( variant.getReference(), altAllele, reference, gcContentWindowSizeBases ) );

// Need to check if we're within the window for splice site variants:
final GencodeGtfExonFeature spliceSiteExon = getExonWithinSpliceSiteWindow(variant, transcript, spliceSiteVariantWindowBases);
Expand Down Expand Up @@ -1354,7 +1354,7 @@ static SequenceComparison createSequenceComparison(final VariantContext variant,
sequenceComparison.setReferenceBases(referenceBases);

// Set our GC content:
sequenceComparison.setGcContent(calculateGcContent(reference, gcContentWindowSizeBases));
sequenceComparison.setGcContent(calculateGcContent(variant.getReference(), altAllele, reference, gcContentWindowSizeBases));

// Get the ref allele:
sequenceComparison.setReferenceAllele(refAllele.getBaseString());
Expand Down Expand Up @@ -1491,21 +1491,47 @@ static SequenceComparison createSequenceComparison(final VariantContext variant,
/**
* Calculates the fraction of Guanine and Cytosine bases in a window of a given size around a variant.
* Note: Since Guanine and Cytosine are complementary bases, strandedness makes no difference.
* @param refAllele Reference {@link Allele} for the locus in question.
* @param altAllele Alternate {@link Allele} for the locus in question.
* @param referenceContext The {@link ReferenceContext} for a variant. Assumed to already be centered on the variant of interest. Must not be {@code null}.
* @param windowSize The number of bases to the left and right of the given {@code variant} to calculate the GC Content. Must be >=1.
* @return The fraction of Guanine and Cytosine bases / total bases in a window of size {@code windowSize} around a variant.
*/
public static double calculateGcContent( final ReferenceContext referenceContext,
public static double calculateGcContent( final Allele refAllele,
final Allele altAllele,
final ReferenceContext referenceContext,
final int windowSize ) {

Utils.nonNull( referenceContext );
ParamUtils.isPositive( windowSize, "Window size must be >= 1." );


final int leadingWindowSize;
final int trailingWindowSize = windowSize;

if ( GATKVariantContextUtils.isInsertion(refAllele, altAllele) ||
GATKVariantContextUtils.isDeletion(refAllele, altAllele)) {
// If we have an insertion, we take 1 less base from the front
// because the insertion happens between two codons.
// The preceding padding base is there as a convenience in VCF files.
// Thus the prior <windowSize> bases will contain this leading padding base.

// If we have a deletion, the convention in VCF files is to include a
// padding base at the front prior to the deleted bases so the alternate
// allele can be non-empty.
// Because of this we subtract 1 from the leading window size.

leadingWindowSize = windowSize - 1;
}
else {
leadingWindowSize = windowSize;
}

// Create a placeholder for the bases:
final byte[] bases;

// Get the bases:
bases = referenceContext.getBases(windowSize, windowSize);
bases = referenceContext.getBases(leadingWindowSize, trailingWindowSize);

// Get the gcCount:
long gcCount = 0;
Expand Down Expand Up @@ -1738,7 +1764,7 @@ private GencodeFuncotation createIgrFuncotation(final VariantContext variant,
final GencodeFuncotationBuilder funcotationBuilder = new GencodeFuncotationBuilder();

// Get GC Content:
funcotationBuilder.setGcContent( calculateGcContent( reference, gcContentWindowSizeBases ) );
funcotationBuilder.setGcContent( calculateGcContent( variant.getReference(), altAllele, reference, gcContentWindowSizeBases ) );

funcotationBuilder.setVariantClassification( GencodeFuncotation.VariantClassification.IGR )
.setRefAllele( variant.getReference() )
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -786,27 +786,45 @@ Object[][] provideDataForTestCalculateGcContent() {
final String contig = "test";

return new Object[][] {
{ referenceHelperForTestCalculateGcContent(seq, contig, 1, 1), 1, 0.0 },
{ referenceHelperForTestCalculateGcContent(seq, contig, 1, 1), 9, 0.0 },
{ referenceHelperForTestCalculateGcContent(seq, contig, 1, 1), 10, 1.0/11.0 },
{ referenceHelperForTestCalculateGcContent(seq, contig,100,100), 1, 1 },
{ referenceHelperForTestCalculateGcContent(seq, contig,100,100), 9, 1 },
{ referenceHelperForTestCalculateGcContent(seq, contig,100,100), 10, 10.0/11.0 },
{ referenceHelperForTestCalculateGcContent(seq, contig, 50, 50), 1, 1.0/3.0 },
{ referenceHelperForTestCalculateGcContent(seq, contig, 50, 50), 9, 9.0/19.0 },
{ referenceHelperForTestCalculateGcContent(seq, contig, 50, 50), 10, 11.0/21.0 },
{ referenceHelperForTestCalculateGcContent(seq, contig, 50, 50), 50, 50.0/100.0 },

{ referenceHelperForTestCalculateGcContent(seq, contig, 1, 5), 1, 0.0 },
{ referenceHelperForTestCalculateGcContent(seq, contig, 1, 5), 9, 4.0 / 14.0 },
{ referenceHelperForTestCalculateGcContent(seq, contig, 1, 5), 10, 5.0/15.0 },
{ referenceHelperForTestCalculateGcContent(seq, contig,95,100), 1, 1 },
{ referenceHelperForTestCalculateGcContent(seq, contig,95,100), 9, 10.0/15.0 },
{ referenceHelperForTestCalculateGcContent(seq, contig,95,100), 10, 10.0/16.0 },
{ referenceHelperForTestCalculateGcContent(seq, contig,50, 55), 1, 6.0/8.0 },
{ referenceHelperForTestCalculateGcContent(seq, contig,50, 55), 9, 10.0/24.0 },
{ referenceHelperForTestCalculateGcContent(seq, contig,50, 55), 10, 11.0/26.0 },
{ referenceHelperForTestCalculateGcContent(seq, contig,50, 55), 50, 50.0/100.0 },
// MNPs:
{ Allele.create("A", true), Allele.create("G"), referenceHelperForTestCalculateGcContent(seq, contig, 1, 1), 1, 0.0 },
{ Allele.create("A", true), Allele.create("G"), referenceHelperForTestCalculateGcContent(seq, contig, 1, 1), 9, 0.0 },
{ Allele.create("A", true), Allele.create("G"), referenceHelperForTestCalculateGcContent(seq, contig, 1, 1), 10, 1.0/11.0 },
{ Allele.create("C", true), Allele.create("G"), referenceHelperForTestCalculateGcContent(seq, contig,100,100), 1, 1 },
{ Allele.create("C", true), Allele.create("G"), referenceHelperForTestCalculateGcContent(seq, contig,100,100), 9, 1 },
{ Allele.create("C", true), Allele.create("G"), referenceHelperForTestCalculateGcContent(seq, contig,100,100), 10, 10.0/11.0 },
{ Allele.create("T", true), Allele.create("G"), referenceHelperForTestCalculateGcContent(seq, contig, 50, 50), 1, 1.0/3.0 },
{ Allele.create("T", true), Allele.create("G"), referenceHelperForTestCalculateGcContent(seq, contig, 50, 50), 9, 9.0/19.0 },
{ Allele.create("T", true), Allele.create("G"), referenceHelperForTestCalculateGcContent(seq, contig, 50, 50), 10, 11.0/21.0 },
{ Allele.create("T", true), Allele.create("G"), referenceHelperForTestCalculateGcContent(seq, contig, 50, 50), 50, 50.0/100.0 },
{ Allele.create("AAAAA", true), Allele.create("GGGGG"), referenceHelperForTestCalculateGcContent(seq, contig, 1, 5), 1, 0.0 },
{ Allele.create("AAAAA", true), Allele.create("GGGGG"), referenceHelperForTestCalculateGcContent(seq, contig, 1, 5), 9, 4.0 / 14.0 },
{ Allele.create("AAAAA", true), Allele.create("GGGGG"), referenceHelperForTestCalculateGcContent(seq, contig, 1, 5), 10, 5.0/15.0 },
{ Allele.create("GCCCCC", true), Allele.create("AGGGGG"), referenceHelperForTestCalculateGcContent(seq, contig,95,100), 1, 1 },
{ Allele.create("GCCCCC", true), Allele.create("AGGGGG"), referenceHelperForTestCalculateGcContent(seq, contig,95,100), 9, 10.0/15.0 },
{ Allele.create("GCCCCC", true), Allele.create("AGGGGG"), referenceHelperForTestCalculateGcContent(seq, contig,95,100), 10, 10.0/16.0 },
{ Allele.create("TGGGGG", true), Allele.create("AAAAAA"), referenceHelperForTestCalculateGcContent(seq, contig,50, 55), 1, 6.0/8.0 },
{ Allele.create("TGGGGG", true), Allele.create("AAAAAA"), referenceHelperForTestCalculateGcContent(seq, contig,50, 55), 9, 10.0/24.0 },
{ Allele.create("TGGGGG", true), Allele.create("AAAAAA"), referenceHelperForTestCalculateGcContent(seq, contig,50, 55), 10, 11.0/26.0 },
{ Allele.create("TGGGGG", true), Allele.create("AAAAAA"), referenceHelperForTestCalculateGcContent(seq, contig,50, 55), 50, 50.0/100.0 },

// Insertions:
{ Allele.create("A", true), Allele.create("AG"), referenceHelperForTestCalculateGcContent(seq, contig, 1, 1), 1, 0.0 },
{ Allele.create("A", true), Allele.create("AGG"), referenceHelperForTestCalculateGcContent(seq, contig, 1, 1), 9, 0.0 },
{ Allele.create("A", true), Allele.create("AGGG"), referenceHelperForTestCalculateGcContent(seq, contig, 1, 1), 10, 1.0/11.0 },
{ Allele.create("C", true), Allele.create("CG"), referenceHelperForTestCalculateGcContent(seq, contig,100,100), 1, 1 },
{ Allele.create("C", true), Allele.create("CGG"), referenceHelperForTestCalculateGcContent(seq, contig,100,100), 9, 1 },
{ Allele.create("C", true), Allele.create("CGGG"), referenceHelperForTestCalculateGcContent(seq, contig,100,100), 10, 1 },
{ Allele.create("T", true), Allele.create("TG"), referenceHelperForTestCalculateGcContent(seq, contig, 50, 50), 1, 1.0/2.0 },
{ Allele.create("T", true), Allele.create("TGG"), referenceHelperForTestCalculateGcContent(seq, contig, 50, 50), 9, 9.0/18.0 },
{ Allele.create("T", true), Allele.create("TGGG"), referenceHelperForTestCalculateGcContent(seq, contig, 50, 50), 10, 10.0/20.0 },
{ Allele.create("T", true), Allele.create("TGGGG"), referenceHelperForTestCalculateGcContent(seq, contig, 50, 50), 49, 49.0/98.0 },

// Deletions:
{ Allele.create("AAAAA", true), Allele.create("A"), referenceHelperForTestCalculateGcContent(seq, contig, 1, 5), 10, 5.0/15.0 },
{ Allele.create("GCCCCC", true), Allele.create("G"), referenceHelperForTestCalculateGcContent(seq, contig,95,100), 10, 10.0/15.0 },
{ Allele.create("TGGGGG", true), Allele.create("T"), referenceHelperForTestCalculateGcContent(seq, contig,50, 55), 10, 10.0/25.0 },
{ Allele.create("TG", true), Allele.create("T"), referenceHelperForTestCalculateGcContent(seq, contig,50, 51), 10, 10.0/21.0 },
};
}

Expand Down Expand Up @@ -1172,9 +1190,11 @@ void testGencodeFuncotationComparatorUnitTest( final Comparator<GencodeFuncotati
}

@Test (dataProvider = "provideDataForTestCalculateGcContent")
void testCalculateGcContent(final ReferenceContext referenceContext,
void testCalculateGcContent(final Allele refAllele,
final Allele altAllele,
final ReferenceContext referenceContext,
final int windowSize,
final double expected) {
Assert.assertEquals( GencodeFuncotationFactory.calculateGcContent( referenceContext, windowSize ), expected, doubleEqualsEpsilon);
Assert.assertEquals( GencodeFuncotationFactory.calculateGcContent( refAllele, altAllele, referenceContext, windowSize ), expected, doubleEqualsEpsilon);
}
}