Skip to content

Commit

Permalink
Change raw MQ to a tuple of (sumSquaredMQs, totalDepth) for better ac…
Browse files Browse the repository at this point in the history
…curacy where there are lots of uninformative reads
  • Loading branch information
ldgauthier committed Aug 24, 2018
1 parent 401d217 commit c3c0ed5
Show file tree
Hide file tree
Showing 26 changed files with 36,298 additions and 623 deletions.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ public final class GATKVCFConstants {
public static final String ALLELE_SPECIFIC_PREFIX = "AS_";
public static final String AS_FILTER_STATUS_KEY = "AS_FilterStatus";
public static final String RAW_RMS_MAPPING_QUALITY_KEY = "RAW_MQ";
public static final String RAW_MAPPING_QUALITY_WITH_DEPTH_KEY = "RAW_MQandDP";
public static final String AS_RMS_MAPPING_QUALITY_KEY = "AS_MQ";
public static final String AS_RAW_RMS_MAPPING_QUALITY_KEY = "AS_RAW_MQ";
public static final String AS_CULPRIT_KEY = "AS_culprit";
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.broadinstitute.hellbender.utils.variant;

import htsjdk.variant.vcf.*;
import org.broadinstitute.hellbender.tools.walkers.annotator.RMSMappingQuality;
import org.broadinstitute.hellbender.utils.Utils;

import java.util.*;
Expand Down Expand Up @@ -125,7 +126,8 @@ private static void addFilterLine(final VCFFilterHeaderLine line) {
addInfoLine(new VCFInfoHeaderLine(LIKELIHOOD_RANK_SUM_KEY, 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref haplotype likelihoods"));
addInfoLine(new VCFInfoHeaderLine(MAP_QUAL_RANK_SUM_KEY, 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities"));
addInfoLine(new VCFInfoHeaderLine(AS_MAP_QUAL_RANK_SUM_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "allele specific Z-score From Wilcoxon rank sum test of each Alt vs. Ref read mapping qualities"));
addInfoLine(new VCFInfoHeaderLine(RAW_RMS_MAPPING_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Raw data for RMS Mapping Quality"));
addInfoLine(new VCFInfoHeaderLine(RAW_RMS_MAPPING_QUALITY_KEY, 2, VCFHeaderLineType.Integer, "Raw data for RMS Mapping Quality (deprecated -- use " + RAW_MAPPING_QUALITY_WITH_DEPTH_KEY + " instead."));
addInfoLine(new VCFInfoHeaderLine(RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, 2, VCFHeaderLineType.Integer, "Raw data (sum of squared MQ and total depth) for improved RMS Mapping Quality calculation. Incompatible with deprecated " + RMSMappingQuality.getDeprecatedRawKeyName() + " formulation."));
addInfoLine(new VCFInfoHeaderLine(AS_RAW_RMS_MAPPING_QUALITY_KEY, 1, VCFHeaderLineType.String, "Allele-specfic raw data for RMS Mapping Quality"));
addInfoLine(new VCFInfoHeaderLine(AS_RMS_MAPPING_QUALITY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele-specific RMS Mapping Quality"));
addInfoLine(new VCFInfoHeaderLine(AS_RAW_MAP_QUAL_RANK_SUM_KEY, 1, VCFHeaderLineType.String, "Allele-specfic raw data for Mapping Quality Rank Sum"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,11 @@ public final class GenomicsDBImportIntegrationTest extends CommandLineProgramTes
private static final String SAMPLE_NAME_KEY = "SN";
private static final String ANOTHER_ATTRIBUTE_KEY = "AA";

private static final List<String> GVCFS_WITH_NEW_MQ = Arrays.asList(toolsTestDir + "/haplotypecaller/expected.testGVCFMode.gatk4.g.vcf", getTestDataDir() + "/walkers/CombineGVCFs/YRIoffspring.chr20snippet.g.vcf");
private static final String COMBINED_WITH_NEW_MQ = toolsTestDir + "/walkers/CombineGVCFs/newMQcalc.combined.g.vcf";
private static final List<SimpleInterval> INTERVAL2 = Arrays.asList(new SimpleInterval("20", 1, 11_000_000));
private static final List<String> ATTRIBUTES_TO_IGNORE = Arrays.asList("RAW_MQ","RAW_MQandDP"); //CombineGVCFs doesn't support the old RAW_MQ anymore

@Override
public String getTestedClassName() {
return GenomicsDBImport.class.getSimpleName();
Expand All @@ -129,6 +134,11 @@ public void testGenomicsDBImportFileInputs() throws IOException {
testGenomicsDBImporter(LOCAL_GVCFS, INTERVAL, COMBINED, b38_reference_20_21, true);
}

@Test
public void testGenomicsDBImportFileInputs_newMQ() throws IOException {
testGenomicsDBImporter_newMQ(GVCFS_WITH_NEW_MQ, INTERVAL2, COMBINED_WITH_NEW_MQ, b37_reference_20_21, true, Collections.emptyList());
}

@Test
public void testGenomicsDBImportFileInputsWithMultipleIntervals() throws IOException {
testGenomicsDBImporter(LOCAL_GVCFS, MULTIPLE_INTERVALS, COMBINED_MULTI_INTERVAL, b38_reference_20_21, true);
Expand Down Expand Up @@ -232,7 +242,7 @@ private void testGenomicsDBImporterWithGenotypes(final List<String> vcfInputs, f

writeToGenomicsDB(vcfInputs, intervals, workspace, 0, false, 0, 1);
checkJSONFilesAreWritten(workspace);
checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, referenceFile, testAll, produceGTField, sitesOnlyQuery);
checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, referenceFile, testAll, ATTRIBUTES_TO_IGNORE, produceGTField, sitesOnlyQuery);
}

private File runCombineGVCFs(final List<String> inputs, final List<SimpleInterval> intervals, final String reference, final String[] extraArgs) {
Expand Down Expand Up @@ -261,7 +271,7 @@ private void testGenomicsDBAgainstCombineGVCFs(final List<String> vcfInputs, fin
for(SimpleInterval currInterval : intervals) {
List<SimpleInterval> tmpList = new ArrayList<SimpleInterval>(Arrays.asList(currInterval));
File expectedCombinedVCF = runCombineGVCFs(vcfInputs, tmpList, referenceFile, CombineGVCFArgs);
checkGenomicsDBAgainstExpected(workspace, tmpList, expectedCombinedVCF.getAbsolutePath(), referenceFile, true);
checkGenomicsDBAgainstExpected(workspace, tmpList, expectedCombinedVCF.getAbsolutePath(), referenceFile, true, ATTRIBUTES_TO_IGNORE);
}
}

Expand All @@ -278,7 +288,7 @@ public void testGenomicsDBAbsolutePathDepndency() throws IOException {
writeToGenomicsDB(LOCAL_GVCFS, INTERVAL, workspace.getAbsolutePath() + "/workspace", 0, false, 0, 1);
checkJSONFilesAreWritten(workspace.getAbsolutePath() + "/workspace");
Files.move(workspace.toPath(), workspace2.toPath(), StandardCopyOption.REPLACE_EXISTING);
checkGenomicsDBAgainstExpected(workspace2.getAbsolutePath() + "/workspace", INTERVAL, COMBINED, b38_reference_20_21, true);
checkGenomicsDBAgainstExpected(workspace2.getAbsolutePath() + "/workspace", INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE);
}

@Test (enabled = true)
Expand Down Expand Up @@ -333,7 +343,7 @@ public void testDifferentThreadValuesFromABucket(final int threads) throws IOExc

writeToGenomicsDB(vcfInputs, INTERVAL, workspace, 0, false, 0, threads);
checkJSONFilesAreWritten(workspace);
checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true);
checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE);
}

@Test(dataProvider = "getThreads")
Expand All @@ -343,7 +353,7 @@ public void testDifferentThreadValuesLocally(final int threads) throws IOExcepti

writeToGenomicsDB(vcfInputs, INTERVAL, workspace, 0, false, 0, threads);
checkJSONFilesAreWritten(workspace);
checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true);
checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE);
}
/**
*
Expand All @@ -359,10 +369,24 @@ private void testGenomicsDBImporter(final List<String> vcfInputs, final List<Sim
final String expectedCombinedVCF, final String referenceFile,
final boolean testAll) throws IOException {
final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace";
writeToGenomicsDB(vcfInputs, intervals, workspace, 0, false, 0, 1);

checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, referenceFile, testAll, ATTRIBUTES_TO_IGNORE);
}

private void testGenomicsDBImporter_newMQ(final List<String> vcfInputs, final List<SimpleInterval> intervals,
final String expectedCombinedVCF, final String referenceFile,
final boolean testAll, final List<String> attributesToIgnore) throws IOException {
final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace";

writeToGenomicsDB(vcfInputs, intervals, workspace, 0, false, 0, 1);
checkJSONFilesAreWritten(workspace);
checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, referenceFile, testAll);
//FIXME: copy in the extra files until Protobuf update in PR #4645 is ready
//this updated json specifies the combine operation for the new RAW_MQandDP key; behavior for old version is hard-coded into GDB
final String vidmapPath = toolsTestDir +"/walkers/GenotypeGVCFs/vidmap.updated.json";
Runtime.getRuntime().exec("cp " + new File(vidmapPath).getAbsolutePath() +" "+ workspace + "/vidmap.json");

checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, referenceFile, testAll, attributesToIgnore);
}

private void testGenomicsDBImporterWithBatchSize(final List<String> vcfInputs, final List<SimpleInterval> intervals,
Expand All @@ -371,7 +395,7 @@ private void testGenomicsDBImporterWithBatchSize(final List<String> vcfInputs, f

writeToGenomicsDB(vcfInputs, intervals, workspace, batchSize, false, 0, 1);
checkJSONFilesAreWritten(workspace);
checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, b38_reference_20_21, true);
checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE);
}

private void testGenomicsDBImportWithZeroBufferSize(final List<String> vcfInputs, final List<SimpleInterval> intervals,
Expand All @@ -380,7 +404,7 @@ private void testGenomicsDBImportWithZeroBufferSize(final List<String> vcfInputs

writeToGenomicsDB(vcfInputs, intervals, workspace, 0, true, 0, 1);
checkJSONFilesAreWritten(workspace);
checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, b38_reference_20_21, true);
checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE);

}

Expand All @@ -406,17 +430,19 @@ private static void checkJSONFilesAreWritten(final String workspace) {

private static void checkGenomicsDBAgainstExpected(final String workspace, final List<SimpleInterval> intervals,
final String expectedCombinedVCF, final String referenceFile,
final boolean testAll) throws IOException {
final boolean testAll, final List<String> attributesToIgnore) throws IOException {
checkGenomicsDBAgainstExpected(workspace, intervals,
expectedCombinedVCF, referenceFile,
testAll,
attributesToIgnore,
false,
false);
}

private static void checkGenomicsDBAgainstExpected(final String workspace, final List<SimpleInterval> intervals,
final String expectedCombinedVCF, final String referenceFile,
final boolean testAll,
final List<String> attributesToIgnore,
final boolean produceGTField,
final boolean sitesOnlyQuery) throws IOException {
final GenomicsDBFeatureReader<VariantContext, PositionalBufferedStream> genomicsDBFeatureReader =
Expand All @@ -442,7 +468,7 @@ private static void checkGenomicsDBAgainstExpected(final String workspace, final
.map(g -> g.getGenotypeString().equals(".")?new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(2)).make():g)
.collect(Collectors.toList());
a = new VariantContextBuilder(a).genotypes(genotypes).make();
VariantContextTestUtils.assertVariantContextsAreEqualAlleleOrderIndependent(a, e, Collections.emptyList(), VCF_HEADER);
VariantContextTestUtils.assertVariantContextsAreEqualAlleleOrderIndependent(a, e, attributesToIgnore, VCF_HEADER);

// Test only that the genotypes match
} else {
Expand Down Expand Up @@ -519,7 +545,7 @@ public void testSampleNameWithSpaces() throws IOException {

runCommandLine(args);
checkJSONFilesAreWritten(workspace);
checkGenomicsDBAgainstExpected(workspace, SMALLER_INTERVAL, COMBINED_WITHSPACES, b38_reference_20_21, true);
checkGenomicsDBAgainstExpected(workspace, SMALLER_INTERVAL, COMBINED_WITHSPACES, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE);
}

@Test(dataProvider = "getOrderingTests")
Expand All @@ -531,7 +557,7 @@ public void testSampleNameOrdering(final ArgumentsBuilder args) throws IOExcepti

runCommandLine(args);
checkJSONFilesAreWritten(workspace);
checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true);
checkGenomicsDBAgainstExpected(workspace, INTERVAL, COMBINED, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE);
}

private static File createInOrderSampleMap() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@

public class CombineGVCFsIntegrationTest extends CommandLineProgramTest {
private static final List<String> NO_EXTRA_ARGS = Collections.emptyList();
private static final List<String> ATTRIBUTES_TO_IGNORE = Arrays.asList(
"RAW_MQ"); //MQ data format and key have changed since GATK3
private static final File NA12878_HG37 = new File(toolsTestDir + "haplotypecaller/expected.testGVCFMode.gatk4.g.vcf");


private static <T> void assertForEachElementInLists(final List<T> actual, final List<T> expected, final BiConsumer<T, T> assertion) {
Assert.assertEquals(actual.size(), expected.size(), "different number of elements in lists:\n"
Expand Down Expand Up @@ -76,6 +80,7 @@ public Object[][] gvcfsToCombine() {
getTestFile("gvcfWithTrailingReferenceBlocksBandedExpected.g.vcf"),
Arrays.asList("--" + CombineGVCFs.BREAK_BANDS_LONG_NAME, "2000000"),
b38_reference_20_21},
{new File[]{NA12878_HG37, getTestFile("YRIoffspring.chr20snippet.g.vcf")}, getTestFile("newMQcalc.combined.g.vcf"), NO_EXTRA_ARGS, b37_reference_20_21},
};
}

Expand Down Expand Up @@ -121,12 +126,12 @@ public void compareToGATK3(File[] inputs, File outputFile, List<String> extraArg
System.out.println("Found precomputed gatk3Result");
}

assertVariantContextsMatch(Arrays.asList(inputs), gatk3Result, extraArgs, reference);
assertVariantContextsMatch(Arrays.asList(inputs), gatk3Result, extraArgs, reference, ATTRIBUTES_TO_IGNORE);
}

@Test(dataProvider = "gvcfsToCombine")
public void compareToGATK3ExpectedResults(File[] inputs, File outputFile, List<String> extraArgs, String reference) throws IOException, NoSuchAlgorithmException {
assertVariantContextsMatch(Arrays.asList(inputs), outputFile, extraArgs, reference);
assertVariantContextsMatch(Arrays.asList(inputs), outputFile, extraArgs, reference, ATTRIBUTES_TO_IGNORE);
}

public static void runProcess(ProcessController processController, String[] command) {
Expand All @@ -138,15 +143,15 @@ public static void runProcess(ProcessController processController, String[] comm
}



private void assertVariantContextsMatch(List<File> inputs, File expected, List<String> extraArgs, String reference) throws IOException {
public void assertVariantContextsMatch(List<File> inputs, File expected, List<String> extraArgs, String reference, List<String> attributesToIgnore) throws IOException {
final VCFHeader header = getHeaderFromFile(expected);

runCombineGVCFSandAssertSomething(inputs, expected, extraArgs, (a, e) -> {
VariantContextTestUtils.assertVariantContextsAreEqualAlleleOrderIndependent(a, e, Arrays.asList(), header);
VariantContextTestUtils.assertVariantContextsAreEqualAlleleOrderIndependent(a, e, attributesToIgnore, header);
}, reference);
}


public void runCombineGVCFSandAssertSomething(List<File> inputs, File expected, List<String> additionalArguments, BiConsumer<VariantContext, VariantContext> assertion, String reference) throws IOException {
final File output = createTempFile("combinegvcfs", ".vcf");

Expand Down
Loading

0 comments on commit c3c0ed5

Please sign in to comment.