Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

M2 doesn't use very short stubs of clipped reads for genotyping #5057

Merged
merged 3 commits into from
Jul 30, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@ public final class Mutect2Engine implements AssemblyRegionEvaluator {
public static final int MAX_NORMAL_QUAL_SUM = 100;
public static final int MIN_PALINDROME_SIZE = 5;

// after trimming to fit the assembly window, throw away read stubs shorter than this length
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add why you do this and cite the issue in github?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

// if we don't, the several bases left of reads that end just within the assembly window can
// get realigned incorrectly. See https://github.com/broadinstitute/gatk/issues/5060
public static final int MINIMUM_READ_LENGTH_AFTER_TRIMMING = 10;

private M2ArgumentCollection MTAC;
private SAMFileHeader header;

Expand Down Expand Up @@ -193,6 +198,9 @@ public List<VariantContext> callRegion(final AssemblyRegion originalAssemblyRegi
}

final AssemblyRegion regionForGenotyping = assemblyResult.getRegionForGenotyping();
final List<GATKRead> readStubs = regionForGenotyping.getReads().stream()
.filter(r -> r.getLength() < MINIMUM_READ_LENGTH_AFTER_TRIMMING).collect(Collectors.toList());
regionForGenotyping.removeAll(readStubs);

final Map<String,List<GATKRead>> reads = splitReadsBySample( regionForGenotyping.getReads() );

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ public class Mutect2IntegrationTest extends CommandLineProgramTest {
private static final File NO_CONTAMINATION_TABLE = new File(toolsTestDir, "mutect/no-contamination.table");
private static final File FIVE_PCT_CONTAMINATION_TABLE = new File(toolsTestDir, "mutect/five-pct-contamination.table");
private static final File TEN_PCT_CONTAMINATION_TABLE = new File(toolsTestDir, "mutect/ten-pct-contamination.table");

private static final File NA12878_MITO_BAM = new File(toolsTestDir, "mutect/mito/NA12878.bam");
private static final File MITO_REF = new File(toolsTestDir, "mutect/mito/Homo_sapiens_assembly38.mt_only.fasta");

/**
* Several DREAM challenge bams with synthetic truth data. In order to keep file sizes manageable, bams are restricted
* to chromosome 20, leaving ~100-200 variants, and then further restricted to 400-bp intervals centered around
Expand Down Expand Up @@ -466,6 +470,35 @@ public void testMinBaseQualityScore() throws Exception {
}
}

// basic test on a small chunk of NA12878 mitochondria. This is not a validation, but rather a sanity check
// that M2 makes obvious calls, doesn't trip up on the beginning of the circular chromosome, and can handle high depth
@Test
public void testMitochondria() throws Exception {
Utils.resetRandomGenerator();
final File unfilteredVcf = createTempFile("unfiltered", ".vcf");

final List<String> args = Arrays.asList("-I", NA12878_MITO_BAM.getAbsolutePath(),
"-" + M2ArgumentCollection.TUMOR_SAMPLE_SHORT_NAME, "NA12878",
"-R", MITO_REF.getAbsolutePath(),
"-L", "chrM:1-1000",
"-min-pruning", "5",
"-O", unfilteredVcf.getAbsolutePath());
runCommandLine(args);


final List<VariantContext> variants = VariantContextTestUtils.streamVcf(unfilteredVcf).collect(Collectors.toList());
final Set<String> variantKeys = variants.stream().map(vc -> keyForVariant(vc)).collect(Collectors.toSet());

final List<String> expectedKeys = Arrays.asList(
"chrM:152-152 [T*, C]",
"chrM:263-263 [A*, G]",
"chrM:301-301 [A*, AC]",
"chrM:302-302 [A*, AC, C, ACC]",
"chrM:310-310 [T*, TC]",
"chrM:750-750 [A*, G]");
Assert.assertTrue(expectedKeys.stream().allMatch(variantKeys::contains));
}



@DataProvider(name="bamoutVariations")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
@HD VN:1.5
@SQ SN:chrM LN:16569 M5:c68f52674c9fb33aef52dcf399755519 UR:file:/humgen/gsa-hpprojects/dev/mshand/SpecOps/Mitochondria/MitochondriaOnlyFastas/Homo_sapiens_assembly38.mt_only.fasta
Loading