Skip to content

Commit

Permalink
Fix and test for calculating CombineGVCFs intermediate band interval …
Browse files Browse the repository at this point in the history
…start locations.
  • Loading branch information
cmnbroad committed Apr 19, 2018
1 parent 7108d82 commit 666cef0
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,14 @@ void createIntermediateVariants(SimpleInterval intervalToClose) {

// Break up the GVCF according to the provided reference blocking scheme
if ( multipleAtWhichToBreakBands > 0) {
for (int i = ((intervalToClose.getStart())/multipleAtWhichToBreakBands)*multipleAtWhichToBreakBands; i <= intervalToClose.getEnd(); i+=multipleAtWhichToBreakBands) {
sitesToStop.add(i-1); // Subtract 1 here because we want to split before this base
// if the intermediate interval to close starts before the end of the first interval,
// create the first stop position at the end of the band multiple
for (int blockEndPosition = intervalToClose.getStart() < multipleAtWhichToBreakBands ?
multipleAtWhichToBreakBands :
(intervalToClose.getStart() / multipleAtWhichToBreakBands) * multipleAtWhichToBreakBands;
blockEndPosition <= intervalToClose.getEnd();
blockEndPosition += multipleAtWhichToBreakBands) {
sitesToStop.add(blockEndPosition - 1); // Subtract 1 here because we want to split before this base
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,11 @@ public Object[][] gvcfsToCombine() {
{new File[]{getTestFile("NA12878.AS.chr20snippet.g.vcf"), getTestFile("NA12892.AS.chr20snippet.g.vcf")}, getTestFile("testAlleleSpecificAnnotationsNoGroup.vcf"), Arrays.asList("-G", "Standard", "-G", "AS_Standard"), b37_reference_20_21},
//Test that trailing reference blocks are emitted with correct intervals
{new File[]{getTestFile("gvcfExample1WithTrailingReferenceBlocks.g.vcf"), getTestFile("gvcfExample2WithTrailingReferenceBlocks.g.vcf")}, getTestFile("gvcfWithTrailingReferenceBlocksExpected.g.vcf"), NO_EXTRA_ARGS, b38_reference_20_21},
// same test as the previous one, except with a band multiple specified
{new File[]{getTestFile("gvcfExample1WithTrailingReferenceBlocks.g.vcf"), getTestFile("gvcfExample2WithTrailingReferenceBlocks.g.vcf")},
getTestFile("gvcfWithTrailingReferenceBlocksBandedExpected.g.vcf"),
Arrays.asList("--" + CombineGVCFs.BREAK_BANDS_LONG_NAME, "2000000"),
b38_reference_20_21},
};
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
##fileformat=VCFv4.2
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=MIN_GQ,Number=1,Type=Integer,Description="Minimum GQ observed within the GVCF block">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GATKCommandLine=<ID=CombineGVCFs,CommandLine="CombineGVCFs --output /var/folders/cr/16ghvyfj5lvfwxx01rt1k4tdl04sy3/T/combinegvcfs3475467159514601676.vcf --break-bands-at-multiples-of 2000000 --variant /Users/cnorman/projects/gatk/src/test/resources/org/broadinstitute/hellbender/tools/walkers/CombineGVCFs/gvcfExample1WithTrailingReferenceBlocks.g.vcf --variant /Users/cnorman/projects/gatk/src/test/resources/org/broadinstitute/hellbender/tools/walkers/CombineGVCFs/gvcfExample2WithTrailingReferenceBlocks.g.vcf --reference /Users/cnorman/projects/gatk/src/test/resources/large/Homo_sapiens_assembly38.20.21.fasta --verbosity ERROR --annotation-group StandardAnnotation --disable-tool-default-annotations false --convert-to-base-pair-resolution false --ignore-variants-starting-outside-interval false --interval-set-rule UNION --interval-padding 0 --interval-exclusion-padding 0 --interval-merging-rule ALL --read-validation-stringency SILENT --seconds-between-progress-updates 10.0 --disable-sequence-dictionary-validation false --create-output-bam-index true --create-output-bam-md5 false --create-output-variant-index true --create-output-variant-md5 false --lenient false --add-output-sam-program-record true --add-output-vcf-command-line true --cloud-prefetch-buffer 40 --cloud-index-prefetch-buffer -1 --disable-bam-index-caching false --help false --version false --showHidden false --QUIET false --use-jdk-deflater false --use-jdk-inflater false --gcs-max-retries 20 --disable-tool-default-read-filters false",Version=Unavailable,Date="April 19, 2018 9:53:59 AM EDT">
##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive)
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=BLOCK_SIZE,Number=1,Type=Integer,Description="Size of the homozygous reference GVCF block">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=RAW_MQ,Number=1,Type=Float,Description="Raw data for RMS Mapping Quality">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias">
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr20_GL383577v2_alt,length=128386>
##contig=<ID=chr20_KI270869v1_alt,length=118774>
##contig=<ID=chr20_KI270871v1_alt,length=58661>
##contig=<ID=chr20_KI270870v1_alt,length=183433>
##contig=<ID=chr21_GL383578v2_alt,length=63917>
##contig=<ID=chr21_KI270874v1_alt,length=166743>
##contig=<ID=chr21_KI270873v1_alt,length=143900>
##contig=<ID=chr21_GL383579v2_alt,length=201197>
##contig=<ID=chr21_GL383580v2_alt,length=74653>
##contig=<ID=chr21_GL383581v2_alt,length=116689>
##contig=<ID=chr21_KI270872v1_alt,length=82692>
##source=CombineGVCFs
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA1 NA2
chr20 69491 . G <NON_REF> . . END=69497 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:94:99:82:99:0,120,1800 ./.
chr20 69498 . C <NON_REF> . . END=69506 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:94:99:82:99:0,120,1800 ./.:94:99:82:99:0,120,1800
chr20 69507 . T <NON_REF> . . END=69510 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:94:99:82:99:0,120,1800 ./.
chr20 69511 . A G,<NON_REF> . . BaseQRankSum=1.17;DP=82;MQ=31.05;MQ0=0;MQRankSum=-8.660e-01;ReadPosRankSum=1.69 GT:AD:DP:GQ:PL:SB ./.:1,79,0:80:99:2284,207,0,2287,237,2316:0,1,46,33 ./.
chr20 69512 . C <NON_REF> . . END=69513 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:96:99:82:99:0,120,1800 ./.
chr20 69514 . T <NON_REF> . . END=69521 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:96:99:82:99:0,120,1800 ./.:96:99:82:99:0,120,1800
chr20 69522 . T <NON_REF> . . END=69548 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:95:0:95:0:0,0,0 ./.:96:99:82:99:0,120,1800
chr20 69549 . G <NON_REF> . . END=69624 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:156:99:56:66:0,66,990 ./.:96:99:82:99:0,120,1800
chr20 69625 . A <NON_REF> . . END=69634 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:156:99:56:66:0,66,990 ./.:7:18:7:18:0,18,270
chr20 69635 . G T,<NON_REF> . . BaseQRankSum=0.937;DP=14;MQ=34.15;MQ0=0;MQRankSum=1.30;ReadPosRankSum=1.75 GT:AD:DP:GQ:MIN_DP:MIN_GQ:PL:SB ./.:4,3,0:7:89:.:.:89,0,119,101,128,229:0,4,0,3 ./.:.:7:18:7:18:0,18,270,18,270,270
chr20 69636 . A <NON_REF> . . END=69675 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./. ./.:7:18:7:18:0,18,270
chr20 69762 . T <NON_REF> . . . GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:7:18:7:18:0,18,270 ./.
chr20 69763 . A <NON_REF> . . END=69766 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:7:21:7:21:0,21,253 ./.
chr20 69767 . C <NON_REF> . . END=69771 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:7:12:7:12:0,12,180 ./.:7:12:7:15:0,12,180
chr20 69772 . AAAGC A,<NON_REF> . . BaseQRankSum=0.937;DP=14;MQ=34.15;MQ0=0;MQRankSum=1.30;ReadPosRankSum=1.75 GT:AD:DP:GQ:MIN_DP:MIN_GQ:PL:SB ./.:4,3,0:7:89:.:.:89,0,119,101,128,229:0,4,0,3 ./.:.:7:12:7:15:0,12,180,12,180,180
chr20 69773 . A <NON_REF> . . END=69774 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:7:0:3:0:0,0,0 ./.:7:12:7:15:0,12,180
chr20 69775 . C <NON_REF> . . END=69783 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:7:0:3:0:0,0,0 ./.:7:15:7:15:0,15,160
chr20 69784 . A <NON_REF> . . END=69791 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./. ./.:7:15:7:15:0,15,160
chr21 1 . G <NON_REF> . . END=1999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 2000000 . N <NON_REF> . . END=3999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 4000000 . N <NON_REF> . . END=5999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 6000000 . C <NON_REF> . . END=7999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 8000000 . G <NON_REF> . . END=9999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 10000000 . C <NON_REF> . . END=11999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 12000000 . N <NON_REF> . . END=13999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 14000000 . T <NON_REF> . . END=15999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 16000000 . A <NON_REF> . . END=17999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 18000000 . G <NON_REF> . . END=19999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 20000000 . A <NON_REF> . . END=21999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 22000000 . T <NON_REF> . . END=23999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 24000000 . G <NON_REF> . . END=25999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 26000000 . G <NON_REF> . . END=27999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 28000000 . C <NON_REF> . . END=29999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 30000000 . G <NON_REF> . . END=31999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 32000000 . A <NON_REF> . . END=33999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 34000000 . T <NON_REF> . . END=35999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 36000000 . G <NON_REF> . . END=37999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 38000000 . A <NON_REF> . . END=39999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 40000000 . T <NON_REF> . . END=41999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 42000000 . G <NON_REF> . . END=43999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 44000000 . G <NON_REF> . . END=45999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 46000000 . G <NON_REF> . . END=46709983 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr20_GL383577v2_alt 1 . G <NON_REF> . . END=128386 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0

0 comments on commit 666cef0

Please sign in to comment.