Skip to content

Commit

Permalink
Improve MQ calculation accuracy (broadinstitute#4969)
Browse files Browse the repository at this point in the history
Change raw MQ to a tuple of (sumSquaredMQs, totalDepth) for better accuracy where there are lots of uninformative reads or called single-sample variants with homRef genotypes.  Note that incorporating this change into a pipeline will require a concomitant update to this version for GenomicsDBImport and GenotypeGVCFs.
  • Loading branch information
ldgauthier authored and EdwardDixon committed Nov 9, 2018
1 parent ce5f699 commit e3aa4ee
Show file tree
Hide file tree
Showing 39 changed files with 50,129 additions and 816 deletions.
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
package org.broadinstitute.hellbender.engine;

import com.intel.genomicsdb.model.GenomicsDBExportConfiguration;
import com.intel.genomicsdb.reader.GenomicsDBFeatureReader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.IOUtil;
import htsjdk.tribble.*;
import htsjdk.variant.bcf2.BCF2Codec;
import htsjdk.variant.variantcontext.GenotypeLikelihoods;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.logging.log4j.LogManager;
Expand All @@ -19,25 +20,18 @@
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.nio.SeekableByteChannelPrefetcher;

import com.intel.genomicsdb.model.GenomicsDBExportConfiguration;
import com.intel.genomicsdb.reader.GenomicsDBFeatureReader;
import com.googlecode.protobuf.format.JsonFormat;
import com.intel.genomicsdb.model.GenomicsDBVidMapProto;
import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.*;

import java.io.File;
import java.io.IOException;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;
import java.io.FileReader;
import java.util.HashMap;
import java.util.Map;



/**
* Enables traversals and queries over sources of Features, which are metadata associated with a location
Expand Down Expand Up @@ -68,11 +62,6 @@
public final class FeatureDataSource<T extends Feature> implements GATKDataSource<T>, AutoCloseable {
private static final Logger logger = LogManager.getLogger(FeatureDataSource.class);

/**
* identifies a path as a GenomicsDB URI
*/
public static final String GENOMIC_DB_URI_SCHEME = "gendb://";

/**
* Feature reader used to retrieve records from our file
*/
Expand Down Expand Up @@ -288,14 +277,6 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
this.queryLookaheadBases = queryLookaheadBases;
}

/**
* @param path String containing the path to test
* @return true if path represent a GenomicsDB URI, otherwise false
*/
public static boolean isGenomicsDBPath(final String path) {
return path != null && path.startsWith(GENOMIC_DB_URI_SCHEME);
}

@SuppressWarnings("unchecked")
private static <T extends Feature> FeatureReader<T> getFeatureReader(final FeatureInput<T> featureInput, final Class<? extends Feature> targetFeatureType,
final Function<SeekableByteChannel, SeekableByteChannel> cloudWrapper,
Expand Down Expand Up @@ -368,7 +349,7 @@ private static <T extends Feature> FeatureReader<T> getFeatureReader(final Featu
}
}

private static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final String path, final File reference) {
protected static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final String path, final File reference) {
if (!isGenomicsDBPath(path)) {
throw new IllegalArgumentException("Trying to create a GenomicsDBReader from a non-GenomicsDB input");
}
Expand Down Expand Up @@ -404,149 +385,6 @@ private static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final St
}
}

private static GenomicsDBExportConfiguration.ExportConfiguration createExportConfiguration(final File reference, final File workspace,
final File callsetJson, final File vidmapJson,
final File vcfHeader) {
final GenomicsDBExportConfiguration.ExportConfiguration.Builder exportConfigurationBuilder =
GenomicsDBExportConfiguration.ExportConfiguration.newBuilder()
.setWorkspace(workspace.getAbsolutePath())
.setReferenceGenome(reference.getAbsolutePath())
.setVidMappingFile(vidmapJson.getAbsolutePath())
.setCallsetMappingFile(callsetJson.getAbsolutePath())
.setVcfHeaderFilename(vcfHeader.getAbsolutePath())
.setProduceGTField(false)
.setProduceGTWithMinPLValueForSpanningDeletions(false)
.setSitesOnlyQuery(false)
.setMaxDiploidAltAllelesThatCanBeGenotyped(GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED);
final Path arrayFolder = Paths.get(workspace.getAbsolutePath(), GenomicsDBConstants.DEFAULT_ARRAY_NAME).toAbsolutePath();

// For the multi-interval support, we create multiple arrays (directories) in a single workspace -
// one per interval. So, if you wish to import intervals ("chr1", [ 1, 100M ]) and ("chr2", [ 1, 100M ]),
// you end up with 2 directories named chr1$1$100M and chr2$1$100M. So, the array names depend on the
// partition bounds.

// During the read phase, the user only supplies the workspace. The array names are obtained by scanning
// the entries in the workspace and reading the right arrays. For example, if you wish to read ("chr2",
// 50, 50M), then only the second array is queried.

// In the previous version of the tool, the array name was a constant - genomicsdb_array. The new version
// will be backward compatible with respect to reads. Hence, if a directory named genomicsdb_array is found,
// the array name is passed to the GenomicsDBFeatureReader otherwise the array names are generated from the
// directory entries.
if (Files.exists(arrayFolder)) {
exportConfigurationBuilder.setArrayName(GenomicsDBConstants.DEFAULT_ARRAY_NAME);
} else {
exportConfigurationBuilder.setGenerateArrayNameFromPartitionBounds(true);
}

//Sample code snippet to show how combine operations for INFO fields can be specified using the Protobuf
//API
//
//References
//GenomicsDB Protobuf structs: https://github.com/Intel-HLS/GenomicsDB/blob/master/src/resources/genomicsdb_vid_mapping.proto
//Protobuf generated Java code guide:
//https://developers.google.com/protocol-buffers/docs/javatutorial#the-protocol-buffer-api
//https://developers.google.com/protocol-buffers/docs/reference/java-generated

//Parse the vid json and create an in-memory Protobuf structure representing the
//information in the JSON file
GenomicsDBVidMapProto.VidMappingPB vidMapPB;
try {
vidMapPB = getProtobufVidMappingFromJsonFile(vidmapJson);
} catch (final IOException e) {
throw new UserException("Could not open vid json file " + vidmapJson, e);
}

//In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects
//Each GenomicsDBFieldInfo object contains information about a specific field in the TileDB/GenomicsDB store
//We iterate over the list and create a field name to list index map
final HashMap<String, Integer> fieldNameToIndexInVidFieldsList =
getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB);

//Example: set MQ combine operation to median (default is also median, but this is just an example)
vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
"MQ", "median");
if (vidMapPB != null) {
//Use rebuilt vidMap in exportConfiguration
//NOTE: this does NOT update the JSON file, the vidMapPB is a temporary structure that's passed to
//C++ modules of GenomicsDB for this specific query. Other queries will continue to use the information
//in the JSON file
exportConfigurationBuilder.setVidMapping(vidMapPB);
}

return exportConfigurationBuilder.build();
}

/**
* Parse the vid json and create an in-memory Protobuf structure representing the
* information in the JSON file
*
* @param vidmapJson vid JSON file
* @return Protobuf object
*/
public static GenomicsDBVidMapProto.VidMappingPB getProtobufVidMappingFromJsonFile(final File vidmapJson)
throws IOException {
final GenomicsDBVidMapProto.VidMappingPB.Builder vidMapBuilder = GenomicsDBVidMapProto.VidMappingPB.newBuilder();
try (final FileReader reader = new FileReader(vidmapJson)) {
JsonFormat.merge(reader, vidMapBuilder);
}
return vidMapBuilder.build();
}

/**
* In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects
* Each GenomicsDBFieldInfo object contains information about a specific field in the TileDB/GenomicsDB store
* We iterate over the list and create a field name to list index map
*
* @param vidMapPB Protobuf vid mapping object
* @return map from field name to index in vidMapPB.fields list
*/
public static HashMap<String, Integer> getFieldNameToListIndexInProtobufVidMappingObject(
final GenomicsDBVidMapProto.VidMappingPB vidMapPB) {
final HashMap<String, Integer> fieldNameToIndexInVidFieldsList = new HashMap<>();
for (int fieldIdx = 0; fieldIdx < vidMapPB.getFieldsCount(); ++fieldIdx) {
fieldNameToIndexInVidFieldsList.put(vidMapPB.getFields(fieldIdx).getName(), fieldIdx);
}
return fieldNameToIndexInVidFieldsList;
}

/**
* Update vid Protobuf object with new combine operation for field
*
* @param vidMapPB input vid object
* @param fieldNameToIndexInVidFieldsList name to index in list
* @param fieldName INFO field name
* @param newCombineOperation combine op ("sum", "median")
* @return updated vid Protobuf object if field exists, else null
*/
public static GenomicsDBVidMapProto.VidMappingPB updateINFOFieldCombineOperation(
final GenomicsDBVidMapProto.VidMappingPB vidMapPB,
final Map<String, Integer> fieldNameToIndexInVidFieldsList,
final String fieldName,
final String newCombineOperation) {
final int fieldIdx = fieldNameToIndexInVidFieldsList.containsKey(fieldName)
? fieldNameToIndexInVidFieldsList.get(fieldName) : -1;
if (fieldIdx >= 0) {
//Would need to rebuild vidMapPB - so get top level builder first
final GenomicsDBVidMapProto.VidMappingPB.Builder updatedVidMapBuilder = vidMapPB.toBuilder();
//To update the list element corresponding to fieldName, we get the builder for that specific list element
final GenomicsDBVidMapProto.GenomicsDBFieldInfo.Builder fieldBuilder =
updatedVidMapBuilder.getFieldsBuilder(fieldIdx);
//And update its combine operation
fieldBuilder.setVCFFieldCombineOperation(newCombineOperation);

//Shorter way of writing the same operation
/*
updatedVidMapBuilder.getFieldsBuilder(fieldIdx)
.setVCFFieldCombineOperation(newCombineOperation);
*/

//Rebuild full vidMap
return updatedVidMapBuilder.build();
}
return null;
}

/**
* Returns the sequence dictionary for this source of Features.
* Uses the dictionary from the VCF header (if present) for variant inputs,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import htsjdk.tribble.Feature;
import htsjdk.tribble.FeatureCodec;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;

Expand Down Expand Up @@ -244,8 +245,8 @@ public void setFeatureCodecClass(final Class<FeatureCodec<T, ?>> featureCodecCla
* creates a name from the given filePath by finding the absolute path of the given input
*/
private static String makeIntoAbsolutePath(final String filePath){
if(FeatureDataSource.isGenomicsDBPath(filePath)){
return FeatureDataSource.GENOMIC_DB_URI_SCHEME + new File(filePath.replace(FeatureDataSource.GENOMIC_DB_URI_SCHEME,"")).getAbsolutePath();
if(GenomicsDBUtils.isGenomicsDBPath(filePath)){
return GenomicsDBUtils.GENOMIC_DB_URI_SCHEME + new File(filePath.replace(GenomicsDBUtils.GENOMIC_DB_URI_SCHEME,"")).getAbsolutePath();
} else if (URI.create(filePath).getScheme() != null) {
return IOUtils.getPath(filePath).toAbsolutePath().toUri().toString();
} else {
Expand Down
Loading

0 comments on commit e3aa4ee

Please sign in to comment.