Skip to content

Commit

Permalink
Funcotator: add 5'/3' flank support, with tests (#5403)
Browse files Browse the repository at this point in the history
* Added support for annotating 5'/3' flanks via new FIVE_PRIME_FLANK and THREE_PRIME_FLANK funcotations.

* Added --five-prime-flank-size and --three-prime-flank-size arguments to control the size of each flanking region.

* Refactored datasource classes to allow for padded/custom queries to make this feature possible.

* We now emit IGR funcotations in more cases (in particular, when a gene has no basic transcripts, and when the basic transcripts do not fully span a gene and the flank size is small).

* Added comprehensive unit tests, and updated integration test data.

Resolves #4771
  • Loading branch information
droazen authored Nov 15, 2018
1 parent 20fcc33 commit 4f36f93
Show file tree
Hide file tree
Showing 22 changed files with 1,799 additions and 944 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,7 @@ public List<Funcotation> createFuncotations(final VariantContext variant, final
Utils.nonNull(featureContext);

// Query this funcotation factory to get the list of overlapping features.
@SuppressWarnings("unchecked")
final List<Feature> featureList = (List<Feature>) featureContext.getValues(mainSourceFileAsFeatureInput);
final List<Feature> featureList = queryFeaturesFromFeatureContext(featureContext);

final List<Funcotation> outputFuncotations;

Expand Down Expand Up @@ -210,6 +209,20 @@ private boolean isFeatureListCompatible(final List<Feature> featureList) {
return foundCompatibleFeature;
}

/**
* Queries the provided FeatureContext for Features from our FeatureInput {@link #mainSourceFileAsFeatureInput}.
* The default implementation returns all Features from our FeatureInput that overlap the FeatureContext's
* interval, but subclasses may override (for example, to pad the query).
*
* @param featureContext the FeatureContext to query
* @return Features from our FeatureInput {@link #mainSourceFileAsFeatureInput} queried from the FeatureContext
*/
protected List<Feature> queryFeaturesFromFeatureContext(final FeatureContext featureContext) {
@SuppressWarnings("unchecked")
final List<Feature> features = (List<Feature>)featureContext.getValues(mainSourceFileAsFeatureInput);
return features;
}

/**
* Creates a {@link List} of {@link Funcotation} for the given {@code variant} and {@code referenceContext}.
* These will be default funcotations that essentially have empty values.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package org.broadinstitute.hellbender.tools.funcotator;

/**
* Simple struct container class for the 5'/3' flank settings. Helps prevent the two values from
* getting swapped accidentally when passing them around.
*/
public final class FlankSettings {

public final int fivePrimeFlankSize;
public final int threePrimeFlankSize;

public FlankSettings(final int fivePrimeFlankSize, final int threePrimeFlankSize) {
this.fivePrimeFlankSize = fivePrimeFlankSize;
this.threePrimeFlankSize = threePrimeFlankSize;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,8 @@ public void onTraversalStart() {
funcotatorArgs.transcriptSelectionMode,
finalUserTranscriptIdSet,
this,
funcotatorArgs.lookaheadFeatureCachingInBp
funcotatorArgs.lookaheadFeatureCachingInBp,
new FlankSettings(funcotatorArgs.fivePrimeFlankSize, funcotatorArgs.threePrimeFlankSize)
);

// Create our engine to do our work and drive this Funcotation train!
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,20 @@ public class FuncotatorArgumentCollection implements Serializable {
)
public List<String> annotationOverrides = new ArrayList<>();

@Argument(
fullName = FuncotatorArgumentDefinitions.FIVE_PRIME_FLANK_SIZE_NAME,
optional = true,
doc = "Variants within this many bases of the 5' end of a transcript (and not overlapping any part of the transcript itself) will be annotated as being in the 5' flanking region of that transcript"
)
public int fivePrimeFlankSize = FuncotatorArgumentDefinitions.FIVE_PRIME_FLANK_SIZE_DEFAULT_VALUE;

@Argument(
fullName = FuncotatorArgumentDefinitions.THREE_PRIME_FLANK_SIZE_NAME,
optional = true,
doc = "Variants within this many bases of the 3' end of a transcript (and not overlapping any part of the transcript itself) will be annotated as being in the 3' flanking region of that transcript"
)
public int threePrimeFlankSize = FuncotatorArgumentDefinitions.THREE_PRIME_FLANK_SIZE_DEFAULT_VALUE;

@Argument(
fullName = FuncotatorArgumentDefinitions.LOOKAHEAD_CACHE_IN_BP_NAME,
optional = true,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,11 @@ public class FuncotatorArgumentDefinitions {
public static final String HG19_REFERENCE_VERSION_STRING = "hg19";
public static final String HG38_REFERENCE_VERSION_STRING = "hg38";

public static final String FIVE_PRIME_FLANK_SIZE_NAME = "five-prime-flank-size";
public static final String THREE_PRIME_FLANK_SIZE_NAME = "three-prime-flank-size";
public static final int FIVE_PRIME_FLANK_SIZE_DEFAULT_VALUE = 5000;
public static final int THREE_PRIME_FLANK_SIZE_DEFAULT_VALUE = 0;

public static final String LOOKAHEAD_CACHE_IN_BP_NAME = "lookahead-cache-bp";
public static final int LOOKAHEAD_CACHE_IN_BP_DEFAULT_VALUE = VariantWalkerBase.FEATURE_CACHE_LOOKAHEAD;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,7 @@
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.funcotator.DataSourceFuncotationFactory;
import org.broadinstitute.hellbender.tools.funcotator.Funcotator;
import org.broadinstitute.hellbender.tools.funcotator.FuncotatorArgumentDefinitions;
import org.broadinstitute.hellbender.tools.funcotator.TranscriptSelectionMode;
import org.broadinstitute.hellbender.tools.funcotator.*;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.cosmic.CosmicFuncotationFactory;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotationFactory;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.vcf.VcfFuncotationFactory;
Expand Down Expand Up @@ -231,19 +228,22 @@ public static boolean isValidDirectory(final Path p) {
* @param userTranscriptIdSet {@link Set} of {@link String}s containing transcript IDs of interest to be selected for first. Must not be {@code null}.
* @param gatkToolInstance Instance of the {@link GATKTool} into which to add {@link FeatureInput}s. Must not be {@code null}.
* @param lookaheadFeatureCachingInBp Number of base-pairs to cache when querying variants.
* @param flankSettings Settings object containing our 5'/3' flank sizes
* @return A {@link List} of {@link DataSourceFuncotationFactory} given the data source metadata, overrides, and transcript reporting priority information.
*/
public static List<DataSourceFuncotationFactory> createDataSourceFuncotationFactoriesForDataSources(final Map<Path, Properties> dataSourceMetaData,
final LinkedHashMap<String, String> annotationOverridesMap,
final TranscriptSelectionMode transcriptSelectionMode,
final Set<String> userTranscriptIdSet,
final GATKTool gatkToolInstance,
final int lookaheadFeatureCachingInBp) {
final int lookaheadFeatureCachingInBp,
final FlankSettings flankSettings) {
Utils.nonNull(dataSourceMetaData);
Utils.nonNull(annotationOverridesMap);
Utils.nonNull(transcriptSelectionMode);
Utils.nonNull(userTranscriptIdSet);
Utils.nonNull(gatkToolInstance);
Utils.nonNull(flankSettings);

final List<DataSourceFuncotationFactory> dataSourceFactories = new ArrayList<>(dataSourceMetaData.size());

Expand Down Expand Up @@ -275,82 +275,15 @@ public static List<DataSourceFuncotationFactory> createDataSourceFuncotationFact
break;
case GENCODE:
featureInput = createAndRegisterFeatureInputs(path, properties, gatkToolInstance, lookaheadFeatureCachingInBp, GencodeGtfFeature.class);
funcotationFactory = DataSourceUtils.createGencodeDataSource(path, properties, annotationOverridesMap, transcriptSelectionMode, userTranscriptIdSet, featureInput);
funcotationFactory = DataSourceUtils.createGencodeDataSource(path, properties, annotationOverridesMap, transcriptSelectionMode,
userTranscriptIdSet, featureInput, flankSettings);
break;
case VCF:
featureInput = createAndRegisterFeatureInputs(path, properties, gatkToolInstance, lookaheadFeatureCachingInBp, VariantContext.class);
funcotationFactory = DataSourceUtils.createVcfDataSource(path, properties, annotationOverridesMap, featureInput);
break;
default:
throw new GATKException("Unknown type of DataSourceFuncotationFactory encountered: " + stringType );
}

// Add in our factory:
dataSourceFactories.add(funcotationFactory);
}

logger.debug("All Data Sources have been created.");
return dataSourceFactories;
}

/**
* Create a {@link List} of {@link DataSourceFuncotationFactory} based on meta data on the data sources, overrides, and transcript reporting priority information.
* THIS METHOD IS FOR TESTING ONLY!
* @param dataSourceMetaData {@link Map} of {@link Path}->{@link Properties} containing metadata about each data source. Must not be {@code null}.
* @param annotationOverridesMap {@link LinkedHashMap} of {@link String}->{@link String} containing any annotation overrides to include in data sources. Must not be {@code null}.
* @param transcriptSelectionMode {@link TranscriptSelectionMode} to use when choosing the transcript for detailed reporting. Must not be {@code null}.
* @param userTranscriptIdSet {@link Set} of {@link String}s containing transcript IDs of interest to be selected for first. Must not be {@code null}.
* @return A {@link List} of {@link DataSourceFuncotationFactory} given the data source metadata, overrides, and transcript reporting priority information.
*/
@VisibleForTesting
public static List<DataSourceFuncotationFactory> createDataSourceFuncotationFactoriesForDataSourcesForTesting(
final Map<Path, Properties> dataSourceMetaData,
final LinkedHashMap<String, String> annotationOverridesMap,
final TranscriptSelectionMode transcriptSelectionMode,
final Set<String> userTranscriptIdSet) {
Utils.nonNull(dataSourceMetaData);
Utils.nonNull(annotationOverridesMap);
Utils.nonNull(transcriptSelectionMode);
Utils.nonNull(userTranscriptIdSet);

final List<DataSourceFuncotationFactory> dataSourceFactories = new ArrayList<>(dataSourceMetaData.size());

// Now we know we have unique and valid data.
// Now we must instantiate our data sources:
for ( final Map.Entry<Path, Properties> entry : dataSourceMetaData.entrySet() ) {

final String funcotationFactoryName = entry.getValue().getProperty(CONFIG_FILE_FIELD_NAME_NAME);
logger.debug("Creating Funcotation Factory for " + funcotationFactoryName + " ...");

final Path path = entry.getKey();
final Properties properties = entry.getValue();

final DataSourceFuncotationFactory funcotationFactory;

// Note: we need no default case since we know these are valid:
final String stringType = properties.getProperty("type");
final FeatureInput<? extends Feature> featureInput;
switch ( FuncotatorArgumentDefinitions.DataSourceType.getEnum(stringType) ) {
case LOCATABLE_XSV:
featureInput = createFeatureInputsForTesting(path, properties);
funcotationFactory = DataSourceUtils.createLocatableXsvDataSource(path, properties, annotationOverridesMap, featureInput);
break;
case SIMPLE_XSV:
funcotationFactory = DataSourceUtils.createSimpleXsvDataSource(path, properties, annotationOverridesMap);
break;
case COSMIC:
funcotationFactory = DataSourceUtils.createCosmicDataSource(path, properties, annotationOverridesMap);
break;
case GENCODE:
featureInput = createFeatureInputsForTesting(path, properties);
funcotationFactory = DataSourceUtils.createGencodeDataSource(path, properties, annotationOverridesMap, transcriptSelectionMode, userTranscriptIdSet, featureInput);
break;
case VCF:
featureInput = createFeatureInputsForTesting(path, properties);
funcotationFactory = DataSourceUtils.createVcfDataSource(path, properties, annotationOverridesMap, featureInput);
break;
default:
throw new GATKException("Unknown type of DataSourceFuncotationFactory encountered: " + stringType );
throw new GATKException("Unknown type of DataSourceFuncotationFactory encountered: " + stringType);
}

// Add in our factory:
Expand Down Expand Up @@ -498,20 +431,23 @@ private static CosmicFuncotationFactory createCosmicDataSource(final Path dataSo
* @param transcriptSelectionMode {@link TranscriptSelectionMode} to use when choosing the transcript for detailed reporting. Must not be {@code null}.
* @param userTranscriptIdSet {@link Set} of {@link String}s containing transcript IDs of interest to be selected for first. Must not be {@code null}.
* @param featureInput The {@link FeatureInput<? extends Feature>} object for the Gencode data source we are creating.
* @param flankSettings Settings object containing our 5'/3' flank sizes
* @return A new {@link GencodeFuncotationFactory} based on the given data source file information, field overrides map, and transcript information.
*/
private static GencodeFuncotationFactory createGencodeDataSource(final Path dataSourceFile,
final Properties dataSourceProperties,
final LinkedHashMap<String, String> annotationOverridesMap,
final TranscriptSelectionMode transcriptSelectionMode,
final Set<String> userTranscriptIdSet,
final FeatureInput<? extends Feature> featureInput) {
final FeatureInput<? extends Feature> featureInput,
final FlankSettings flankSettings) {

Utils.nonNull(dataSourceFile);
Utils.nonNull(dataSourceProperties);
Utils.nonNull(annotationOverridesMap);
Utils.nonNull(transcriptSelectionMode);
Utils.nonNull(userTranscriptIdSet);
Utils.nonNull(flankSettings);

// Get some metadata:
final String fastaPath = dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_GENCODE_FASTA_PATH);
Expand All @@ -526,7 +462,8 @@ private static GencodeFuncotationFactory createGencodeDataSource(final Path data
transcriptSelectionMode,
userTranscriptIdSet,
annotationOverridesMap,
featureInput
featureInput,
flankSettings
);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -767,9 +767,11 @@ public enum VariantClassification {
// Only for IGRs:
/** Intergenic region. Does not overlap any transcript. */
IGR("IGR", 20),
/** The variant is upstream of the chosen transcript (within 3kb) */
/** The variant is upstream of the chosen transcript */
FIVE_PRIME_FLANK("FIVE_PRIME_FLANK", 15),

/** The variant is downstream of the chosen transcript */
THREE_PRIME_FLANK("THREE_PRIME_FLANK", 16),

// These can be in Coding regions or Introns (only SPLICE_SITE):
/** The point mutation alters the protein structure by one amino acid. */
MISSENSE("MISSENSE", 1),
Expand Down
Loading

0 comments on commit 4f36f93

Please sign in to comment.