broadinstitute · jonn-smith · May 18, 2018 · May 15, 2018 · May 15, 2018
diff --git a/scripts/funcotator/data_sources/getGencode.sh b/scripts/funcotator/data_sources/getGencode.sh
@@ -0,0 +1,263 @@
+#!/usr/bin/env bash
+
+################################################################################
+
+# Creates a set of files that allows for creating annotations based off dbSNP
+# Pulled directly from the ncbi FTP site.
+
+################################################################################
+
+#Setup variables for the script:
+SCRIPTDIR="$( cd -P "$( dirname "$0" )" && pwd )"
+SCRIPTNAME=$( echo $0 | sed 's#.*/##g' )
+MINARGS=0
+MAXARGS=0
+
+# Latest release numbers for our references.
+# Update these numbers when a new Gencode is released.
+LATEST_HG19_RELEASE=19
+LATEST_HG38_RELEASE=28
+
+DATA_SOURCE_NAME="Gencode"
+OUT_DIR_NAME='gencode'
+
+# Check if we have samtools installed so we can index our fasta files:
+HAS_SAMTOOLS=false
+which samtools &> /dev/null
+r=$?
+[[ $r -eq 0 ]] && HAS_SAMTOOLS=true
+
+################################################################################
+
+set -e
+
+################################################################################
+
+function simpleUsage()
+{
+  echo -e "Usage: $SCRIPTNAME [OPTIONS] ..."
+  echo -e "Creates the data sources folder for gencode for the GATK Funcotator tool."
+}
+
+#Define a usage function:
+function usage()
+{
+  simpleUsage
+  echo -e ""
+  echo -e "Will download all data sources directly from the Gencode website:"
+  echo -e "    https://www.gencodegenes.org"
+  echo -e ""
+  echo -e "Return values:"
+  echo -e "  0  NORMAL"
+  echo -e "  1  TOO MANY ARGUMENTS"
+  echo -e "  2  TOO FEW ARGUMENTS"
+  echo -e "  3  UNKNOWN ARGUMENT"
+  echo -e "  4  BAD CHECKSUM"
+  echo -e "  5  OUTPUT DIRECTORY ALREADY EXISTS"
+  echo -e ""
+}
+
+#Display a message to std error:
+function error()
+{
+  echo "$1" 2>&1
+}
+
+TMPFILELIST=''
+function makeTemp()
+{
+  local f
+  f=$( mktemp )
+  TMPFILELIST="${TMPFILELIST} $f"
+  echo $f
+}
+
+function cleanTempVars()
+{
+  rm -f ${TMPFILELIST}
+}
+
+function at_exit()
+{
+  cleanTempVars
+}
+
+################################################################################
+
+
+function createConfigFile() {
+
+    local dataSourceName=$1
+    local version=$2
+    local srcFile=$3
+    local originLocation=$4
+    local fastaPath=$5
+
+    echo "name = ${dataSourceName}"
+    echo "version = ${version}"
+    echo "src_file = ${srcFile}"
+    echo "origin_location = ${originLocation}"
+    echo "preprocessing_script = ${SCRIPTNAME} , fixGencodeOrdering.py"
+    echo ""
+    echo "# Supported types:"
+    echo "# simpleXSV    -- Arbitrary separated value table (e.g. CSV), keyed off Gene Name OR Transcript ID"
+    echo "# locatableXSV -- Arbitrary separated value table (e.g. CSV), keyed off a genome location"
+    echo "# gencode      -- Custom datasource class for GENCODE"
+    echo "# cosmic       -- Custom datasource class for COSMIC"
+    echo "# vcf          -- Custom datasource class for Variant Call Format (VCF) files"
+    echo "type = gencode"
+    echo ""
+    echo "# Required field for GENCODE files."
+    echo "# Path to the FASTA file from which to load the sequences for GENCODE transcripts:"
+    echo "gencode_fasta_path = ${fastaPath}"
+    echo ""
+    echo "# Required field for simpleXSV files."
+    echo "# Valid values:"
+    echo "#     GENE_NAME"
+    echo "#     TRANSCRIPT_ID"
+    echo "xsv_key ="
+    echo ""
+    echo "# Required field for simpleXSV files."
+    echo "# The 0-based index of the column containing the key on which to match"
+    echo "xsv_key_column ="
+    echo ""
+    echo "# Required field for simpleXSV AND locatableXSV files."
+    echo "# The delimiter by which to split the XSV file into columns."
+    echo "xsv_delimiter ="
+    echo ""
+    echo "# Required field for simpleXSV files."
+    echo "# Whether to permissively match the number of columns in the header and data rows"
+    echo "# Valid values:"
+    echo "#     true"
+    echo "#     false"
+    echo "xsv_permissive_cols ="
+    echo ""
+    echo "# Required field for locatableXSV files."
+    echo "# The 0-based index of the column containing the contig for each row"
+    echo "contig_column ="
+    echo ""
+    echo "# Required field for locatableXSV files."
+    echo "# The 0-based index of the column containing the start position for each row"
+    echo "start_column ="
+    echo ""
+    echo "# Required field for locatableXSV files."
+    echo "# The 0-based index of the column containing the end position for each row"
+    echo "end_column ="
+    echo ""
+
+}
+
+function getGencodeFiles()
+{
+    local version=$1
+    local refVersion=$2
+
+    echo "##################################################################################"
+    echo "Processing ${refVersion} - Gencode ${version} ..."
+    echo "===================================="
+
+    mkdir -p ${OUT_DIR_NAME}/${refVersion}
+    pushd ${OUT_DIR_NAME}/${refVersion} &> /dev/null
+    wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_${version}/gencode.v${version}.annotation.gtf.gz
+    wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_${version}/gencode.v${version}.pc_transcripts.fa.gz
+
+    gunzip gencode.v${version}.annotation.gtf.gz
+    gunzip gencode.v${version}.pc_transcripts.fa.gz
+
+    # We must fix the information in the gencode gtf file:
+    echo "Reordering Gencode GTF data ..."
+    ${SCRIPTDIR}/fixGencodeOrdering.py gencode.v${version}.annotation.gtf > gencode.v${version}.annotation.REORDERED.gtf
+
+    # Clean up original file:
+    rm gencode.v${version}.annotation.gtf
+
+    echo "Creating config file ..."
+    createConfigFile "${DATA_SOURCE_NAME}" "${version}" "gencode.v${version}.annotation.REORDERED.gtf" "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_${version}/gencode.v${version}.annotation.gtf.gz" "gencode.v${version}.pc_transcripts.fa" > gencode.config
+
+    if $HAS_SAMTOOLS ; then
+        echo "Indexing Fasta File: gencode.v${version}.pc_transcripts.fa"
+        samtools faidx gencode.v${version}.pc_transcripts.fa
+    fi
+
+    echo
+    popd > /dev/null
+}
+
+################################################################################
+
+trap at_exit EXIT
+
+################################################################################
+
+#Check given arguments:
+if [[ $# -gt $MAXARGS ]] ; then
+  usage
+  exit 1
+elif [[ $# -lt $MINARGS ]] ; then
+  usage
+  exit 2
+fi
+
+################################################################################
+
+#Read args:
+while [ $# -gt 0 ] ; do
+
+  case "$1" in
+    -h|--h|--help|-help|help)
+      usage;
+      exit 0;
+      ;;
+    *)
+      error "${SCRIPTNAME}: invalid option: $1"
+      simpleUsage
+      echo "Try \`$SCRIPTNAME --help' for more information."
+      exit 3;
+    ;;
+  esac
+
+  #Get next argument in $1:
+  shift
+done
+
+################################################################################
+# Do the work here:
+
+# Make sure we don't have anything in our out folder already:
+if [[ -d ${OUT_DIR_NAME} ]] ; then
+    error "Output directory already exists: ${OUT_DIR_NAME} - aborting!"
+    exit 5
+fi
+
+# Get the link for HG19:
+getGencodeFiles $LATEST_HG19_RELEASE hg19
+
+# Get the link for HG38:
+getGencodeFiles $LATEST_HG38_RELEASE hg38
+
+if ! $HAS_SAMTOOLS ; then
+    echo -e "\033[1;33;40m##################################################################################\033[0;0m"
+    echo -e "\033[1;33;40m#                                                                                #\033[0;0m"
+    echo -e "\033[1;33;40m# \033[1;5;37;41mWARNING\033[0;0m: You \033[4;37;40mMUST\033[0;0m index both Gencode Fasta files before using this data source \033[1;33;40m#\033[0;0m"
+    echo -e "\033[1;33;40m#             Use samtools faidx <FASTA_FILE>                                    #\033[0;0m"
+    echo -e "\033[1;33;40m#                                                                                #\033[0;0m"
+    echo -e "\033[1;33;40m##################################################################################\033[0;0m"
+fi
+
+
+echo -e "\033[1;33;40m##################################################################################\033[0;0m"
+echo -e "\033[1;33;40m#                                                                                #\033[0;0m"
+echo -e "\033[1;33;40m# \033[1;5;37;41mWARNING\033[0;0m: You \033[4;37;40mMUST\033[0;0m create sequence dictionaries for both Gencode Fasta files #\033[0;0m"
+echo -e "\033[1;33;40m#             before using this data source.                                     #\033[0;0m"
+echo -e "\033[1;33;40m#                                                                                #\033[0;0m"
+echo -e "\033[1;33;40m#             Use gatk CreateSequenceDictionary <FASTA_FILE>                     #\033[0;0m"
+echo -e "\033[1;33;40m#                                                                                #\033[0;0m"
+echo -e "\033[1;33;40m# \033[1;5;37;41mWARNING\033[0;0m: You \033[4;37;40mMUST\033[0;0m ALSO index the Gencode GTF files for both HG19 and HG38 #\033[0;0m"
+echo -e "\033[1;33;40m#             before using this data source.                                     #\033[0;0m"
+echo -e "\033[1;33;40m#                                                                                #\033[0;0m"
+echo -e "\033[1;33;40m#             Use gatk IndexFeatureFile <GTF_FILE>                               #\033[0;0m"
+echo -e "\033[1;33;40m#                                                                                #\033[0;0m"
+echo -e "\033[1;33;40m##################################################################################\033[0;0m"
+
+exit 0
+
diff --git a/...main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java b/...main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java
@@ -589,12 +589,15 @@ static boolean validateVersionInformation(final int major, final int minor, fina
             return false;
         }
 
-        if ( month < MIN_MONTH_RELEASED ) {
-            return false;
-        }
-
-        if ( day < MIN_DAY_RELEASED ) {
-            return false;
+        else if ( year == MIN_YEAR_RELEASED ) {
+            if ( month < MIN_MONTH_RELEASED ) {
+                return false;
+            }
+            else if ( month == MIN_MONTH_RELEASED ) {
+                if ( day < MIN_DAY_RELEASED ) {
+                    return false;
+                }
+            }
         }
 
         return true;

diff --git a/...dinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactory.java b/...dinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactory.java
@@ -56,6 +56,18 @@ public class GencodeFuncotationFactory extends DataSourceFuncotationFactory {
      */
     private static final int gcContentWindowSizeBases = 200;
 
+    /**
+     * The number of leading bases to include when building the variant sequence for UTR variants.
+     * Used to determine if there is a de novo start.
+     */
+    private static final int numLeadingBasesForUtrAnnotationSequenceConstruction = 2;
+
+    /**
+     * The number of leading bases to include when building the variant sequence for UTR variants.
+     * * Used to determine if there is a de novo start.
+     */
+    private static final int defaultNumTrailingBasesForUtrAnnotationSequenceConstruction = 3;
+
     /**
      * The window around a variant to include in the reference context annotation.
      * Also used for context from which to get surrounding codon changes and protein changes.
@@ -1049,7 +1061,6 @@ private GencodeFuncotation createUtrFuncotation(final VariantContext variant,
         // Get the strand-corrected alleles from the inputs.
         // Also get the reference sequence for the variant region.
         // (spanning the entire length of both the reference and the variant, regardless of which is longer).
-        final Allele strandCorrectedRefAllele = FuncotatorUtils.getStrandCorrectedAllele(variant.getReference(), strand);
         final Allele strandCorrectedAltAllele = FuncotatorUtils.getStrandCorrectedAllele(altAllele, strand);
         final String referenceBases = getReferenceBases(variant.getReference(), altAllele, reference, strand);
 
@@ -1074,32 +1085,37 @@ private GencodeFuncotation createUtrFuncotation(final VariantContext variant,
             // Get the 5' UTR sequence here.
             // Note: We grab 3 extra bases at the end (from the coding sequence) so that we can check for denovo starts
             //       even if the variant occurs in the last base of the UTR.
+            final int numExtraTrailingBases = variant.getReference().length() < defaultNumTrailingBasesForUtrAnnotationSequenceConstruction ? defaultNumTrailingBasesForUtrAnnotationSequenceConstruction : variant.getReference().length() + 1;
             final String fivePrimeUtrCodingSequence =
-                    getFivePrimeUtrSequenceFromTranscriptFasta( transcript.getTranscriptId(), transcriptIdMap, transcriptFastaReferenceDataSource, 3);
+                    getFivePrimeUtrSequenceFromTranscriptFasta( transcript.getTranscriptId(), transcriptIdMap, transcriptFastaReferenceDataSource, numExtraTrailingBases);
 
             final int codingStartPos = FuncotatorUtils.getStartPositionInTranscript(variant, activeRegions, strand);
 
-            // Get the number of bases we need to add to the start position of the variant to get it in frame:
-            final int alignedAlleleOffset = FuncotatorUtils.getAlignedPosition(variant.getStart()) - variant.getStart();
-
             // But we can really just use the referenceBases to do this:
-            final String altAlleleCodon = (referenceBases.substring(referenceWindow-2, referenceWindow) +
-                    strandCorrectedAltAllele +
-                    referenceBases.substring(referenceWindow+strandCorrectedAltAllele.length(), referenceWindow + 3)).substring(2 + alignedAlleleOffset,5 + alignedAlleleOffset);
-
-            // Check for de novo starts:
-            // NOTE: Subtract 1 to account for the 1-based, inclusive nature of genetic coordinates:
-            if ( FuncotatorUtils.getEukaryoticAminoAcidByCodon( altAlleleCodon ) == AminoAcid.METHIONINE ) {
-
-                // We know we have a new start.
-                // Is it in frame or out of frame?
-                if ( FuncotatorUtils.isInFrameWithEndOfRegion(codingStartPos, fivePrimeUtrCodingSequence.length()) ) {
+            final String rawAltUtrSubSequence = (referenceBases.substring(referenceWindow-numLeadingBasesForUtrAnnotationSequenceConstruction, referenceWindow) +
+                                    strandCorrectedAltAllele +
+                                    referenceBases.substring(referenceWindow + variant.getReference().length(), referenceWindow + numExtraTrailingBases));
+
+            // Check for de novo starts in the raw sequence:
+            boolean startFound = false;
+            int codingRegionOffset = -numLeadingBasesForUtrAnnotationSequenceConstruction;
+            for ( int i = 0; (i+3 < rawAltUtrSubSequence.length()) ; ++i ) {
+                startFound = FuncotatorUtils.getEukaryoticAminoAcidByCodon( rawAltUtrSubSequence.substring(i, i+3) ) == AminoAcid.METHIONINE;
+                if (startFound) {
+                    codingRegionOffset += i;
+                    break;
+                }
+            }
+            // If we found a start codon, we should set the variant classification as such:
+            if ( startFound ) {
+                if ( FuncotatorUtils.isInFrameWithEndOfRegion(codingStartPos + codingRegionOffset, fivePrimeUtrCodingSequence.length()) ) {
                     gencodeFuncotationBuilder.setVariantClassification(GencodeFuncotation.VariantClassification.DE_NOVO_START_IN_FRAME);
                 }
                 else {
                     gencodeFuncotationBuilder.setVariantClassification(GencodeFuncotation.VariantClassification.DE_NOVO_START_OUT_FRAME);
                 }
             }
+            // Just a normal 5' variant:
             else {
                 gencodeFuncotationBuilder.setVariantClassification(GencodeFuncotation.VariantClassification.FIVE_PRIME_UTR);
             }
@@ -1425,9 +1441,8 @@ static SequenceComparison createSequenceComparison(final VariantContext variant,
         if ( processSequenceInformation ) {
             if ( transcriptIdMap.containsKey(transcript.getTranscriptId()) ) {
 
-                final String transcriptSequence;
                 // NOTE: This can't be null because of the Funcotator input args.
-                transcriptSequence = getCodingSequenceFromTranscriptFasta(transcript.getTranscriptId(), transcriptIdMap, transcriptFastaReferenceDataSource);
+                final String transcriptSequence = getCodingSequenceFromTranscriptFasta(transcript.getTranscriptId(), transcriptIdMap, transcriptFastaReferenceDataSource);
 
                 // Get the transcript sequence as described by the given exonPositionList:
                 sequenceComparison.setTranscriptCodingSequence(new ReferenceSequence(transcript.getTranscriptId(), transcript.getStart(), transcriptSequence.getBytes()));

diff --git a/src/main/java/org/broadinstitute/hellbender/utils/codecs/gencode/GencodeGtfCodec.java b/src/main/java/org/broadinstitute/hellbender/utils/codecs/gencode/GencodeGtfCodec.java
@@ -63,7 +63,7 @@ final public class GencodeGtfCodec extends AbstractFeatureCodec<GencodeGtfFeatur
     static final Logger logger = LogManager.getLogger(GencodeGtfCodec.class);
 
     public static final int GENCODE_GTF_MIN_VERSION_NUM_INCLUSIVE = 19;
-    public static final int GENCODE_GTF_MAX_VERSION_NUM_INCLUSIVE = 27;
+    public static final int GENCODE_GTF_MAX_VERSION_NUM_INCLUSIVE = 28;
 
     public static final String GENCODE_GTF_FILE_EXTENSION = "gtf";
     public static final String GENCODE_GTF_FILE_PREFIX = "gencode";
@@ -610,7 +610,8 @@ static boolean validateHeader(final List<String> header, final boolean throwIfIn
             }
         }
 
-        if ( !header.get(2).endsWith("@sanger.ac.uk") ) {
+        if ( !header.get(2).endsWith("@sanger.ac.uk") &&
+             !header.get(2).endsWith("@ebi.ac.uk") ) {
             if ( throwIfInvalid ) {
                 throw new UserException.MalformedFile(
                         "GENCODE GTF Header line 2 does not contain expected contact email address (" +