Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Addressing issue #4712 #4770

Merged
merged 2 commits into from
May 18, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
263 changes: 263 additions & 0 deletions scripts/funcotator/data_sources/getGencode.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
#!/usr/bin/env bash

################################################################################

# Creates a set of files that allows for creating annotations based off dbSNP
# Pulled directly from the ncbi FTP site.

################################################################################

#Setup variables for the script:
SCRIPTDIR="$( cd -P "$( dirname "$0" )" && pwd )"
SCRIPTNAME=$( echo $0 | sed 's#.*/##g' )
MINARGS=0
MAXARGS=0

# Latest release numbers for our references.
# Update these numbers when a new Gencode is released.
LATEST_HG19_RELEASE=19
LATEST_HG38_RELEASE=28

DATA_SOURCE_NAME="Gencode"
OUT_DIR_NAME='gencode'

# Check if we have samtools installed so we can index our fasta files:
HAS_SAMTOOLS=false
which samtools &> /dev/null
r=$?
[[ $r -eq 0 ]] && HAS_SAMTOOLS=true

################################################################################

set -e

################################################################################

function simpleUsage()
{
echo -e "Usage: $SCRIPTNAME [OPTIONS] ..."
echo -e "Creates the data sources folder for gencode for the GATK Funcotator tool."
}

#Define a usage function:
function usage()
{
simpleUsage
echo -e ""
echo -e "Will download all data sources directly from the Gencode website:"
echo -e " https://www.gencodegenes.org"
echo -e ""
echo -e "Return values:"
echo -e " 0 NORMAL"
echo -e " 1 TOO MANY ARGUMENTS"
echo -e " 2 TOO FEW ARGUMENTS"
echo -e " 3 UNKNOWN ARGUMENT"
echo -e " 4 BAD CHECKSUM"
echo -e " 5 OUTPUT DIRECTORY ALREADY EXISTS"
echo -e ""
}

#Display a message to std error:
function error()
{
echo "$1" 2>&1
}

TMPFILELIST=''
function makeTemp()
{
local f
f=$( mktemp )
TMPFILELIST="${TMPFILELIST} $f"
echo $f
}

function cleanTempVars()
{
rm -f ${TMPFILELIST}
}

function at_exit()
{
cleanTempVars
}

################################################################################


function createConfigFile() {

local dataSourceName=$1
local version=$2
local srcFile=$3
local originLocation=$4
local fastaPath=$5

echo "name = ${dataSourceName}"
echo "version = ${version}"
echo "src_file = ${srcFile}"
echo "origin_location = ${originLocation}"
echo "preprocessing_script = ${SCRIPTNAME} , fixGencodeOrdering.py"
echo ""
echo "# Supported types:"
echo "# simpleXSV -- Arbitrary separated value table (e.g. CSV), keyed off Gene Name OR Transcript ID"
echo "# locatableXSV -- Arbitrary separated value table (e.g. CSV), keyed off a genome location"
echo "# gencode -- Custom datasource class for GENCODE"
echo "# cosmic -- Custom datasource class for COSMIC"
echo "# vcf -- Custom datasource class for Variant Call Format (VCF) files"
echo "type = gencode"
echo ""
echo "# Required field for GENCODE files."
echo "# Path to the FASTA file from which to load the sequences for GENCODE transcripts:"
echo "gencode_fasta_path = ${fastaPath}"
echo ""
echo "# Required field for simpleXSV files."
echo "# Valid values:"
echo "# GENE_NAME"
echo "# TRANSCRIPT_ID"
echo "xsv_key ="
echo ""
echo "# Required field for simpleXSV files."
echo "# The 0-based index of the column containing the key on which to match"
echo "xsv_key_column ="
echo ""
echo "# Required field for simpleXSV AND locatableXSV files."
echo "# The delimiter by which to split the XSV file into columns."
echo "xsv_delimiter ="
echo ""
echo "# Required field for simpleXSV files."
echo "# Whether to permissively match the number of columns in the header and data rows"
echo "# Valid values:"
echo "# true"
echo "# false"
echo "xsv_permissive_cols ="
echo ""
echo "# Required field for locatableXSV files."
echo "# The 0-based index of the column containing the contig for each row"
echo "contig_column ="
echo ""
echo "# Required field for locatableXSV files."
echo "# The 0-based index of the column containing the start position for each row"
echo "start_column ="
echo ""
echo "# Required field for locatableXSV files."
echo "# The 0-based index of the column containing the end position for each row"
echo "end_column ="
echo ""

}

function getGencodeFiles()
{
local version=$1
local refVersion=$2

echo "##################################################################################"
echo "Processing ${refVersion} - Gencode ${version} ..."
echo "===================================="

mkdir -p ${OUT_DIR_NAME}/${refVersion}
pushd ${OUT_DIR_NAME}/${refVersion} &> /dev/null
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_${version}/gencode.v${version}.annotation.gtf.gz
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_${version}/gencode.v${version}.pc_transcripts.fa.gz

gunzip gencode.v${version}.annotation.gtf.gz
gunzip gencode.v${version}.pc_transcripts.fa.gz

# We must fix the information in the gencode gtf file:
echo "Reordering Gencode GTF data ..."
${SCRIPTDIR}/fixGencodeOrdering.py gencode.v${version}.annotation.gtf > gencode.v${version}.annotation.REORDERED.gtf

# Clean up original file:
rm gencode.v${version}.annotation.gtf

echo "Creating config file ..."
createConfigFile "${DATA_SOURCE_NAME}" "${version}" "gencode.v${version}.annotation.REORDERED.gtf" "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_${version}/gencode.v${version}.annotation.gtf.gz" "gencode.v${version}.pc_transcripts.fa" > gencode.config

if $HAS_SAMTOOLS ; then
echo "Indexing Fasta File: gencode.v${version}.pc_transcripts.fa"
samtools faidx gencode.v${version}.pc_transcripts.fa
fi

echo
popd > /dev/null
}

################################################################################

trap at_exit EXIT

################################################################################

#Check given arguments:
if [[ $# -gt $MAXARGS ]] ; then
usage
exit 1
elif [[ $# -lt $MINARGS ]] ; then
usage
exit 2
fi

################################################################################

#Read args:
while [ $# -gt 0 ] ; do

case "$1" in
-h|--h|--help|-help|help)
usage;
exit 0;
;;
*)
error "${SCRIPTNAME}: invalid option: $1"
simpleUsage
echo "Try \`$SCRIPTNAME --help' for more information."
exit 3;
;;
esac

#Get next argument in $1:
shift
done

################################################################################
# Do the work here:

# Make sure we don't have anything in our out folder already:
if [[ -d ${OUT_DIR_NAME} ]] ; then
error "Output directory already exists: ${OUT_DIR_NAME} - aborting!"
exit 5
fi

# Get the link for HG19:
getGencodeFiles $LATEST_HG19_RELEASE hg19

# Get the link for HG38:
getGencodeFiles $LATEST_HG38_RELEASE hg38

if ! $HAS_SAMTOOLS ; then
echo -e "\033[1;33;40m##################################################################################\033[0;0m"
echo -e "\033[1;33;40m# #\033[0;0m"
echo -e "\033[1;33;40m# \033[1;5;37;41mWARNING\033[0;0m: You \033[4;37;40mMUST\033[0;0m index both Gencode Fasta files before using this data source \033[1;33;40m#\033[0;0m"
echo -e "\033[1;33;40m# Use samtools faidx <FASTA_FILE> #\033[0;0m"
echo -e "\033[1;33;40m# #\033[0;0m"
echo -e "\033[1;33;40m##################################################################################\033[0;0m"
fi


echo -e "\033[1;33;40m##################################################################################\033[0;0m"
echo -e "\033[1;33;40m# #\033[0;0m"
echo -e "\033[1;33;40m# \033[1;5;37;41mWARNING\033[0;0m: You \033[4;37;40mMUST\033[0;0m create sequence dictionaries for both Gencode Fasta files #\033[0;0m"
echo -e "\033[1;33;40m# before using this data source. #\033[0;0m"
echo -e "\033[1;33;40m# #\033[0;0m"
echo -e "\033[1;33;40m# Use gatk CreateSequenceDictionary <FASTA_FILE> #\033[0;0m"
echo -e "\033[1;33;40m# #\033[0;0m"
echo -e "\033[1;33;40m# \033[1;5;37;41mWARNING\033[0;0m: You \033[4;37;40mMUST\033[0;0m ALSO index the Gencode GTF files for both HG19 and HG38 #\033[0;0m"
echo -e "\033[1;33;40m# before using this data source. #\033[0;0m"
echo -e "\033[1;33;40m# #\033[0;0m"
echo -e "\033[1;33;40m# Use gatk IndexFeatureFile <GTF_FILE> #\033[0;0m"
echo -e "\033[1;33;40m# #\033[0;0m"
echo -e "\033[1;33;40m##################################################################################\033[0;0m"

exit 0

Original file line number Diff line number Diff line change
Expand Up @@ -589,12 +589,15 @@ static boolean validateVersionInformation(final int major, final int minor, fina
return false;
}

if ( month < MIN_MONTH_RELEASED ) {
return false;
}

if ( day < MIN_DAY_RELEASED ) {
return false;
else if ( year == MIN_YEAR_RELEASED ) {
if ( month < MIN_MONTH_RELEASED ) {
return false;
}
else if ( month == MIN_MONTH_RELEASED ) {
if ( day < MIN_DAY_RELEASED ) {
return false;
}
}
}

return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,18 @@ public class GencodeFuncotationFactory extends DataSourceFuncotationFactory {
*/
private static final int gcContentWindowSizeBases = 200;

/**
* The number of leading bases to include when building the variant sequence for UTR variants.
* Used to determine if there is a de novo start.
*/
private static final int numLeadingBasesForUtrAnnotationSequenceConstruction = 2;

/**
* The number of leading bases to include when building the variant sequence for UTR variants.
* * Used to determine if there is a de novo start.
*/
private static final int defaultNumTrailingBasesForUtrAnnotationSequenceConstruction = 3;

/**
* The window around a variant to include in the reference context annotation.
* Also used for context from which to get surrounding codon changes and protein changes.
Expand Down Expand Up @@ -1049,7 +1061,6 @@ private GencodeFuncotation createUtrFuncotation(final VariantContext variant,
// Get the strand-corrected alleles from the inputs.
// Also get the reference sequence for the variant region.
// (spanning the entire length of both the reference and the variant, regardless of which is longer).
final Allele strandCorrectedRefAllele = FuncotatorUtils.getStrandCorrectedAllele(variant.getReference(), strand);
final Allele strandCorrectedAltAllele = FuncotatorUtils.getStrandCorrectedAllele(altAllele, strand);
final String referenceBases = getReferenceBases(variant.getReference(), altAllele, reference, strand);

Expand All @@ -1074,32 +1085,37 @@ private GencodeFuncotation createUtrFuncotation(final VariantContext variant,
// Get the 5' UTR sequence here.
// Note: We grab 3 extra bases at the end (from the coding sequence) so that we can check for denovo starts
// even if the variant occurs in the last base of the UTR.
final int numExtraTrailingBases = variant.getReference().length() < defaultNumTrailingBasesForUtrAnnotationSequenceConstruction ? defaultNumTrailingBasesForUtrAnnotationSequenceConstruction : variant.getReference().length() + 1;
final String fivePrimeUtrCodingSequence =
getFivePrimeUtrSequenceFromTranscriptFasta( transcript.getTranscriptId(), transcriptIdMap, transcriptFastaReferenceDataSource, 3);
getFivePrimeUtrSequenceFromTranscriptFasta( transcript.getTranscriptId(), transcriptIdMap, transcriptFastaReferenceDataSource, numExtraTrailingBases);

final int codingStartPos = FuncotatorUtils.getStartPositionInTranscript(variant, activeRegions, strand);

// Get the number of bases we need to add to the start position of the variant to get it in frame:
final int alignedAlleleOffset = FuncotatorUtils.getAlignedPosition(variant.getStart()) - variant.getStart();

// But we can really just use the referenceBases to do this:
final String altAlleleCodon = (referenceBases.substring(referenceWindow-2, referenceWindow) +
strandCorrectedAltAllele +
referenceBases.substring(referenceWindow+strandCorrectedAltAllele.length(), referenceWindow + 3)).substring(2 + alignedAlleleOffset,5 + alignedAlleleOffset);

// Check for de novo starts:
// NOTE: Subtract 1 to account for the 1-based, inclusive nature of genetic coordinates:
if ( FuncotatorUtils.getEukaryoticAminoAcidByCodon( altAlleleCodon ) == AminoAcid.METHIONINE ) {

// We know we have a new start.
// Is it in frame or out of frame?
if ( FuncotatorUtils.isInFrameWithEndOfRegion(codingStartPos, fivePrimeUtrCodingSequence.length()) ) {
final String rawAltUtrSubSequence = (referenceBases.substring(referenceWindow-numLeadingBasesForUtrAnnotationSequenceConstruction, referenceWindow) +
strandCorrectedAltAllele +
referenceBases.substring(referenceWindow + variant.getReference().length(), referenceWindow + numExtraTrailingBases));

// Check for de novo starts in the raw sequence:
boolean startFound = false;
int codingRegionOffset = -numLeadingBasesForUtrAnnotationSequenceConstruction;
for ( int i = 0; (i+3 < rawAltUtrSubSequence.length()) ; ++i ) {
startFound = FuncotatorUtils.getEukaryoticAminoAcidByCodon( rawAltUtrSubSequence.substring(i, i+3) ) == AminoAcid.METHIONINE;
if (startFound) {
codingRegionOffset += i;
break;
}
}
// If we found a start codon, we should set the variant classification as such:
if ( startFound ) {
if ( FuncotatorUtils.isInFrameWithEndOfRegion(codingStartPos + codingRegionOffset, fivePrimeUtrCodingSequence.length()) ) {
gencodeFuncotationBuilder.setVariantClassification(GencodeFuncotation.VariantClassification.DE_NOVO_START_IN_FRAME);
}
else {
gencodeFuncotationBuilder.setVariantClassification(GencodeFuncotation.VariantClassification.DE_NOVO_START_OUT_FRAME);
}
}
// Just a normal 5' variant:
else {
gencodeFuncotationBuilder.setVariantClassification(GencodeFuncotation.VariantClassification.FIVE_PRIME_UTR);
}
Expand Down Expand Up @@ -1425,9 +1441,8 @@ static SequenceComparison createSequenceComparison(final VariantContext variant,
if ( processSequenceInformation ) {
if ( transcriptIdMap.containsKey(transcript.getTranscriptId()) ) {

final String transcriptSequence;
// NOTE: This can't be null because of the Funcotator input args.
transcriptSequence = getCodingSequenceFromTranscriptFasta(transcript.getTranscriptId(), transcriptIdMap, transcriptFastaReferenceDataSource);
final String transcriptSequence = getCodingSequenceFromTranscriptFasta(transcript.getTranscriptId(), transcriptIdMap, transcriptFastaReferenceDataSource);

// Get the transcript sequence as described by the given exonPositionList:
sequenceComparison.setTranscriptCodingSequence(new ReferenceSequence(transcript.getTranscriptId(), transcript.getStart(), transcriptSequence.getBytes()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ final public class GencodeGtfCodec extends AbstractFeatureCodec<GencodeGtfFeatur
static final Logger logger = LogManager.getLogger(GencodeGtfCodec.class);

public static final int GENCODE_GTF_MIN_VERSION_NUM_INCLUSIVE = 19;
public static final int GENCODE_GTF_MAX_VERSION_NUM_INCLUSIVE = 27;
public static final int GENCODE_GTF_MAX_VERSION_NUM_INCLUSIVE = 28;

public static final String GENCODE_GTF_FILE_EXTENSION = "gtf";
public static final String GENCODE_GTF_FILE_PREFIX = "gencode";
Expand Down Expand Up @@ -610,7 +610,8 @@ static boolean validateHeader(final List<String> header, final boolean throwIfIn
}
}

if ( !header.get(2).endsWith("@sanger.ac.uk") ) {
if ( !header.get(2).endsWith("@sanger.ac.uk") &&
!header.get(2).endsWith("@ebi.ac.uk") ) {
if ( throwIfInvalid ) {
throw new UserException.MalformedFile(
"GENCODE GTF Header line 2 does not contain expected contact email address (" +
Expand Down
Loading