Skip to content

Commit

Permalink
Merge pull request pnrobinson#16 from pnrobinson/develop
Browse files Browse the repository at this point in the history
improving Jannovar output
  • Loading branch information
pnrobinson authored Oct 9, 2021
2 parents 2f5c465 + 3a6bb0c commit 4f856c3
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 22 deletions.
17 changes: 16 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,23 @@ The remaining data in the VCF file is transmitted as is.
If the user chooses the ``--annot`` option, then the VCF file is additionally annotated
with transcript level annotations for each variant. For instance,

.. code-block:: bash
$ java -jar target/vcf2omop.jar omop \
--stage stage_genomic.csv \
--vcf src/main/resources/sample-hg19.vcf \
--annot
This command will add Jannovar annotations such as the following to the INFO fields of variants: ::

JANNOVAR=ENST00000507810.1(EPB41L4A):n.953-3219A>C (non_coding_transcript_intron_variant)
JANNOVAR=intergenic_variant
JANNOVAR=ENST00000257430.4(APC):c.136-230C>A (p.(=);coding_transcript_intron_variant)
JANNOVAR=ENST00000231136.1(PCDHB6):c.1908C>G (p.(H636Q);missense_variant)
JANNOVAR=ENST00000434307.2(SPAG11A):c.310T>G (p.(*104Eext*20);stop_lost)
JANNOVAR=ENST00000230354.6(TBP):c.213_215dup (p.(Q95dup);disruptive_inframe_insertion)

to do improve Jannovar annotations.
These annotations will be added to all variants (not just those with OMOP annotations).


Generate synonyms
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>org.monarchinitiative.omop</groupId>
<artifactId>vcf2omop</artifactId>
<version>0.6.1</version>
<version>0.6.2</version>

<name>vcf2omop</name>
<url>https://github.com/pnrobinson/vcf2omop</url>
Expand Down
100 changes: 80 additions & 20 deletions src/main/java/org/monarchinitiative/omop/analysis/Omopulator.java
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
package org.monarchinitiative.omop.analysis;

import com.google.common.collect.ImmutableMap;
import de.charite.compbio.jannovar.annotation.Annotation;
import de.charite.compbio.jannovar.annotation.VariantAnnotations;
import de.charite.compbio.jannovar.annotation.VariantAnnotator;
import de.charite.compbio.jannovar.annotation.builders.AnnotationBuilderOptions;
import de.charite.compbio.jannovar.data.*;
import de.charite.compbio.jannovar.htsjdk.VariantContextAnnotator;
import de.charite.compbio.jannovar.reference.GenomePosition;
import de.charite.compbio.jannovar.reference.GenomeVariant;
import de.charite.compbio.jannovar.reference.PositionType;
import de.charite.compbio.jannovar.reference.Strand;
import htsjdk.tribble.TribbleException;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
Expand All @@ -20,7 +27,6 @@

import java.io.*;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
Expand All @@ -34,21 +40,26 @@ public class Omopulator {

private final String vcfFilePath;

private final List<OmopAnnotatedVariant> variantAnnotations;

private final Map<VcfVariant, Integer> variant2omopIdMap;

private final VariantContextAnnotator variantEffectAnnotator;
//in cases where a variant cannot be positioned on a chromosome we're going to use 0 in order to fulfil the
//requirement of a variant having an integer chromosome
private static final int UNKNOWN_CHROMOSOME = 0;

private final ReferenceDictionary referenceDictionary;
private final VariantAnnotator variantAnnotator;

private static final String OMOP_FLAG_FIELD_NAME = "OMOP";

private static final String JANNOVAR_FLAG_FIELD_NAME = "ANN";
private static final String JANNOVAR_FLAG_FIELD_NAME = "JANNOVAR";

private final boolean transcriptAnnotations;




/**
* @param jannovarPath Path to Jannovar transcript file (use download command to get them in the data subdirectory)
* @param jannovarPath Path to Jannovar transcript file (use download command to get them in the data subdirectory)
* @param vcfPath path to input VCF file
* @param assembly Must be one of GRCh19 or GRCh38
* @param stagedVariantList variants contained in the OMOP list
Expand All @@ -57,6 +68,10 @@ public class Omopulator {
public Omopulator(String jannovarPath, String vcfPath, String assembly, List<OmopStagedVariant> stagedVariantList, boolean annotations) {
variant2omopIdMap = new HashMap<>();
this.transcriptAnnotations = annotations;



// this.jannovarVariantAnnotator = new JannovarVariantAnnotator(genomeAssembly, jannovarData, emptyRegionIndex);
for (OmopStagedVariant e : stagedVariantList) {
variant2omopIdMap.put(e.toVcfVariant(), e.getOmopId());
}
Expand All @@ -67,17 +82,15 @@ public Omopulator(String jannovarPath, String vcfPath, String assembly, List<Omo
}
System.out.printf("[INFO] We ingested %d transcripts from %s.\n",
this.jannovarData.getTmByAccession().size(), jannovarPath);
File f = new File(vcfPath);
this.referenceDictionary = this.jannovarData.getRefDict();
this.variantAnnotator = new VariantAnnotator(jannovarData.getRefDict(), jannovarData.getChromosomes(), new AnnotationBuilderOptions());
File f = new File(vcfPath);
if (!f.exists()) {
throw new RuntimeException("Could not find VCF file at " + vcfPath);
}
ReferenceDictionary refDict = jannovarData.getRefDict();
ImmutableMap<Integer, Chromosome> chromosomeMap = jannovarData.getChromosomes();
this.vcfFilePath = f.getAbsolutePath();
this.variantAnnotations = new ArrayList<>();
this.variantEffectAnnotator =
new VariantContextAnnotator(refDict, chromosomeMap,
new VariantContextAnnotator.Options());
}

/**
Expand All @@ -89,10 +102,6 @@ public Omopulator(String jannovarPath, String vcfPath, String assembly, List<Omo
*/
private Function<VariantContext, VariantContext> addInfoFields() {
return vc -> {
// first do Jannovar annotations
if (transcriptAnnotations) {
vc = variantEffectAnnotator.annotateVariantContext(vc);
}
VariantContextBuilder builder = new VariantContextBuilder(vc);
// now look for OMOP matches
String contig = vc.getContig();
Expand All @@ -105,11 +114,66 @@ private Function<VariantContext, VariantContext> addInfoFields() {
int omopId = variant2omopIdMap.get(variant);
builder.attribute(OMOP_FLAG_FIELD_NAME, String.valueOf(omopId));
}
if (transcriptAnnotations) {
VariantAnnotations annots = annotateVariant(contig, start, ref, alt);
String mostPathogenicAnnot = getSimpleAnnotationString(annots);
System.out.println(mostPathogenicAnnot);
builder.attribute(JANNOVAR_FLAG_FIELD_NAME, mostPathogenicAnnot);
}
}
return builder.make();
};
}

private String getSimpleAnnotationString(VariantAnnotations annots) {
Annotation highest = annots.getHighestImpactAnnotation();
if (highest.getMostPathogenicVarType().isOffTranscript()) {
// intergenic
return highest.getMostPathogenicVarType().getSequenceOntologyTerm();
}
String transcript = highest.getTranscript().getAccession();
String geneSymbol = highest.getGeneSymbol();
String cdsChange = highest.getCDSNTChangeStr();
String effect = highest.getMostPathogenicVarType().getSequenceOntologyTerm();
if (! highest.getTranscript().isCoding()) {
return String.format("%s(%s):%s (%s)", transcript, geneSymbol, cdsChange, effect);
}
String proteinChange = highest.getProteinChangeStr();
return String.format("%s(%s):%s (%s;%s)", transcript, geneSymbol, cdsChange, proteinChange, effect);
}

/**
* Takes VCF (forward-strand, one-based) style variants and returns a set of Jannovar {@link VariantAnnotations}.
*
* @param contig contig (chromosome) where the variant is location
* @param pos variant position on the contig
* @param ref reference sequence
* @param alt alternate sequence
* @return a set of {@link VariantAnnotations} for the given variant coordinates. CAUTION! THE RETURNED ANNOTATIONS
* WILL USE ZERO-BASED COORDINATES AND WILL BE TRIMMED LEFT SIDE FIRST, ie. RIGHT SHIFTED. This is counter to VCF
* conventions.
*/
public VariantAnnotations annotateVariant(String contig, int pos, String ref, String alt) {
int chr = referenceDictionary.getContigNameToID().getOrDefault(contig, UNKNOWN_CHROMOSOME);
GenomePosition genomePosition = new GenomePosition(referenceDictionary, Strand.FWD, chr, pos, PositionType.ONE_BASED);
GenomeVariant genomeVariant = new GenomeVariant(genomePosition, ref, alt);
if (chr == UNKNOWN_CHROMOSOME) {
return VariantAnnotations.buildEmptyList(genomeVariant);
}
try {
return variantAnnotator.buildAnnotations(genomeVariant);
} catch (Exception e) {
logger.debug("Unable to annotate variant {}-{}-{}-{}",
genomeVariant.getChrName(),
genomeVariant.getPos(),
genomeVariant.getRef(),
genomeVariant.getAlt(),
e);
}
return VariantAnnotations.buildEmptyList(genomeVariant);
}



/**
* Add OMOP annotations to matching variants in a VCF file by adding corresponding annotations to the
Expand Down Expand Up @@ -164,8 +228,4 @@ private VCFHeader prepareVcfHeader(Path inputVcfPath) {
return header;
}

public List<OmopAnnotatedVariant> getVariantAnnotations() {
return this.variantAnnotations;
}

}

0 comments on commit 4f856c3

Please sign in to comment.