Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cmd line to VCF generated by GATKSparkTool #4981

Merged
merged 5 commits into from
Jul 23, 2018
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 5 additions & 10 deletions src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFSimpleHeaderLine;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLinePluginDescriptor;
Expand Down Expand Up @@ -836,17 +835,13 @@ protected SAMFileHeader getHeaderForSAMWriter(){
* date and command line, otherwise an empty set.
*/
protected Set<VCFHeaderLine> getDefaultToolVCFHeaderLines() {
    if (addOutputVCFCommandLine) {
        // Delegate to the shared helper so that every caller of it emits the same
        // "source" and "<toolkit>CommandLine" header lines.
        return GATKVariantContextUtils
                .getDefaultVCFHeaderLines(getToolkitShortName(), this.getClass().getSimpleName(),
                        getVersion(), Utils.getDateTimeForDisplay((ZonedDateTime.now())), getCommandLine());
    } else {
        // Command line header output was disabled: contribute no header lines.
        return new HashSet<>();
    }
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,23 @@

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLinePluginDescriptor;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKAnnotationPluginDescriptor;
import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKReadFilterPluginDescriptor;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.*;
import org.broadinstitute.hellbender.engine.FeatureDataSource;
import org.broadinstitute.hellbender.engine.FeatureManager;
import org.broadinstitute.hellbender.engine.GATKTool;
import org.broadinstitute.hellbender.engine.TraversalParameters;
import org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource;
import org.broadinstitute.hellbender.engine.datasources.ReferenceWindowFunctions;
import org.broadinstitute.hellbender.engine.FeatureDataSource;
import org.broadinstitute.hellbender.engine.FeatureManager;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSink;
Expand All @@ -25,16 +28,16 @@
import org.broadinstitute.hellbender.utils.SequenceDictionaryUtils;
import org.broadinstitute.hellbender.utils.SerializableFunction;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadsWriteFormat;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.time.ZonedDateTime;
import java.util.*;

/**
* Base class for GATK spark tools that accept standard kinds of inputs (reads, reference, and/or intervals).
Expand Down Expand Up @@ -91,6 +94,9 @@ public abstract class GATKSparkTool extends SparkCommandLineProgram {
@ArgumentCollection
protected SequenceDictionaryValidationArgumentCollection sequenceDictionaryValidationArguments = getSequenceDictionaryValidationArgumentCollection();

/**
 * Command line switch, exposed under {@link StandardArgumentDefinitions#ADD_OUTPUT_VCF_COMMANDLINE},
 * that controls whether {@link #getDefaultToolVCFHeaderLines()} returns the tool header lines or an
 * empty set. Optional; defaults to {@code true}.
 */
@Argument(fullName = StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, shortName = StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, doc = "If true, adds a command line header line to created VCF files.", optional=true, common = true)
public boolean addOutputVCFCommandLine = true;

@Argument(doc = "For tools that write an output, write the output in multiple pieces (shards)",
fullName = SHARDED_OUTPUT_LONG_NAME,
optional = true,
Expand Down Expand Up @@ -404,6 +410,30 @@ public List<Class<? extends Annotation>> getDefaultVariantAnnotationGroups() {
return Collections.emptyList();
}

// TODO: 7/9/18 the two functions below (including the TODO in the first one's comment) are copy-pasted from GATKTool and should be refactored into a shared location
/**
 * Returns the abbreviated toolkit name used for this tool. Subclasses may override this to
 * supply a custom toolkit name.
 *
 * TODO: This should be refactored and moved up into CommandLineProgram, with this value
 * TODO: stored in the jar manifest, like {@link CommandLineProgram#getToolkitName}
 *
 * @return the short toolkit name, {@code "GATK"} unless overridden
 */
protected String getToolkitShortName() {
    return "GATK";
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, we might as well move this method up into CommandLineProgram now, adjacent to getToolkitName, along with a DEFAULT_TOOLKIT_SHORT_NAME static constant, and return that. Then we can remove the two identical implementations. Also, let's keep the second part of the TODO comment with the new code.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

refactored, not sure if they are what you mean?


/**
 * Produces the default VCF header lines for this tool.
 *
 * @return when {@code addOutputVCFCommandLine} is true, the header lines built by
 *         {@link GATKVariantContextUtils#getDefaultVCFHeaderLines} from the toolkit short name,
 *         this tool's simple class name, the version, the current date/time and the command
 *         line; otherwise a new empty set.
 */
protected Set<VCFHeaderLine> getDefaultToolVCFHeaderLines() {
    if (!addOutputVCFCommandLine) {
        return new HashSet<>();
    }

    // Gather the inputs in the same order in which they are passed to the helper.
    final String toolkitShortName = getToolkitShortName();
    final String toolName = this.getClass().getSimpleName();
    final String toolVersion = getVersion();
    final String timestamp = Utils.getDateTimeForDisplay(ZonedDateTime.now());
    final String toolCommandLine = getCommandLine();

    return GATKVariantContextUtils.getDefaultVCFHeaderLines(
            toolkitShortName, toolName, toolVersion, timestamp, toolCommandLine);
}

/**
* @see GATKTool#makeVariantAnnotations()
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,26 @@
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Locatable;
import htsjdk.tribble.AbstractFeatureReader;
import htsjdk.tribble.TribbleException;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFSimpleHeaderLine;
import htsjdk.variant.vcf.VCFStandardHeaderLines;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.tuple.MutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.tools.walkers.genotyper.*;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAlleleCounts;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAssignmentMethod;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeLikelihoodCalculator;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeLikelihoodCalculators;
import org.broadinstitute.hellbender.utils.*;
import org.broadinstitute.hellbender.utils.collections.Permutation;
import org.broadinstitute.hellbender.utils.genotyper.IndexedAlleleList;
import org.broadinstitute.hellbender.utils.param.ParamUtils;

import java.io.File;
Expand Down Expand Up @@ -49,6 +50,23 @@ public static boolean isInformative(final double[] gls) {
return MathUtils.sum(gls) < GATKVariantContextUtils.SUM_GL_THRESH_NOCALL;
}

/**
* @return A set of VCF header lines containing the tool name, version, date and command line.
*/
public static Set<VCFHeaderLine> getDefaultVCFHeaderLines(final String toolkitShortName, final String toolName,
final String versionString, final String dataTime,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dataTime -> dateTime

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, javadoc should include the param list with short descriptions

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

final String cmdLine) {
final Set<VCFHeaderLine> defaultVCFHeaderLines = new HashSet<>();
final Map<String, String> simpleHeaderLineMap = new HashMap<>(4);
simpleHeaderLineMap.put("ID", toolName);
simpleHeaderLineMap.put("Version", versionString);
simpleHeaderLineMap.put("Date", dataTime);
simpleHeaderLineMap.put("CommandLine", cmdLine);
defaultVCFHeaderLines.add(new VCFHeaderLine("source", toolName));
defaultVCFHeaderLines.add(new VCFSimpleHeaderLine(String.format("%sCommandLine", toolkitShortName), simpleHeaderLineMap));
return defaultVCFHeaderLines;
}

/**
* Creates a VariantContextWriter whose outputFile type is based on the extension of the output file name.
* The default options set by VariantContextWriter are cleared before applying ALLOW_MISSING_FIELDS_IN_HEADER (if
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package org.broadinstitute.hellbender.engine.spark;

import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFIDHeaderLine;
import org.apache.spark.api.java.JavaSparkContext;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.TestProgramGroup;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.util.Set;

/**
 * Unit tests for the default VCF header lines produced by {@link GATKSparkTool}.
 */
public class GATKSparkToolUnitTest extends GATKBaseTest {

    /**
     * Minimal Spark tool whose {@code runTool} does nothing, used only to exercise the
     * header line methods of {@link GATKSparkTool}.
     */
    @CommandLineProgramProperties(
            summary = "TestGATKSparkToolWithVariants",
            oneLineSummary = "TestGATKSparkToolWithVariants",
            programGroup = TestProgramGroup.class
    )
    public static class TestGATKSparkToolWithVariants extends GATKSparkTool {
        private static final long serialVersionUID = 0L;

        @Override
        protected void runTool(JavaSparkContext ctx) {
            //Do-Nothing
        }
    }

    /**
     * Runs the test tool with the add-output-VCF-command-line argument set to the given value.
     *
     * @param flagValue textual value passed for the argument, e.g. {@code "true"}
     * @return the tool instance after {@code instanceMain} has returned
     */
    private static TestGATKSparkToolWithVariants runToolWithCommandLineFlag(final String flagValue) {
        final TestGATKSparkToolWithVariants tool = new TestGATKSparkToolWithVariants();
        final String[] args = {"--" + StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, flagValue};
        tool.instanceMain(args);
        return tool;
    }

    /** With the flag enabled, both the source line and the command line header line are present. */
    @Test
    public void testGetDefaultToolVCFHeaderLines() {
        final TestGATKSparkToolWithVariants tool = runToolWithCommandLineFlag("true");

        final Set<VCFHeaderLine> stdHeaderLines = tool.getDefaultToolVCFHeaderLines();
        final VCFHeader hdr = new VCFHeader(stdHeaderLines);

        // Assert presence first so that a missing line fails the assertion instead of throwing an NPE.
        final VCFHeaderLine sourceLine = hdr.getOtherHeaderLine("source");
        Assert.assertNotNull(sourceLine);
        Assert.assertEquals(sourceLine.getValue(), tool.getClass().getSimpleName());

        final VCFIDHeaderLine commandLine = (VCFIDHeaderLine) hdr.getOtherHeaderLine("GATKCommandLine");
        Assert.assertNotNull(commandLine);
        Assert.assertEquals(commandLine.getID(), tool.getClass().getSimpleName());

        final String commandLineString = commandLine.toString();
        assertContains(commandLineString,"CommandLine=");
        assertContains(commandLineString,"Version=");
        assertContains(commandLineString,"Date=");
    }

    /** With the flag disabled, no default header lines are produced. */
    @Test
    public void testGetDefaultToolVCFHeaderLinesDisabled() {
        final TestGATKSparkToolWithVariants tool = runToolWithCommandLineFlag("false");

        Assert.assertTrue(tool.getDefaultToolVCFHeaderLines().isEmpty());
    }
}