Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow for hdfs and gcs URIs to be passed to GenomicsDB #5197

Merged
merged 30 commits into from
Oct 17, 2018
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
fd1e170
Allow for hdfs and gcs URI's to be passed to GenomicsDB
nalinigans Jul 16, 2018
61cd1d9
Push the URI processing for GenomicsDB to BucketUtils
nalinigans Jul 17, 2018
6fcacc2
Merge remote-tracking branch 'upstream/master'
nalinigans Jul 17, 2018
3c87e98
Refine appendPathToDir
nalinigans Jul 17, 2018
35bea3c
Merge remote-tracking branch 'upstream/master'
nalinigans Jul 29, 2018
46ef8aa
Move to 0.10.0-proto-3.0.0-beta-1+d392491bafcac337 for GenomicsDB as …
nalinigans Jul 29, 2018
2e0a66f
Merge to latest
nalinigans Aug 15, 2018
855cff7
Incorporate changes suggested by @droazen in PR 5017
nalinigans Aug 19, 2018
9d1ff22
Merge remote-tracking branch 'upstream/master'
nalinigans Aug 19, 2018
a11d7a2
Migrate FeatureInput.java to use the new gendb://, gendb.gs:// and ge…
nalinigans Aug 20, 2018
b8650de
Add GenomicsDB tests for writing into and reading from GCS
nalinigans Aug 23, 2018
924cfea
Merge remote-tracking branch 'upstream/master'
nalinigans Aug 23, 2018
edd5cbe
Merge remote-tracking branch 'upstream/master'
nalinigans Aug 27, 2018
3d35059
Merge remote-tracking branch 'upstream/master'
nalinigans Sep 1, 2018
d430bbd
Try test without gc
nalinigans Sep 1, 2018
cddf276
Refactor gendb URI processing to IOUtils and cleanup GenomicsDBTestUt…
nalinigans Sep 10, 2018
5f4f6c6
Merge remote-tracking branch 'upstream/master'
nalinigans Sep 10, 2018
3576665
Delete GCS folders recursively in GCS unit tests
nalinigans Sep 11, 2018
acc5928
Address last few code review comments
droazen Sep 18, 2018
cec82e1
Debug Genomicsdb cloud tests (#5216)
nalinigans Sep 24, 2018
2bf479e
Merge with master. Conflicts resolved in src/main/java/org/broadinsti…
nalinigans Oct 14, 2018
63a77b8
Address requested changes in PR 5197
nalinigans Oct 15, 2018
89e69f7
Merge branch 'master' of https://github.com/broadinstitute/gatk into …
nalinigans Oct 16, 2018
5030569
Try GenomicsDB GCS tests
nalinigans Oct 16, 2018
4aed19c
Use java.nio instead of FileReader as GenomicsDB accepts gcs and hdfs…
nalinigans Oct 17, 2018
87a7ddc
Remove debug test
nalinigans Oct 17, 2018
dd6d5ac
Fix if statement indentation
droazen Oct 17, 2018
ce54a01
Delete stray import
droazen Oct 17, 2018
d7220cd
Delete another stray import
droazen Oct 17, 2018
48ab590
Fix test code formatting
droazen Oct 17, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,13 @@
import java.io.File;
import java.io.IOException;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;



/**
* Enables traversals and queries over sources of Features, which are metadata associated with a location
* on the genome in a format supported by our file parsing framework, Tribble. Examples of Features are
Expand Down Expand Up @@ -255,7 +254,7 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
// a query by interval is attempted.
this.featureReader = getFeatureReader(featureInput, targetFeatureType, cloudWrapper, cloudIndexWrapper, reference);

if (isGenomicsDBPath(featureInput.getFeaturePath())) {
if (IOUtils.isGenomicsDBPath(featureInput.getFeaturePath())) {
//genomics db uri's have no associated index file to read from, but they do support random access
this.hasIndex = false;
this.supportsRandomAccess = true;
Expand All @@ -277,12 +276,13 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
this.queryLookaheadBases = queryLookaheadBases;
}


@SuppressWarnings("unchecked")
private static <T extends Feature> FeatureReader<T> getFeatureReader(final FeatureInput<T> featureInput, final Class<? extends Feature> targetFeatureType,
final Function<SeekableByteChannel, SeekableByteChannel> cloudWrapper,
final Function<SeekableByteChannel, SeekableByteChannel> cloudIndexWrapper,
final Path reference) {
if (isGenomicsDBPath(featureInput.getFeaturePath())) {
if (IOUtils.isGenomicsDBPath(featureInput.getFeaturePath())) {
try {
if (reference == null) {
throw new UserException.MissingReference("You must provide a reference if you want to load from GenomicsDB");
Expand Down Expand Up @@ -350,36 +350,22 @@ private static <T extends Feature> FeatureReader<T> getFeatureReader(final Featu
}

protected static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final String path, final File reference) {
if (!isGenomicsDBPath(path)) {
throw new IllegalArgumentException("Trying to create a GenomicsDBReader from a non-GenomicsDB input");
}

final String noheader = path.replace(GENOMIC_DB_URI_SCHEME, "");
final File workspace = new File(noheader);
final File callsetJson = new File(noheader, GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME);
final File vidmapJson = new File(noheader, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME);
final File vcfHeader = new File(noheader, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME);

if (!workspace.exists() || !workspace.canRead() || !workspace.isDirectory()) {
throw new UserException("GenomicsDB workspace " + workspace.getAbsolutePath() + " does not exist, " +
" is not readable, or is not a directory");
final String workspace = IOUtils.getGenomicsDBAbsolutePath(path) ;
if (workspace == null) { throw new IllegalArgumentException("Trying to create a GenomicsDBReader from non-GenomicsDB inputpath " + path);
droazen marked this conversation as resolved.
Show resolved Hide resolved
} else if (Files.notExists(IOUtils.getPath(workspace))) {
throw new UserException("GenomicsDB workspace " + path + " does not exist");
}

try {
IOUtils.canReadFile(callsetJson);
IOUtils.canReadFile(vidmapJson);
IOUtils.canReadFile(vcfHeader);
} catch (final UserException.CouldNotReadInputFile e) {
throw new UserException("Couldn't connect to GenomicsDB because the vidmap, callset JSON files, or gVCF Header (" +
GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME + "," + GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME + "," +
GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME + ") could not be read from GenomicsDB workspace " + workspace.getAbsolutePath(), e);
}
final String callsetJson = IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME);
final String vidmapJson = IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME);
final String vcfHeader = IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME);

final GenomicsDBExportConfiguration.ExportConfiguration exportConfiguration =
createExportConfiguration(reference, workspace, callsetJson, vidmapJson, vcfHeader);
IOUtils.assertPathsAreReadable(callsetJson, vidmapJson, vcfHeader);

try {
return new GenomicsDBFeatureReader<>(exportConfiguration, new BCF2Codec(), Optional.empty());
final GenomicsDBExportConfiguration.ExportConfiguration exportConfigurationBuilder =
createExportConfiguration(reference, workspace, callsetJson, vidmapJson, vcfHeader);
return new GenomicsDBFeatureReader<>(exportConfigurationBuilder, new BCF2Codec(), Optional.empty());
} catch (final IOException e) {
throw new UserException("Couldn't create GenomicsDBFeatureReader", e);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import htsjdk.tribble.Feature;
import htsjdk.tribble.FeatureCodec;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;

Expand Down Expand Up @@ -245,8 +244,8 @@ public void setFeatureCodecClass(final Class<FeatureCodec<T, ?>> featureCodecCla
* creates a name from the given filePath by finding the absolute path of the given input
*/
private static String makeIntoAbsolutePath(final String filePath){
if(GenomicsDBUtils.isGenomicsDBPath(filePath)){
return GenomicsDBUtils.GENOMIC_DB_URI_SCHEME + new File(filePath.replace(GenomicsDBUtils.GENOMIC_DB_URI_SCHEME,"")).getAbsolutePath();
if(IOUtils.isGenomicsDBPath(filePath)){
return IOUtils.getAbsolutePathWithGenomicsDBURIScheme(filePath);
} else if (URI.create(filePath).getScheme() != null) {
return IOUtils.getPath(filePath).toAbsolutePath().toUri().toString();
} else {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.broadinstitute.hellbender.tools.genomicsdb;

import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.intel.genomicsdb.GenomicsDBUtils;
import com.intel.genomicsdb.importer.GenomicsDBImporter;
import com.intel.genomicsdb.importer.model.ChromosomeInterval;
import com.intel.genomicsdb.model.Coordinates;
Expand Down Expand Up @@ -33,11 +34,11 @@
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.nio.SeekableByteChannelPrefetcher;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;

import java.io.File;
import java.io.IOException;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
Expand Down Expand Up @@ -298,13 +299,13 @@ public int getDefaultCloudIndexPrefetchBufferSize() {
private SAMSequenceDictionary mergedHeaderSequenceDictionary;

// Path to vidmap file to be written by GenomicsDBImporter
private File vidMapJSONFile;
private String vidMapJSONFile;

// Path to callsetmap file to be written by GenomicsDBImporter
private File callsetMapJSONFile;
private String callsetMapJSONFile;

// Path to combined VCF header file to be written by GenomicsDBImporter
private File vcfHeaderFile;
private String vcfHeaderFile;

// GenomicsDB callset map protobuf structure containing all callset names
// used to write the callset json file on traversal success
Expand Down Expand Up @@ -475,17 +476,15 @@ public static SortedMap<String, URI> loadSampleNameMapFileInSortedOrder(final Pa
*/
@Override
public void onTraversalStart() {

final File workspaceDir = overwriteOrCreateWorkspace();

vidMapJSONFile = new File(workspaceDir + "/" + GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME);
callsetMapJSONFile = new File(workspaceDir + "/" + GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME);
vcfHeaderFile = new File(workspaceDir + "/" + GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME);
String workspaceDir = BucketUtils.makeFilePathAbsolute(overwriteOrCreateWorkspace());
vidMapJSONFile = IOUtils.appendPathToDir(workspaceDir, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME);
callsetMapJSONFile = IOUtils.appendPathToDir(workspaceDir, GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME);
vcfHeaderFile = IOUtils.appendPathToDir(workspaceDir, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME);

logger.info("Vid Map JSON file will be written to " + vidMapJSONFile);
logger.info("Callset Map JSON file will be written to " + callsetMapJSONFile);
logger.info("Complete VCF Header will be written to " + vcfHeaderFile);
logger.info("Importing to array - " + workspace + "/" + GenomicsDBConstants.DEFAULT_ARRAY_NAME);
logger.info("Importing to array - " + workspaceDir + "/" + GenomicsDBConstants.DEFAULT_ARRAY_NAME);

initializeInputPreloadExecutorService();
}
Expand Down Expand Up @@ -547,9 +546,9 @@ private ImportConfig createImportConfig(final int batchSize) {
importConfigurationBuilder.setConsolidateTiledbArrayAfterLoad(doConsolidation);
ImportConfig importConfig = new ImportConfig(importConfigurationBuilder.build(), validateSampleToReaderMap, true,
batchSize, mergedHeaderLines, sampleNameToVcfPath, this::createSampleToReaderMap);
importConfig.setOutputCallsetmapJsonFile(callsetMapJSONFile.getAbsolutePath());
importConfig.setOutputVidmapJsonFile(vidMapJSONFile.getAbsolutePath());
importConfig.setOutputVcfHeaderFile(vcfHeaderFile.getAbsolutePath());
importConfig.setOutputCallsetmapJsonFile(callsetMapJSONFile);
importConfig.setOutputVidmapJsonFile(vidMapJSONFile);
importConfig.setOutputVcfHeaderFile(vcfHeaderFile);
importConfig.setUseSamplesInOrder(true);
importConfig.setFunctionToCallOnBatchCompletion(this::logMessageOnBatchCompletion);
return importConfig;
Expand Down Expand Up @@ -712,26 +711,22 @@ public VariantContext next() {
*
* @return The workspace directory
*/
private File overwriteOrCreateWorkspace() {
final File workspaceDir = new File(workspace);

if (overwriteExistingWorkspace) {
IOUtils.tryDelete(workspaceDir);
}

if (!workspaceDir.exists()) {
final int ret = GenomicsDBUtils.createTileDBWorkspace(workspaceDir.getAbsolutePath(), false);
if (ret > 0) {
checkIfValidWorkspace(workspaceDir);
logger.info("Importing data to GenomicsDB workspace: " + workspaceDir);
} else if (ret < 0) {
throw new UnableToCreateGenomicsDBWorkspace("Error creating GenomicsDB workspace: " + workspaceDir);
}
return workspaceDir;
private String overwriteOrCreateWorkspace() {
String workspaceDir = BucketUtils.makeFilePathAbsolute(workspace);
// From JavaDoc for GenomicsDBUtils.createTileDBWorkspace
// returnCode = 0 : OK. If overwriteExistingWorkspace is true and the workspace exists, it is deleted first.
// returnCode = -1 : path was not a directory
// returnCode = -2 : failed to create workspace
// returnCode = 1 : if overwriteExistingWorkspace is false, return 1 if directory already exists
int returnCode = GenomicsDBUtils.createTileDBWorkspace(workspaceDir, overwriteExistingWorkspace);
if (returnCode == -1) {
throw new UnableToCreateGenomicsDBWorkspace("Error creating GenomicsDB workspace: " + workspace + " already exists and is not a directory");
} else if (returnCode < 0) {
throw new UnableToCreateGenomicsDBWorkspace("Error creating GenomicsDB workspace: " + workspace);
nalinigans marked this conversation as resolved.
Show resolved Hide resolved
} else if (!overwriteExistingWorkspace && returnCode == 1) {
throw new UnableToCreateGenomicsDBWorkspace("Error creating GenomicsDB workspace: " + workspace + " already exists");
} else {
throw new UnableToCreateGenomicsDBWorkspace("The workspace you're trying to create already exists. ( " + workspaceDir.getAbsolutePath() + " ) " +
"Writing into an existing workspace can cause data corruption. " +
"Please choose an output path that doesn't already exist. ");
return workspaceDir;
}
}

Expand All @@ -743,13 +738,6 @@ static class UnableToCreateGenomicsDBWorkspace extends UserException {
}
}

/**
 * Verifies that the given directory is a valid GenomicsDB (TileDB) workspace by
 * checking for the TileDB workspace marker file inside it.
 *
 * @param workspaceDir directory expected to contain a TileDB workspace
 * @throws UserException if the marker file is absent from the directory
 */
private static void checkIfValidWorkspace(final File workspaceDir) {
    final String workspacePath = workspaceDir.getAbsolutePath();
    // TileDB drops this marker file into every workspace it creates.
    final File markerFile = new File(workspacePath + "/__tiledb_workspace.tdb");
    if (markerFile.exists()) {
        return;
    }
    throw new UserException(workspacePath + " is not a valid GenomicsDB workspace");
}

/**
* Loads our intervals using the best available sequence
* dictionary (as returned by {@link #getBestAvailableSequenceDictionary})
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,6 @@
*/
public class GenomicsDBUtils {

/** Scheme prefix that identifies a path as a GenomicsDB URI. */
public static final String GENOMIC_DB_URI_SCHEME = "gendb://";

/**
 * Tests whether the supplied path string is a GenomicsDB URI.
 *
 * @param path String containing the path to test (may be null)
 * @return true if the path starts with the GenomicsDB URI scheme, otherwise false
 */
public static boolean isGenomicsDBPath(final String path) {
    if (path == null) {
        return false;
    }
    return path.startsWith(GENOMIC_DB_URI_SCHEME);
}

/**
*
* @param reference reference sequence
Expand All @@ -50,21 +37,21 @@ public static boolean isGenomicsDBPath(final String path) {
* @param vcfHeader VCF with header information to use for header lines on export
* @return a configuration to determine the output format when the GenomicsDB is queried
*/
public static GenomicsDBExportConfiguration.ExportConfiguration createExportConfiguration(final File reference, final File workspace,
final File callsetJson, final File vidmapJson,
final File vcfHeader) {
public static GenomicsDBExportConfiguration.ExportConfiguration createExportConfiguration(final File reference, final String workspace,
final String callsetJson, final String vidmapJson,
final String vcfHeader) {
final GenomicsDBExportConfiguration.ExportConfiguration.Builder exportConfigurationBuilder =
GenomicsDBExportConfiguration.ExportConfiguration.newBuilder()
.setWorkspace(workspace.getAbsolutePath())
.setWorkspace(workspace)
.setReferenceGenome(reference.getAbsolutePath())
.setVidMappingFile(vidmapJson.getAbsolutePath())
.setCallsetMappingFile(callsetJson.getAbsolutePath())
.setVcfHeaderFilename(vcfHeader.getAbsolutePath())
.setVidMappingFile(vidmapJson)
.setCallsetMappingFile(callsetJson)
.setVcfHeaderFilename(vcfHeader)
.setProduceGTField(false)
.setProduceGTWithMinPLValueForSpanningDeletions(false)
.setSitesOnlyQuery(false)
.setMaxDiploidAltAllelesThatCanBeGenotyped(GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED);
final Path arrayFolder = Paths.get(workspace.getAbsolutePath(), GenomicsDBConstants.DEFAULT_ARRAY_NAME).toAbsolutePath();
final Path arrayFolder = Paths.get(workspace, GenomicsDBConstants.DEFAULT_ARRAY_NAME).toAbsolutePath();

// For the multi-interval support, we create multiple arrays (directories) in a single workspace -
// one per interval. So, if you wish to import intervals ("chr1", [ 1, 100M ]) and ("chr2", [ 1, 100M ]),
Expand Down Expand Up @@ -121,7 +108,7 @@ public static GenomicsDBExportConfiguration.ExportConfiguration createExportConf
* @param vidmapJson vid JSON file
* @return Protobuf object
*/
public static GenomicsDBVidMapProto.VidMappingPB getProtobufVidMappingFromJsonFile(final File vidmapJson)
public static GenomicsDBVidMapProto.VidMappingPB getProtobufVidMappingFromJsonFile(final String vidmapJson)
throws IOException {
final GenomicsDBVidMapProto.VidMappingPB.Builder vidMapBuilder = GenomicsDBVidMapProto.VidMappingPB.newBuilder();
try (final FileReader reader = new FileReader(vidmapJson)) {
Expand Down Expand Up @@ -177,4 +164,5 @@ public static GenomicsDBVidMapProto.VidMappingPB updateINFOFieldCombineOperation
}
return null;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@
import java.io.*;
import java.nio.file.Files;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.UUID;
import java.util.stream.Collectors;

/**
* Utilities for dealing with google buckets.
Expand Down Expand Up @@ -173,6 +176,21 @@ public static void deleteFile(String pathToDelete) throws IOException {
}
}

/**
 * Deletes rootPath recursively using NIO.2.
 *
 * @param rootPath the file/directory to be deleted; may be a local path or a URI string
 *                 resolvable by {@code IOUtils.getPath} (e.g. a gcs path)
 * @throws IOException if the tree walk or a deletion fails
 */
public static void deleteRecursively(final String rootPath) throws IOException {
    // Files.walk returns a Stream backed by open directory handles; it must be
    // closed, so collect the paths inside try-with-resources before deleting.
    final List<java.nio.file.Path> pathsToDelete;
    try (final java.util.stream.Stream<java.nio.file.Path> walker = Files.walk(IOUtils.getPath(rootPath))) {
        // Reverse order deletes children before their parent directories.
        pathsToDelete = walker.sorted(Comparator.reverseOrder()).collect(Collectors.toList());
    }
    for (final java.nio.file.Path path : pathsToDelete) {
        Files.deleteIfExists(path);
    }
}

/**
* Get a temporary file path based on the prefix and extension provided.
* This file (and possible indexes associated with it) will be scheduled for deletion on shutdown
Expand Down Expand Up @@ -209,7 +227,11 @@ public static void deleteOnExit(String fileToDelete){
@Override
public void run() {
try {
deleteFile(fileToDelete);
if (Files.isDirectory(IOUtils.getPath(fileToDelete))) {
deleteRecursively(fileToDelete);
} else {
deleteFile(fileToDelete);
}
} catch (IOException e) {
logger.warn("Failed to delete file: " + fileToDelete+ ".", e);
}
Expand Down
Loading