Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated datasource version parsing #5149

Merged
merged 2 commits into from
Aug 31, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,14 @@ private DataSourceUtils() {}
private static final PathMatcher configFileMatcher =
FileSystems.getDefault().getPathMatcher("glob:**/*.config");

private static final String README_VERSION_LINE_START = "Version:";
private static final String README_SOURCE_LINE_START = "Source:";
private static final String README_ALT_SOURCE_LINE_START = "Alternate Source:";
private static final Pattern VERSION_PATTERN = Pattern.compile(README_VERSION_LINE_START + "\\s+(\\d+)\\.(\\d+)\\.(\\d\\d\\d\\d)(\\d\\d)(\\d\\d)");
private static final Pattern SOURCE_PATTERN = Pattern.compile(README_SOURCE_LINE_START + "\\s+(ftp.*)");
private static final Pattern ALT_SOURCE_PATTERN = Pattern.compile(README_ALT_SOURCE_LINE_START + "\\s+(gs.*)");
@VisibleForTesting
static final String MANIFEST_VERSION_LINE_START = "Version:";
private static final String MANIFEST_SOURCE_LINE_START = "Source:";
private static final String MANIFEST_ALT_SOURCE_LINE_START = "Alternate Source:";
@VisibleForTesting
static final Pattern VERSION_PATTERN = Pattern.compile(MANIFEST_VERSION_LINE_START + "\\s+(\\d+)\\.(\\d+)\\.(\\d\\d\\d\\d)(\\d\\d)(\\d\\d)(.*)");
private static final Pattern SOURCE_PATTERN = Pattern.compile(MANIFEST_SOURCE_LINE_START + "\\s+(ftp.*)");
private static final Pattern ALT_SOURCE_PATTERN = Pattern.compile(MANIFEST_ALT_SOURCE_LINE_START + "\\s+(gs.*)");

// Track our minimum version number here:
@VisibleForTesting
Expand All @@ -63,19 +65,18 @@ private DataSourceUtils() {}
@VisibleForTesting
static final int MIN_YEAR_RELEASED = 2018;
@VisibleForTesting
static final int MIN_MONTH_RELEASED = 6;
static final int MIN_MONTH_RELEASED = 8;
@VisibleForTesting
static final int MIN_DAY_RELEASED = 15;
static final int MIN_DAY_RELEASED = 29;

//==================================================================================================================
// Public Static Members:

/** The minimum version of the data sources required for funcotator to run. */
public static final String CURRENT_MINIMUM_DATA_SOURCE_VERSION = String.format("v%d.%d.%d%02d%02d", MIN_MAJOR_VERSION_NUMBER, MIN_MINOR_VERSION_NUMBER, MIN_YEAR_RELEASED, MIN_MONTH_RELEASED, MIN_DAY_RELEASED);
public static final String README_FILE_NAME = "README.txt";
public static final String MANIFEST_FILE_NAME = "MANIFEST.txt";
Copy link
Contributor

@droazen droazen Aug 31, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you just rename README.txt to MANIFEST.txt, or is this a more substantial change? Ie., is MANIFEST.txt now a file with a well-defined structured format, or is it still basically a textual README with another name?

Copy link
Collaborator Author

@jonn-smith jonn-smith Aug 31, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@droazen I added a MANIFEST.txt file to the packages. MANIFEST.txt contains the version and provenance information for each data source package, as well as a new decorator/information field to distinguish somatic and germline files. This version information is duplicated in the readme, but the readme is not parsed.

MANIFEST.txt a plain-text formatted file with 1 field per line. The formatting is loosely enforced with the regex in DataSourceUtils and is of the syntax:

<FIELD>: <VALUE>

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, sounds good @jonn-smith, thanks for the clarification.

public static final String DATA_SOURCES_FTP_PATH = "ftp://[email protected]/bundle/funcotator/";
public static final String DATA_SOURCES_BUCKET_PATH = "gs://broad-public-datasets/funcotator/";

public static final String CONFIG_FILE_FIELD_NAME_NAME = "name";
public static final String CONFIG_FILE_FIELD_NAME_VERSION = "version";
public static final String CONFIG_FILE_FIELD_NAME_SRC_FILE = "src_file";
Expand All @@ -100,7 +101,7 @@ private DataSourceUtils() {}
* @param dataSourceDirectories A {@link List} of {@link Path} to the directories containing our data sources. Must not be {@code null}.
* @return The contents of the config files for each of the data sources found in the given {@code dataSourceDirectories}.
*/
public static Map<Path, Properties> getAndValidateDataSourcesFromPaths(final String refVersion,
public static Map<Path, Properties> getAndValidateDataSourcesFromPaths( final String refVersion,
final List<String> dataSourceDirectories) {
Utils.nonNull(refVersion);
Utils.nonNull(dataSourceDirectories);
Expand All @@ -115,21 +116,21 @@ public static Map<Path, Properties> getAndValidateDataSourcesFromPaths(final Str

logger.info("Initializing data sources from directory: " + pathString);

final Path p = IOUtils.getPath(pathString);
if ( !isValidDirectory(p) ) {
throw new UserException("ERROR: Given data source path is not a valid directory: " + p.toUri().toString());
final Path pathToDatasources = IOUtils.getPath(pathString);
if ( !isValidDirectory(pathToDatasources) ) {
throw new UserException("ERROR: Given data source path is not a valid directory: " + pathToDatasources.toUri());
}

// Log information from the datasources directory so we can have a record of what we're using:
final boolean isGoodVersionOfDataSources = logDataSourcesInfo(p);
final boolean isGoodVersionOfDataSources = logDataSourcesInfo(pathToDatasources);

if ( !isGoodVersionOfDataSources ) {
continue;
}

// Now that we have a valid directory, we need to grab a list of sub-directories in it:
try {
for ( final Path dataSourceTopDir : Files.list(p).filter(DataSourceUtils::isValidDirectory).collect(Collectors.toSet()) ) {
for ( final Path dataSourceTopDir : Files.list(pathToDatasources).filter(DataSourceUtils::isValidDirectory).collect(Collectors.toSet()) ) {

// Get the path that corresponds to our reference version:
final Path dataSourceDir = dataSourceTopDir.resolve(refVersion);
Expand Down Expand Up @@ -166,7 +167,7 @@ public static Map<Path, Properties> getAndValidateDataSourcesFromPaths(final Str
}
}
catch (final IOException ex) {
throw new GATKException("Unable to read contents of: " + p.toUri().toString(), ex);
throw new GATKException("Unable to read contents of: " + pathToDatasources.toUri().toString(), ex);
}
}

Expand Down Expand Up @@ -460,13 +461,15 @@ private static Properties readConfigFileProperties(final Path configFilePath) {
* We assume the data sources path is OK in the case that the version information cannot be read because the
* user can create their own data sources directory, which may not contain the metadata we seek.
*
* NOTE: The README file in a Data Sources directory is assumed to have the following properties:
* - Its name must be {@link #README_FILE_NAME}
* - It must contain a line starting with {@link #README_VERSION_LINE_START} containing an alphanumeric string containing the version number information.
* NOTE: The MANIFEST file in a Data Sources directory is assumed to have the following properties:
* - Its name must be {@link #MANIFEST_FILE_NAME}
* - It must contain a line starting with {@link #MANIFEST_VERSION_LINE_START} containing an alphanumeric string containing the version number information.
* - This version information takes the form of:
* [MAJOR_VERSION].[MINOR_VERSION].[RELEASE_YEAR][RELEASE_MONTH][RELEASE_DAY]
* [MAJOR_VERSION].[MINOR_VERSION].[RELEASE_YEAR][RELEASE_MONTH][RELEASE_DAY][VERSION_DECORATOR]?
* e.g.
* 1.1.20180204 (version 1.1 released Feb. 2, 2018)
* 1.1.20180204 (version 1.1 released Feb. 2, 2018)
* 4.2.20480608somatic (version 4.2 released June 6, 2048 - somatic data sources)
* 1.7.20190918X (version 1.7 released Sept. 18, 2048 - X data sources)
*
*
* @param dataSourcesPath {@link Path} to a Data Sources directory to check.
Expand All @@ -476,34 +479,36 @@ private static boolean logDataSourcesInfo(final Path dataSourcesPath) {

boolean dataSourcesPathIsAcceptable = true;

final Path readmePath = dataSourcesPath.resolve(IOUtils.getPath(README_FILE_NAME));
final Path manifestPath = dataSourcesPath.resolve(IOUtils.getPath(MANIFEST_FILE_NAME));

String version = null;

if ( Files.exists(readmePath) && Files.isRegularFile(readmePath) && Files.isReadable(readmePath) ) {
if ( Files.exists(manifestPath) && Files.isRegularFile(manifestPath) && Files.isReadable(manifestPath) ) {

try ( final BufferedReader reader = Files.newBufferedReader(readmePath) ) {
try ( final BufferedReader reader = Files.newBufferedReader(manifestPath) ) {

Integer versionMajor = null;
Integer versionMinor = null;
Integer versionYear = null;
Integer versionMonth = null;
Integer versionDay = null;
String source = null;
String alternateSource = null;
Integer versionMajor = null;
Integer versionMinor = null;
Integer versionYear = null;
Integer versionMonth = null;
Integer versionDay = null;
String versionDecorator = null;
String source = null;
String alternateSource = null;

// Get the info from our README file:
String line = reader.readLine();
while ((line != null) && ((version == null) || (source == null) || (alternateSource == null))) {

if (version == null && line.startsWith(README_VERSION_LINE_START)) {
final Matcher m = VERSION_PATTERN.matcher(line);
if ( m.matches() ) {
versionMajor = Integer.valueOf(m.group(1));
versionMinor = Integer.valueOf(m.group(2));
versionYear = Integer.valueOf(m.group(3));
versionMonth = Integer.valueOf(m.group(4));
versionDay = Integer.valueOf(m.group(5));
if (version == null && line.startsWith(MANIFEST_VERSION_LINE_START)) {
final Matcher matcher = VERSION_PATTERN.matcher(line);
if ( matcher.matches() ) {
versionMajor = Integer.valueOf(matcher.group(1));
versionMinor = Integer.valueOf(matcher.group(2));
versionYear = Integer.valueOf(matcher.group(3));
versionMonth = Integer.valueOf(matcher.group(4));
versionDay = Integer.valueOf(matcher.group(5));
versionDecorator = matcher.group(6);

version = versionMajor + "." + versionMinor + "." + versionYear + "" + versionMonth + "" + versionDay;
}
Expand All @@ -512,7 +517,7 @@ private static boolean logDataSourcesInfo(final Path dataSourcesPath) {
}
}

if (source == null && line.startsWith(README_SOURCE_LINE_START)) {
if (source == null && line.startsWith(MANIFEST_SOURCE_LINE_START)) {
final Matcher m = SOURCE_PATTERN.matcher(line);
if ( m.matches() ) {
source = m.group(1);
Expand All @@ -522,7 +527,7 @@ private static boolean logDataSourcesInfo(final Path dataSourcesPath) {
}
}

if (alternateSource == null && line.startsWith(README_ALT_SOURCE_LINE_START)) {
if (alternateSource == null && line.startsWith(MANIFEST_ALT_SOURCE_LINE_START)) {
final Matcher m = ALT_SOURCE_PATTERN.matcher(line);
if ( m.matches() ) {
alternateSource = m.group(1);
Expand All @@ -537,7 +542,7 @@ private static boolean logDataSourcesInfo(final Path dataSourcesPath) {

// Make sure we have good info:
if ( version == null ) {
logger.warn("Unable to read version information from data sources info/readme file: " + readmePath.toUri().toString());
logger.warn("Unable to read version information from data sources info/readme file: " + manifestPath.toUri().toString());
}
else {
logger.info("Data sources version: " + version);
Expand All @@ -547,25 +552,25 @@ private static boolean logDataSourcesInfo(final Path dataSourcesPath) {
}

if ( source == null ) {
logger.warn("Unable to read source information from data sources info/readme file: " + readmePath.toUri().toString());
logger.warn("Unable to read source information from data sources info/readme file: " + manifestPath.toUri().toString());
}
else {
logger.info("Data sources source: " + source);
}

if ( alternateSource == null ) {
logger.warn("Unable to read alternate source information from data sources info/readme file: " + readmePath.toUri().toString());
logger.warn("Unable to read alternate source information from data sources info/readme file: " + manifestPath.toUri().toString());
}
else {
logger.info("Data sources alternate source: " + alternateSource);
}
}
catch (final Exception ex) {
logger.warn("Could not read " + README_FILE_NAME + ": unable to log data sources version information.", ex);
logger.warn("Could not read " + MANIFEST_FILE_NAME + ": unable to log data sources version information.", ex);
}
}
else {
logger.warn("Could not read " + README_FILE_NAME + ": unable to log data sources version information.");
logger.warn("Could not read " + MANIFEST_FILE_NAME + ": unable to log data sources version information.");
}

// Warn the user if they need newer stuff.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
package org.broadinstitute.hellbender.tools.funcotator.dataSources;

import org.apache.commons.lang.RandomStringUtils;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;

/**
* Test class for {@link DataSourceUtils}
Expand All @@ -23,11 +26,8 @@ public class DataSourceUtilsUnitTest extends GATKBaseTest {
//==================================================================================================================
// Helper Methods:

//==================================================================================================================
// Data Providers:
private List<Object[]> createBaseTestVersionData() {

@DataProvider
private Iterator<Object[]> provideForValidateVersionInformation() {
final ArrayList<Object[]> testArgs = new ArrayList<>();

final ArrayList<Integer> baseArgs = new ArrayList<>();
Expand Down Expand Up @@ -65,7 +65,44 @@ private Iterator<Object[]> provideForValidateVersionInformation() {
// 1 Year after OK release date, but 1 month and 1 day before OK release date (should pass):
testArgs.add( new Object[] { DataSourceUtils.MIN_MAJOR_VERSION_NUMBER, DataSourceUtils.MIN_MINOR_VERSION_NUMBER, DataSourceUtils.MIN_YEAR_RELEASED+1, DataSourceUtils.MIN_MONTH_RELEASED-1, DataSourceUtils.MIN_DAY_RELEASED-1, true } );

return testArgs;
}

//==================================================================================================================
// Data Providers:

@DataProvider
private Iterator<Object[]> provideForValidateVersionInformation() {
return createBaseTestVersionData().iterator();
}

@DataProvider
private Iterator<Object[]> provideForTestVersionRegex() {

final ArrayList<Object[]> testArgs = new ArrayList<>();

final List<Object[]> baseArgs = createBaseTestVersionData();

for ( final Object[] args : baseArgs ) {
for ( int whitespace = 0; whitespace < 2; ++whitespace ) {
for ( int decoratorCount = 0; decoratorCount < 10; ++decoratorCount ) {

final String whitespaceString = whitespace != 0 ? "\t \t \t " : " ";
final String decoratorString = decoratorCount !=0 ? RandomStringUtils.randomAlphanumeric(decoratorCount) : "";

testArgs.add(
new Object[] {
args[0],args[1],args[2],args[3],args[4],
decoratorString,
whitespaceString
}
);
}
}
}

return testArgs.iterator();

}

//==================================================================================================================
Expand All @@ -88,4 +125,39 @@ public void testValidateVersionInformation(final Integer major,
expected.booleanValue() );
}

@Test(dataProvider = "provideForTestVersionRegex")
public void testVersionRegex(final Integer major,
final Integer minor,
final Integer year,
final Integer month,
final Integer day,
final String decorator,
final String leadingWhitespace ) {

// Construct the string:
final String versionString = String.format(
"%s%s%d.%d.%4d%02d%02d%s",
DataSourceUtils.MANIFEST_VERSION_LINE_START,leadingWhitespace,
major,minor,year,month,day,decorator
);

final Matcher matcher = DataSourceUtils.VERSION_PATTERN.matcher(versionString);

Assert.assertTrue(matcher.matches());

final Integer versionMajor = Integer.valueOf(matcher.group(1));
final Integer versionMinor = Integer.valueOf(matcher.group(2));
final Integer versionYear = Integer.valueOf(matcher.group(3));
final Integer versionMonth = Integer.valueOf(matcher.group(4));
final Integer versionDay = Integer.valueOf(matcher.group(5));
final String versionDecorator = matcher.group(6);

Assert.assertEquals( versionMajor, major );
Assert.assertEquals( versionMinor, minor );
Assert.assertEquals( versionYear, year );
Assert.assertEquals( versionMonth, month );
Assert.assertEquals( versionDay, day );
Assert.assertEquals( versionDecorator, decorator );
}

}