diff --git a/.github/workflows/github-actions-test.yml b/.github/workflows/github-actions-test.yml
index 9a3faa25..9ef153e1 100644
--- a/.github/workflows/github-actions-test.yml
+++ b/.github/workflows/github-actions-test.yml
@@ -16,4 +16,4 @@ jobs:
- name: Test with Maven
run: mvn --update-snapshots test
env:
- GITHUB_TOKEN: ${{ github.token }}
\ No newline at end of file
+ GITHUB_TOKEN: ${{ github.token }}
diff --git a/client-api/pom.xml b/client-api/pom.xml
index 3ac127ed..bd533e67 100644
--- a/client-api/pom.xml
+++ b/client-api/pom.xml
@@ -17,8 +17,6 @@
UTF-8
- 1.7
- 1.7
diff --git a/data/pom.xml b/data/pom.xml
index 85c96dad..e905b45c 100644
--- a/data/pom.xml
+++ b/data/pom.xml
@@ -11,6 +11,8 @@
data
data
+
+
diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java
index 52b0da04..012fb909 100644
--- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java
+++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java
@@ -97,8 +97,6 @@ public BucketIndexBySample(VariantStore variantStore, String storageDir) throws
});
// For each patient set the patientBucketCharMask entry to 0 or 1 if they have a variant in the bucket.
-
- // todo: implement for variant explorer
int indexOfBucket = Collections.binarySearch(bucketList, bucketKey) + 2; //patientBucketCharMasks has bookend bits
for(int x = 0; x < patientIds.size(); x++) {
if(patientMaskForBucket[0].testBit(x)) {
diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/InfoColumnMeta.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/InfoColumnMeta.java
index d0aebb36..189a22be 100644
--- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/InfoColumnMeta.java
+++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/InfoColumnMeta.java
@@ -1,16 +1,5 @@
package edu.harvard.hms.dbmi.avillach.hpds.data.genotype;
-import lombok.Builder;
-import lombok.Value;
-import lombok.extern.jackson.Jacksonized;
-
-@Jacksonized
-@Value
-@Builder
-public class InfoColumnMeta {
-
- String key, description;
- boolean continuous;
- Float min, max;
+public record InfoColumnMeta(String key, String description, boolean continuous, Float min, Float max) {
}
diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java
index 86d06472..cca6ee84 100644
--- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java
+++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java
@@ -33,4 +33,6 @@ static VariantMask emptyInstance() {
}
Set patientMaskToPatientIdSet(List patientIds);
+
+ boolean isEmpty();
}
diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java
index 6195abde..fb6b81b8 100644
--- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java
+++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java
@@ -1,6 +1,7 @@
package edu.harvard.hms.dbmi.avillach.hpds.data.genotype;
import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
import com.fasterxml.jackson.databind.ser.std.ToStringSerializer;
@@ -75,6 +76,13 @@ public Set patientMaskToPatientIdSet(List patientIds) {
return ids;
}
+ @Override
+ @JsonIgnore
+ public boolean isEmpty() {
+ // because the bitmasks are padded with 11 on each end
+ return bitmask.bitCount() <= 4;
+ }
+
private VariantMask union(VariantMaskBitmaskImpl variantMaskBitmask) {
return new VariantMaskBitmaskImpl(variantMaskBitmask.bitmask.or(this.bitmask));
}
diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java
index fb5e2a6a..bf020aee 100644
--- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java
+++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java
@@ -1,5 +1,6 @@
package edu.harvard.hms.dbmi.avillach.hpds.data.genotype;
+import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import java.math.BigInteger;
@@ -59,6 +60,12 @@ public Set patientMaskToPatientIdSet(List patientIds) {
.collect(Collectors.toSet());
}
+ @Override
+ @JsonIgnore
+ public boolean isEmpty() {
+ return this.patientIndexes.isEmpty();
+ }
+
private VariantMask union(VariantMaskSparseImpl variantMask) {
HashSet union = new HashSet<>(variantMask.patientIndexes);
union.addAll(this.patientIndexes);
diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndex.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndex.java
index 1f6cdf0f..ebdb3413 100644
--- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndex.java
+++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndex.java
@@ -1,11 +1,14 @@
package edu.harvard.hms.dbmi.avillach.hpds.data.genotype;
import java.io.*;
+import java.nio.file.Files;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+import com.google.common.base.Joiner;
import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJavaIndexedStorage;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -21,23 +24,23 @@
* a fast, disk-based backing store.
*/
public class VariantMetadataIndex implements Serializable {
- // todo: make this variable
- public static String VARIANT_METADATA_BIN_FILE = "/opt/local/hpds/all/VariantMetadata.javabin";
+ public static final String VARIANT_METADATA_FILENAME = "VariantMetadata.javabin";
+ public static String VARIANT_METADATA_BIN_FILE = "/opt/local/hpds/all/" + VARIANT_METADATA_FILENAME;
private static final long serialVersionUID = 5917054606643971537L;
private static Logger log = LoggerFactory.getLogger(VariantMetadataIndex.class);
// (String) contig --> (Integer) Bucket --> (String) variant spec --> INFO column data[].
- private Map> > indexMap = new HashMap> >();
+ private final Map> > indexMap = new HashMap<>();
- // todo: make this variable
- private static String fileStoragePrefix = "/opt/local/hpds/all/VariantMetadataStorage";
+ public static final String VARIANT_METADATA_STORAGE_FILE_PREFIX = "VariantMetadataStorage";
+ private static String fileStoragePrefix = "/opt/local/hpds/all/" + VARIANT_METADATA_STORAGE_FILE_PREFIX;
/**
* This map allows us to load millions of variants without re-writing the fbbis each time (which would blow up the disk space).
* We need to remember to flush() between each contig this gets saved to the fbbis array.
*/
- private transient Map> > loadingMap = new HashMap> >();
+ private transient Map> > loadingMap = new HashMap<>();
/**
* This constructor should only be used for testing; we expect the files to be in the default locations in production
@@ -60,7 +63,7 @@ public VariantMetadataIndex() throws IOException {
* @param variantSpec
* @return
*/
- public String[] findBySingleVariantSpec(String variantSpec, VariantBucketHolder bucketCache) {
+ public Set findBySingleVariantSpec(String variantSpec, VariantBucketHolder bucketCache) {
try {
String[] segments = variantSpec.split(",");
if (segments.length < 2) {
@@ -75,7 +78,7 @@ public String[] findBySingleVariantSpec(String variantSpec, VariantBucketHolder<
|| chrOffset != bucketCache.lastChunkOffset) {
FileBackedByteIndexedStorage> ContigFbbis = indexMap.get(contig);
if(ContigFbbis == null) {
- return new String[0];
+ return Set.of();
}
bucketCache.lastValue = ContigFbbis.get(chrOffset);
bucketCache.lastContig = contig;
@@ -85,20 +88,20 @@ public String[] findBySingleVariantSpec(String variantSpec, VariantBucketHolder<
if( bucketCache.lastValue != null) {
if(bucketCache.lastValue.get(variantSpec) == null) {
log.warn("No variant data found for spec " + variantSpec);
- return new String[0];
+ return Set.of();
}
- return bucketCache.lastValue.get(variantSpec);
+ return Set.of(bucketCache.lastValue.get(variantSpec));
}
log.warn("No bucket found for spec " + variantSpec + " in bucket " + chrOffset);
- return new String[0];
+ return Set.of();
} catch (UncheckedIOException e) {
log.warn("IOException caught looking up variantSpec : " + variantSpec, e);
- return new String[0];
+ return Set.of();
}
}
- public Map findByMultipleVariantSpec(Collection varientSpecList) {
+ public Map> findByMultipleVariantSpec(Collection varientSpecList) {
// log.debug("SPEC list " + varientSpecList.size() + " :: " + Arrays.deepToString(varientSpecList.toArray()));
VariantBucketHolder bucketCache = new VariantBucketHolder();
@@ -161,7 +164,7 @@ public synchronized void flush() throws IOException {
if(contigFbbis == null) {
log.info("creating new file for " + contig);
String filePath = fileStoragePrefix + "_" + contig + ".bin";
- contigFbbis = new FileBackedJavaIndexedStorage(Integer.class, (Class>)(Class>) ConcurrentHashMap.class, new File(filePath));
+ contigFbbis = new FileBackedJavaIndexedStorage(Integer.class, ConcurrentHashMap.class, new File(filePath));
indexMap.put(contig, contigFbbis);
}
@@ -196,13 +199,57 @@ public void complete() throws IOException {
public static VariantMetadataIndex createInstance(String metadataIndexPath) {
try(ObjectInputStream in = new ObjectInputStream(new GZIPInputStream(
- new FileInputStream(metadataIndexPath)))){
- return (VariantMetadataIndex) in.readObject();
+ new FileInputStream(metadataIndexPath + VARIANT_METADATA_FILENAME)))){
+ VariantMetadataIndex variantMetadataIndex = (VariantMetadataIndex) in.readObject();
+ variantMetadataIndex.updateStorageDirectory(new File(metadataIndexPath));
+ return variantMetadataIndex;
} catch(Exception e) {
// todo: handle exceptions better
log.info("No Metadata Index found at " + metadataIndexPath);
- log.debug("Error loading metadata index:", e);
return null;
}
}
+
+ public static void merge(VariantMetadataIndex variantMetadataIndex1, VariantMetadataIndex variantMetadataIndex2, String outputDirectory) throws IOException {
+ VariantMetadataIndex merged = new VariantMetadataIndex(outputDirectory + VARIANT_METADATA_STORAGE_FILE_PREFIX);
+ if (!variantMetadataIndex1.indexMap.keySet().equals(variantMetadataIndex2.indexMap.keySet())) {
+ log.warn("Merging incompatible variant indexes. Index 1 keys: " + Joiner.on(",").join(variantMetadataIndex1.indexMap.keySet()) + ". Index 2 keys: " + Joiner.on(",").join(variantMetadataIndex2.indexMap.keySet()));
+ throw new IllegalStateException("Cannot merge variant metadata index with different contig keys");
+ }
+ for (String contig : variantMetadataIndex1.indexMap.keySet()) {
+ String filePath = outputDirectory + VARIANT_METADATA_STORAGE_FILE_PREFIX + "_" + contig + ".bin";
+ FileBackedByteIndexedStorage> mergedFbbis = new FileBackedJavaIndexedStorage(Integer.class, ConcurrentHashMap.class, new File(filePath));
+
+ // Store the merged result here because FileBackedByteIndexedStorage must be written all at once
+ Map> mergedStagedFbbis = new HashMap<>();
+
+ FileBackedByteIndexedStorage> fbbis1 = variantMetadataIndex1.indexMap.get(contig);
+ FileBackedByteIndexedStorage> fbbis2 = variantMetadataIndex2.indexMap.get(contig);
+
+ fbbis1.keys().forEach(key -> {
+ mergedStagedFbbis.put(key, fbbis1.get(key));
+ });
+ fbbis2.keys().forEach(key -> {
+ ConcurrentHashMap metadataMap = mergedStagedFbbis.get(key);
+ if (metadataMap == null) {
+ mergedStagedFbbis.put(key, fbbis2.get(key));
+ } else {
+ metadataMap.putAll(fbbis2.get(key));
+ }
+ });
+
+ mergedStagedFbbis.forEach(mergedFbbis::put);
+ mergedFbbis.complete();
+ merged.indexMap.put(contig, mergedFbbis);
+ }
+
+ try(ObjectOutputStream out = new ObjectOutputStream(new GZIPOutputStream(Files.newOutputStream(new File(outputDirectory + VARIANT_METADATA_FILENAME).toPath())))){
+ out.writeObject(merged);
+ out.flush();
+ }
+ }
+
+ public void updateStorageDirectory(File genomicDataDirectory) {
+ indexMap.values().forEach(value -> value.updateStorageDirectory(genomicDataDirectory));
+ }
}
\ No newline at end of file
diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java
index c91ebf4d..b62dd496 100644
--- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java
+++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java
@@ -16,20 +16,16 @@
public class VariantStore implements Serializable {
private static final long serialVersionUID = -6970128712587609414L;
+ public static final String VARIANT_STORE_JAVABIN_FILENAME = "variantStore.javabin";
+ public static final String VARIANT_SPEC_INDEX_JAVABIN_FILENAME = "variantSpecIndex.javabin";
private static Logger log = LoggerFactory.getLogger(VariantStore.class);
public static final int BUCKET_SIZE = 1000;
- public static final String VARIANT_SPEC_INDEX_FILE = "variantSpecIndex.javabin";
-
private BigInteger emptyBitmask;
private String[] patientIds;
private transient String[] variantSpecIndex;
- private Integer variantStorageSize;
-
- private String[] vcfHeaders = new String[24];
-
private Map>> variantMaskStorage = new TreeMap<>();
public Map>> getVariantMaskStorage() {
@@ -48,7 +44,7 @@ public void setVariantSpecIndex(String[] variantSpecIndex) {
}
public static VariantStore readInstance(String genomicDataDirectory) throws IOException, ClassNotFoundException {
- ObjectInputStream ois = new ObjectInputStream(new GZIPInputStream(new FileInputStream(genomicDataDirectory + "variantStore.javabin")));
+ ObjectInputStream ois = new ObjectInputStream(new GZIPInputStream(new FileInputStream(genomicDataDirectory + VARIANT_STORE_JAVABIN_FILENAME)));
VariantStore variantStore = (VariantStore) ois.readObject();
ois.close();
variantStore.getVariantMaskStorage().values().forEach(store -> {
@@ -60,14 +56,14 @@ public static VariantStore readInstance(String genomicDataDirectory) throws IOEx
}
public void writeInstance(String genomicDirectory) {
- try (FileOutputStream fos = new FileOutputStream(new File(genomicDirectory, "variantStore.javabin"));
+ try (FileOutputStream fos = new FileOutputStream(new File(genomicDirectory, VARIANT_STORE_JAVABIN_FILENAME));
GZIPOutputStream gzos = new GZIPOutputStream(fos);
ObjectOutputStream oos = new ObjectOutputStream(gzos);) {
oos.writeObject(this);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
- try (FileOutputStream fos = new FileOutputStream(new File(genomicDirectory, "variantSpecIndex.javabin"));
+ try (FileOutputStream fos = new FileOutputStream(new File(genomicDirectory, VARIANT_SPEC_INDEX_JAVABIN_FILENAME));
GZIPOutputStream gzos = new GZIPOutputStream(fos);
ObjectOutputStream oos = new ObjectOutputStream(gzos);) {
oos.writeObject(Arrays.asList(variantSpecIndex));
@@ -76,41 +72,6 @@ public void writeInstance(String genomicDirectory) {
}
}
- public Map countVariants() {
- HashMap countOffsetMap = new HashMap();
- TreeMap counts = new TreeMap<>();
- for (String contig : variantMaskStorage.keySet()) {
- counts.put(contig, new int[5]);
- FileBackedJsonIndexStorage> storage = variantMaskStorage
- .get(contig);
- storage.keys().stream().forEach((Integer key) -> {
- int[] contigCounts = counts.get(contig);
- Collection values = storage.get(key).values();
- contigCounts[0] += values.stream().collect(Collectors.summingInt((VariableVariantMasks masks) -> {
- return masks.heterozygousMask != null ? 1 : 0;
- }));
- contigCounts[1] += values.stream().collect(Collectors.summingInt((VariableVariantMasks masks) -> {
- return masks.homozygousMask != null ? 1 : 0;
- }));
- contigCounts[2] += values.stream().collect(Collectors.summingInt((VariableVariantMasks masks) -> {
- return masks.heterozygousNoCallMask != null ? 1 : 0;
- }));
- contigCounts[3] += values.stream().collect(Collectors.summingInt((VariableVariantMasks masks) -> {
- return masks.homozygousNoCallMask != null ? 1 : 0;
- }));
- contigCounts[4] += values.stream().collect(Collectors.summingInt((VariableVariantMasks masks) -> {
- return masks.heterozygousMask != null || masks.homozygousMask != null
- || masks.heterozygousNoCallMask != null || masks.homozygousNoCallMask != null ? 1 : 0;
- }));
- });
- }
- return counts;
- }
-
- public String[] getVCFHeaders() {
- return vcfHeaders;
- }
-
public String[] getPatientIds() {
return patientIds;
}
@@ -124,11 +85,6 @@ public Optional getMasks(String variant, VariantBucketHold
int chrOffset = Integer.parseInt(segments[1]) / BUCKET_SIZE;
String contig = segments[0];
-// if (Level.DEBUG.equals(log.getEffectiveLevel())) {
-// log.debug("Getting masks for variant " + variant + " Same bucket test: " + (bucketCache.lastValue != null
-// && contig.contentEquals(bucketCache.lastContig) && chrOffset == bucketCache.lastChunkOffset));
-// }
-
if (bucketCache.lastValue != null && contig.contentEquals(bucketCache.lastContig)
&& chrOffset == bucketCache.lastChunkOffset) {
// TODO : This is a temporary efficiency hack, NOT THREADSAFE!!!
@@ -144,9 +100,26 @@ public Optional getMasks(String variant, VariantBucketHold
}
return bucketCache.lastValue == null ? Optional.empty() : Optional.ofNullable(bucketCache.lastValue.get(variant));
}
+ public List getMasksForDbSnpSpec(String variant) {
+ String[] segments = variant.split(",");
+ if (segments.length < 2) {
+ log.error("Less than 2 segments found in this variant : " + variant);
+ }
- public String[] getHeaders() {
- return vcfHeaders;
+ int chrOffset = Integer.parseInt(segments[1]) / BUCKET_SIZE;
+ String contig = segments[0];
+
+ // todo: don't bother doing a lookup if this node does not have the chromosome specified
+ FileBackedJsonIndexStorage> indexedStorage = variantMaskStorage.get(contig);
+ if (indexedStorage == null) {
+ return List.of();
+ } else {
+ ConcurrentHashMap specToMaskMap = indexedStorage.get(chrOffset);
+ return specToMaskMap.entrySet().stream()
+ .filter(entry -> entry.getKey().startsWith(variant))
+ .map(Map.Entry::getValue)
+ .collect(Collectors.toList());
+ }
}
public void open() {
@@ -169,34 +142,6 @@ public void setPatientIds(String[] patientIds) {
this.patientIds = patientIds;
}
- public int getVariantStorageSize() {
- return variantStorageSize;
- }
-
- public void setVariantStorageSize(int variantStorageSize) {
- this.variantStorageSize = variantStorageSize;
- }
-
- public List getMasksForRangesOfChromosome(String contigForGene, List offsetsForGene,
- RangeSet rangeSetsForGene) throws IOException {
- FileBackedJsonIndexStorage masksForChromosome = variantMaskStorage.get(contigForGene);
- Set bucketsForGene = offsetsForGene.stream().map((offset) -> {
- return offset / BUCKET_SIZE;
- }).collect(Collectors.toSet());
- List masks = new ArrayList();
- for (Integer bucket : bucketsForGene) {
- Map variantMaskBucket = (Map) masksForChromosome.get(bucket);
- variantMaskBucket.keySet().stream().filter((String spec) -> {
- int offsetForVariant = Integer.parseInt(spec.split(",")[1]);
- return rangeSetsForGene.contains(offsetForVariant);
- }).forEach((spec) -> {
- System.out.println(spec);
- masks.add(variantMaskBucket.get(spec));
- });
- }
- return masks;
- }
-
public BigInteger emptyBitmask() {
if (emptyBitmask == null || emptyBitmask.testBit(emptyBitmask.bitLength() / 2)) {
String emptyVariantMask = "";
@@ -208,19 +153,16 @@ public BigInteger emptyBitmask() {
return emptyBitmask;
}
+ @SuppressWarnings("unchecked")
public static String[] loadVariantIndexFromFile(String genomicDataDirectory) {
- try (ObjectInputStream objectInputStream = new ObjectInputStream(new GZIPInputStream(new FileInputStream(genomicDataDirectory + "/" + VARIANT_SPEC_INDEX_FILE)));){
+ try (ObjectInputStream objectInputStream = new ObjectInputStream(new GZIPInputStream(new FileInputStream(genomicDataDirectory + "/" + VARIANT_SPEC_INDEX_JAVABIN_FILENAME)));){
List variants = (List) objectInputStream.readObject();
return variants.toArray(new String[0]);
- } catch (FileNotFoundException e) {
- throw new RuntimeException(e);
- } catch (IOException e) {
- throw new RuntimeException(e);
- } catch (ClassNotFoundException e) {
+ } catch (IOException | ClassNotFoundException e) {
throw new RuntimeException(e);
}
- }
+ }
}
diff --git a/etl/pom.xml b/etl/pom.xml
index 2d19f3f1..b388883e 100644
--- a/etl/pom.xml
+++ b/etl/pom.xml
@@ -330,6 +330,26 @@
single
+
+ GenomicDatasetFinalizer
+
+
+
+ edu.harvard.hms.dbmi.avillach.hpds.etl.genotype.GenomicDatasetFinalizer
+
+
+ ${project.basedir}/../docker/pic-sure-hpds-etl
+
+ jar-with-dependencies
+
+ GenomicDatasetFinalizer
+ GenomicDatasetFinalizer
+
+ package
+
+ single
+
+
diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetFinalizer.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetFinalizer.java
new file mode 100644
index 00000000..0181b2cb
--- /dev/null
+++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetFinalizer.java
@@ -0,0 +1,21 @@
+package edu.harvard.hms.dbmi.avillach.hpds.etl.genotype;
+
+import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.BucketIndexBySample;
+import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantStore;
+
+import java.io.IOException;
+
+public class GenomicDatasetFinalizer {
+ private static String genomicDirectory;
+
+ public static void main(String[] args) throws IOException, ClassNotFoundException {
+ if (args.length != 2) {
+ throw new IllegalArgumentException("2 arguments must be provided: genomic data directory, output directory");
+ }
+ genomicDirectory = args[0];
+ String outputDirectory = args[1];
+
+ VariantStore variantStore = VariantStore.readInstance(genomicDirectory);
+ BucketIndexBySample bucketIndexBySample = new BucketIndexBySample(variantStore, genomicDirectory);
+ }
+}
diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java
index ba3b2936..0d8af0df 100644
--- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java
+++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java
@@ -1,10 +1,8 @@
package edu.harvard.hms.dbmi.avillach.hpds.etl.genotype;
import com.google.common.base.Preconditions;
-import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.FileBackedByteIndexedInfoStore;
-import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariableVariantMasks;
-import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks;
-import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantStore;
+import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.*;
+import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedByteIndexedStorage;
import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJsonIndexStorage;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -24,7 +22,6 @@ public class GenomicDatasetMergerRunner {
private static Logger log = LoggerFactory.getLogger(GenomicDatasetMerger.class);
public static final String INFO_STORE_JAVABIN_SUFFIX = "infoStore.javabin";
- public static final String VARIANT_SPEC_INDEX_FILENAME = "variantSpecIndex.javabin";
private static String genomicDirectory1;
private static String genomicDirectory2;
@@ -56,6 +53,11 @@ public static void main(String[] args) throws IOException, ClassNotFoundExceptio
variantIndexes.values().forEach(variantIndex -> {
variantIndex.write(new File(outputDirectory + variantIndex.column_key + "_" + INFO_STORE_JAVABIN_SUFFIX));
});
+
+ VariantMetadataIndex variantMetadataIndex1 = VariantMetadataIndex.createInstance(genomicDirectory1);
+ VariantMetadataIndex variantMetadataIndex2 = VariantMetadataIndex.createInstance(genomicDirectory2);
+
+ VariantMetadataIndex.merge(variantMetadataIndex1, variantMetadataIndex2, outputDirectory);
}
private static Map loadInfoStores(String directory) {
diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VariantMetadataLoader.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VariantMetadataLoader.java
index 3dd3e7f4..97d897fd 100644
--- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VariantMetadataLoader.java
+++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VariantMetadataLoader.java
@@ -38,12 +38,12 @@ public class VariantMetadataLoader {
GZIP_FLAG_COLUMN=3,
FILE_COLUMN = 0;
- public static void main(String[] args) throws Exception{
+ public static void main(String[] args) throws IOException {
File vcfIndexFile;
log.info(new File(".").getAbsolutePath());
if(args.length > 0 && new File(args[0]).exists()) {
- log.info("using path from command line, is this a test");
+ log.info("using path from command line");
vcfIndexFile = new File(args[0]);
variantIndexPathForTests = args[1];
storagePathForTests = args[2];
diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/phenotype/CSVLoader.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/phenotype/CSVLoader.java
index e254b72b..5a1e236f 100644
--- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/phenotype/CSVLoader.java
+++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/phenotype/CSVLoader.java
@@ -38,12 +38,12 @@ public static void main(String[] args) throws IOException {
HPDS_DIRECTORY = args[0] + "/";
}
store.allObservationsStore = new RandomAccessFile(HPDS_DIRECTORY + "allObservationsStore.javabin", "rw");
- initialLoad();
+ initialLoad(HPDS_DIRECTORY);
store.saveStore(HPDS_DIRECTORY);
}
- private static void initialLoad() throws IOException {
- Crypto.loadDefaultKey();
+ private static void initialLoad(String hpdsDirectory) throws IOException {
+ Crypto.loadKey(Crypto.DEFAULT_KEY_NAME, hpdsDirectory + "encryption_key");
Reader in = new FileReader(HPDS_DIRECTORY + "allConcepts.csv");
Iterable records = CSVFormat.DEFAULT.withSkipHeaderRecord().withFirstRecordAsHeader().parse(new BufferedReader(in, 1024*1024));
diff --git a/etl/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySampleTest.java b/etl/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySampleTest.java
deleted file mode 100644
index d6aa3f79..00000000
--- a/etl/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySampleTest.java
+++ /dev/null
@@ -1,369 +0,0 @@
-package edu.harvard.hms.dbmi.avillach.hpds.data.genotype;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-
-import edu.harvard.hms.dbmi.avillach.hpds.etl.genotype.NewVCFLoader;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-import org.springframework.test.context.event.annotation.BeforeTestClass;
-
-import static org.springframework.test.util.AssertionErrors.*;
-
-/**
- * These tests are in the ETL project so that we can read in data from disk each time instead of storing binfiles
- * that may become outdated.
- *
- * the BucketIndexBySample.filterVariantSetForPatientSet method removes variants base on the patent BUCKET MASK; just because a patient
- * does not have a particular variant doesn't mean it will be filtered out e.g., when a patient has a different variant in the same bucket.
- *
- * Filtering the specific variants is typically done by the calling function after filtering out the unneeded buckets.
- *
- * @author nchu
- *
- */
-public class BucketIndexBySampleTest {
-
- private static final String VCF_INDEX_FILE = "./src/test/resources/bucketIndexBySampleTest_vcfIndex.tsv";
- private static final String STORAGE_DIR = "./target/";
- private static final String MERGED_DIR = "./target/merged/";
-
- private static VariantStore variantStore;
- private static BucketIndexBySample bucketIndexBySample;
-
- //Some known variant specs from the input file.
- private static final String spec1 = "4,9856624,CAAAAA,C";
- private static final String spec2 = "4,9856624,CAAA,C";
- private static final String spec3 = "4,9856624,CA,C";
- private static final String spec4 = "4,9856624,C,CA";
- private static final String spec5 = "4,9856624,CAAAAA,CA";
-
- private static final String spec6 = "14,19000060,C,G";
- private static final String spec7 = "14,19000152,C,T";
- private static final String spec8 = "14,19007733,C,T";
- private static final String spec9 = "14,19010456,T,G";
- private static final String spec10 = "14,21616875,T,C"; // patient 9 and 10 are 1/.
- private static final String spec11 = "14,19001521,T,C"; //patient 9 and 10 are 0/.
- private static final String spec12 = "14,19022646,A,G"; //patient 7 is ./.
-
-// ## Patient 1 - NO variants
-// ## Patient 2 - ALL variants
-// ## Patient 3 - NO CHR 14 variants, ALL CHR 4 variants
-// ## Patient 4 - ALL CHR 14 variants, NO CHR 4 variants
-// ## others mixed
-// patient 5 has spec 1 and 5
-// patient 6 has spec 4 and 5
-//
-// For no call variants - ./1 1/. count yes, ./0 0/. count NO
-
-
- //these parameters to the BucketIndexBySample methods are configured by each test
- Set variantSet;
- List patientSet;
-
- @BeforeAll
- public static void initializeBinfile() throws Exception {
- //load variant data
- NewVCFLoader.main(new String[] {VCF_INDEX_FILE, STORAGE_DIR, MERGED_DIR});
-
- VariantStore variantStore = VariantStore.readInstance(STORAGE_DIR);
-
- //now use that object to initialize the BucketIndexBySample object
- bucketIndexBySample = new BucketIndexBySample(variantStore, STORAGE_DIR);
-// bucketIndexBySample.printPatientMasks();
- }
-
- @BeforeEach
- public void setUpTest() {
- //start with fresh, empty collections
- variantSet = new HashSet();
- patientSet = new ArrayList();
- }
-
- @Test
- public void test_filterVariantSetForPatientSet_noPatients() throws IOException {
- variantSet.add(spec1);
- variantSet.add(spec2);
- variantSet.add(spec3);
-
- Collection filteredVariantSet = bucketIndexBySample.filterVariantSetForPatientSet(variantSet, patientSet);
-
- assertTrue("Empty Patient List should filter out all variants", filteredVariantSet.isEmpty());
- }
-
- @Test
- public void test_filterVariantSetForPatientSet_noVariants() throws IOException {
- patientSet.add(1);
- patientSet.add(2);
- patientSet.add(3);
-
- Collection filteredVariantSet = bucketIndexBySample.filterVariantSetForPatientSet(variantSet, patientSet);
-
- assertTrue("Empty Variant Set should remain empty", filteredVariantSet.isEmpty());
- }
-
- @Test
- public void test_filterVariantSetForPatientSet_VariantsWithoutPatientsLastBucket() throws IOException {
- System.out.println("test_filterVariantSetForPatientSet_VariantsWithoutPatientsLastBucket");
-
- variantSet.add(spec5);
-
- patientSet.add(1);
- patientSet.add(4);
-
- Collection filteredVariantSet = bucketIndexBySample.filterVariantSetForPatientSet(variantSet, patientSet);
-
- assertTrue("Patients should not match any variants in the list", filteredVariantSet.isEmpty());
- }
-
- @Test
- public void test_filterVariantSetForPatientSet_PatientsWithNoVariantsFirstBucket() throws IOException {
- System.out.println("test_filterVariantSetForPatientSet_PatientsWithNoVariantsFirstBucket");
-
- variantSet.add(spec7);
- variantSet.add(spec8);
-
- patientSet.add(1);
- patientSet.add(3);
-
- Collection filteredVariantSet = bucketIndexBySample.filterVariantSetForPatientSet(variantSet, patientSet);
-
- assertTrue("Patients should not match any variants in the list", filteredVariantSet.isEmpty());
- }
-
- @Test
- public void test_filterVariantSetForPatientSet_allValidLastBucket() throws IOException {
- System.out.println("test_filterVariantSetForPatientSet_allValidLastBucket");
-
- //specs 1-5 are in the last bucket
- variantSet.add(spec1);
- variantSet.add(spec4);
- variantSet.add(spec5);
-
- patientSet.add(2);
- patientSet.add(4);
- patientSet.add(5);
-
- Collection filteredVariantSet = bucketIndexBySample.filterVariantSetForPatientSet(variantSet, patientSet);
-
- assertEquals("No variants should be filtered out", (long)3, (long)filteredVariantSet.size());
- }
-
- @Test
- public void test_filterVariantSetForPatientSet_allValidFirstBucket() throws IOException {
- System.out.println("test_filterVariantSetForPatientSet_allValidFirstBucket");
-
- //specs 1-5 are in the last bucket
- variantSet.add(spec6);
- variantSet.add(spec7);
- variantSet.add(spec8);
-
- patientSet.add(2);
- patientSet.add(3);
- patientSet.add(5);
- patientSet.add(6);
-
- Collection filteredVariantSet = bucketIndexBySample.filterVariantSetForPatientSet(variantSet, patientSet);
-
- assertEquals("No variants should be filtered out", (long)3, (long)filteredVariantSet.size());
- }
-
- @Test
- public void test_filterVariantSetForPatientSet_someValid() throws IOException {
- System.out.println("test_filterVariantSetForPatientSet_someValid");
-
- //specs 1-5 are in the last bucket
- variantSet.add(spec1);
- variantSet.add(spec6);
-
- patientSet.add(1);
- patientSet.add(3);
-
- Collection filteredVariantSet = bucketIndexBySample.filterVariantSetForPatientSet(variantSet, patientSet);
-
- assertEquals("One variant should be filtered out", (long)1, (long)filteredVariantSet.size());
- assertTrue("Expected variant not found", filteredVariantSet.contains(spec1));
- }
-
- @Test
- public void test_filterVariantSetForPatientSet_allValidDifferentPatients() throws IOException {
- System.out.println("test_filterVariantSetForPatientSet_allValidDifferentPatients");
-
- //specs 1-5 are in the last bucket
- variantSet.add(spec1); // only 5
- variantSet.add(spec4); // only 6
- variantSet.add(spec5); // 5 & 6
- variantSet.add(spec7); // only #4
-
- patientSet.add(4);
- patientSet.add(5);
- patientSet.add(6);
-
- Collection filteredVariantSet = bucketIndexBySample.filterVariantSetForPatientSet(variantSet, patientSet);
-
- assertEquals("No variants should be filtered out", (long)4, (long)filteredVariantSet.size());
- }
-
- @Test
- public void test_filterVariantSetForPatientSet_someValidDifferentPatients() throws IOException {
- System.out.println("test_filterVariantSetForPatientSet_allValidDifferentPatients");
-
- //specs 1-5 are in the last bucket
- variantSet.add(spec1);
- variantSet.add(spec4);
- variantSet.add(spec5);
- variantSet.add(spec8);
- variantSet.add(spec9); //none
-
- patientSet.add(3);
- patientSet.add(9);
- patientSet.add(10);
-
- Collection filteredVariantSet = bucketIndexBySample.filterVariantSetForPatientSet(variantSet, patientSet);
-
- assertEquals("One variant should be filtered out", (long)4, (long)filteredVariantSet.size());
- assertFalse("Spec 9 should have been filtered out", filteredVariantSet.contains(spec9));
- }
-
- @Test
- public void test_filterVariantSetForPatientSet_HeteroPatientA() throws IOException {
- System.out.println("test_filterVariantSetForPatientSet_HeteroPatientA");
-
- variantSet.add(spec8); //patients 7 and 8 have hetero flags for this variant (1|0 and 0|1)
-
- patientSet.add(7);
-
- Collection filteredVariantSet = bucketIndexBySample.filterVariantSetForPatientSet(variantSet, patientSet);
-
- assertEquals("No variants should be filtered out", (long)1, (long)filteredVariantSet.size());
- }
-
- @Test
- public void test_filterVariantSetForPatientSet_HeteroPatientB() throws IOException {
- System.out.println("test_filterVariantSetForPatientSet_HeteroPatientB");
-
- variantSet.add(spec8); //patients 7 and 8 have hetero flags for this variant (1|0 and 0|1)
-
- patientSet.add(8);
-
- Collection filteredVariantSet = bucketIndexBySample.filterVariantSetForPatientSet(variantSet, patientSet);
-
- assertEquals("No variants should be filtered out", (long)1, (long)filteredVariantSet.size());
- }
-
-
- @Test
- public void test_filterVariantSetForPatientSet_HeteroNoCallPosPatientA() throws IOException {
- System.out.println("test_filterVariantSetForPatientSet_HeteroNoCallPosPatientA");
-
- variantSet.add(spec8); //patients 9 and 10 have hetero No Call flags for this variant (1|. and .|1)
-
- patientSet.add(9);
-
- Collection filteredVariantSet = bucketIndexBySample.filterVariantSetForPatientSet(variantSet, patientSet);
-
- assertEquals("No variants should be filtered out", (long)1, (long)filteredVariantSet.size());
- }
-
- @Test
- public void test_filterVariantSetForPatientSet_HeteroNoCallPosPatientB() throws IOException {
- System.out.println("test_filterVariantSetForPatientSet_HeteroNoCallPosPatientB");
-
- variantSet.add(spec8); //patients 9 and 10 have hetero No Call flags for this variant (1|. and .|1)
-
- patientSet.add(10);
-
- Collection filteredVariantSet = bucketIndexBySample.filterVariantSetForPatientSet(variantSet, patientSet);
-
- assertEquals("No variants should be filtered out", (long)1, (long)filteredVariantSet.size());
- }
-
- @Test
- public void test_filterVariantSetForPatientSet_HeteroNoCallNegPatientA() throws IOException {
- System.out.println("test_filterVariantSetForPatientSet_HeteroNoCallNegPatientA");
-
- variantSet.add(spec10); //patients 9 and 10 have hetero No Call flags for this variant (0|. and .|0)
-
- patientSet.add(9);
-
- Collection filteredVariantSet = bucketIndexBySample.filterVariantSetForPatientSet(variantSet, patientSet);
-
- assertTrue("All variants should be filtered out", filteredVariantSet.isEmpty());
- }
-
- @Test
- public void test_filterVariantSetForPatientSet_HeteroNoCallNegPatientB() throws IOException {
- System.out.println("test_filterVariantSetForPatientSet_HeteroNoCallNegPatientB");
-
- variantSet.add(spec10); //patients 9 and 10 have hetero No Call flags for this variant (0|. and .|0)
-
- patientSet.add(10);
-
- Collection filteredVariantSet = bucketIndexBySample.filterVariantSetForPatientSet(variantSet, patientSet);
-
- assertTrue("All variants should be filtered out", filteredVariantSet.isEmpty());
- }
-
- @Test
- public void test_filterVariantSetForPatientSet_HomoNoCall() throws IOException {
- System.out.println("test_filterVariantSetForPatientSet_HeteroNoCallNegPatientB");
-
- variantSet.add(spec12);
- patientSet.add(7);
-
- Collection filteredVariantSet = bucketIndexBySample.filterVariantSetForPatientSet(variantSet, patientSet);
-
- assertTrue("All variants should be filtered out", filteredVariantSet.isEmpty());
- }
-
-
- @Test
- public void test_filterVariantSetForPatientSet_HeteroNoCallMultipleVariantsAndPatientsA() throws IOException {
- System.out.println("test_filterVariantSetForPatientSet_HeteroNoCallMultipleVariantsAndPatientss");
-
- variantSet.add(spec10); //patients 9 and 10 have hetero No Call flags for this variant (0|. and .|0) (#7 has this)
- variantSet.add(spec8); //patients 9 and 10 have hetero No Call flags for this variant (1|. and .|1)
- variantSet.add(spec4); // 9 and 10 have a spec in this bucket
- variantSet.add(spec11); //9 and 10 should not have this spec
- variantSet.add(spec12);
-
- patientSet.add(1); //no specs
- patientSet.add(9);
- patientSet.add(10);
- patientSet.add(7);
-
- Collection filteredVariantSet = bucketIndexBySample.filterVariantSetForPatientSet(variantSet, patientSet);
-
- assertEquals("Two variant should be filtered out", (long)3, (long)filteredVariantSet.size());
- assertFalse("Spec 12 should have been filtered out", filteredVariantSet.contains(spec12));
- assertFalse("Spec 11 should have been filtered out", filteredVariantSet.contains(spec11));
- }
-
-
- @Test
- public void test_filterVariantSetForPatientSet_HeteroNoCallMultipleVariantsAndPatientsB() throws IOException {
- System.out.println("test_filterVariantSetForPatientSet_HeteroNoCallMultipleVariantsAndPatientss");
-
- variantSet.add(spec10); //patients 9 and 10 have hetero No Call flags for this variant (0|. and .|0)
- variantSet.add(spec8); //patients 9 and 10 have hetero No Call flags for this variant (1|. and .|1)
- variantSet.add(spec4); // 9 and 10 have a spec in this bucket
- variantSet.add(spec11); //9 and 10 should not have this spec
-
- patientSet.add(1); //no specs
- patientSet.add(9);
- patientSet.add(10);
-
- Collection filteredVariantSet = bucketIndexBySample.filterVariantSetForPatientSet(variantSet, patientSet);
-
- assertEquals("Two variant should be filtered out", (long)2, (long)filteredVariantSet.size());
- assertFalse("Spec 10 should have been filtered out", filteredVariantSet.contains(spec10));
- assertFalse("Spec 11 should have been filtered out", filteredVariantSet.contains(spec11));
- }
-
-}
diff --git a/etl/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndexTest.java b/etl/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndexTest.java
deleted file mode 100644
index 58e1d48a..00000000
--- a/etl/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndexTest.java
+++ /dev/null
@@ -1,183 +0,0 @@
-package edu.harvard.hms.dbmi.avillach.hpds.data.genotype;
-
-import java.io.*;
-import java.util.List;
-import java.util.Map;
-import java.util.TreeMap;
-import java.util.zip.GZIPInputStream;
-
-import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder;
-import edu.harvard.hms.dbmi.avillach.hpds.etl.genotype.VariantMetadataLoader;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Disabled;
-import org.junit.jupiter.api.Test;
-import org.springframework.test.context.event.annotation.BeforeTestClass;
-
-import static org.junit.jupiter.api.Assertions.assertThrows;
-import static org.springframework.test.util.AssertionErrors.assertEquals;
-import static org.springframework.test.util.AssertionErrors.assertNotNull;
-
-// todo: enable when variant explorer is implemented
-@Disabled
-public class VariantMetadataIndexTest {
-
- //From file 1 14 19038291 rs550062154 C A
- //From file 2 14 21089541 rs543976440 A G
- //From file 3 14 21616876 rs549724318 G A
-
-
- /**
- * The metadataIndex is non-mutable (or should be) so we only need one object to test
- */
- private static VariantMetadataIndex vmi;
- public static String binFile = "target/VariantMetadata.javabin";
- VariantBucketHolder bucketCache = new VariantBucketHolder();
-
- //Some known variant specs from the input file. These have been designed for testing partially overlapping specs
- private static final String spec1 = "4,9856624,CAAAAA,C"; private static final String spec1Info = "AC=401;AF=8.00719e-02;NS=2504;AN=5008;EAS_AF=3.37000e-02;EUR_AF=4.97000e-02;AFR_AF=1.64100e-01;AMR_AF=3.75000e-02;SAS_AF=7.57000e-02;DP=18352;AA=G|||;VT=SNP";
- private static final String spec2 = "4,9856624,CAAA,C"; private static final String spec2Info = "AC=62;AF=1.23802e-02;NS=2504;AN=5008;EAS_AF=0.00000e+00;EUR_AF=1.00000e-03;AFR_AF=4.54000e-02;AMR_AF=1.40000e-03;SAS_AF=0.00000e+00;DP=18328;AA=T|||;VT=SNP";
- private static final String spec3 = "4,9856624,CA,C"; private static final String spec3Info = "AC=8;AF=1.59744e-03;NS=2504;AN=5008;EAS_AF=0.00000e+00;EUR_AF=0.00000e+00;AFR_AF=6.10000e-03;AMR_AF=0.00000e+00;SAS_AF=0.00000e+00;DP=18519;AA=T|||;VT=SNP";
- private static final String spec4 = "4,9856624,C,CA"; private static final String spec4Info = "AC=75;AF=1.49760e-02;NS=2504;AN=5008;EAS_AF=3.27000e-02;EUR_AF=2.49000e-02;AFR_AF=6.80000e-03;AMR_AF=4.30000e-03;SAS_AF=5.10000e-03;DP=18008;AA=A|||;VT=SNP";
- private static final String spec5 = "4,9856624,CAAAAA,CA"; private static final String spec5Info = "AC=3033;AF=6.05631e-01;NS=2504;AN=5008;EAS_AF=5.23800e-01;EUR_AF=7.54500e-01;AFR_AF=4.28900e-01;AMR_AF=7.82400e-01;SAS_AF=6.50300e-01;DP=20851;VT=INDEL";
-
-
- @BeforeAll
- public static void initializeBinfile() throws Exception {
- VariantMetadataLoader.main(new String[] {"./src/test/resources/test_vcfIndex.tsv", binFile, "target/VariantMetadataStorage.bin"});
-
- if(new File(binFile).exists()) {
- try(ObjectInputStream in = new ObjectInputStream(new GZIPInputStream(new FileInputStream(binFile)))){
- vmi = (VariantMetadataIndex) in.readObject();
- }catch(Exception e) {
- e.printStackTrace();
- }
- }
- }
-
- @Test
- public void test_2a_variantFromFile_1_WasLoaded() {
- String[] data = vmi.findBySingleVariantSpec("14,19038291,C,A", bucketCache);
- String[] expecteds = {"AC=14;AF=2.79553e-03;NS=2504;AN=5008;EAS_AF=0.00000e+00;EUR_AF=1.09000e-02;AFR_AF=0.00000e+00;AMR_AF=4.30000e-03;SAS_AF=0.00000e+00;DP=32694;AA=.|||;VT=SNP"};
- assertEquals("The expected values were not found.", expecteds, data);
- }
-
- @Test
- public void test_2b_variantFromFile_2_WasLoaded() {
- String[] data = vmi.findBySingleVariantSpec("14,21089541,A,G", bucketCache);
- String[] expecteds = {"AC=20;AF=3.99361e-03;NS=2504;AN=5008;EAS_AF=0.00000e+00;EUR_AF=0.00000e+00;AFR_AF=1.44000e-02;AMR_AF=1.40000e-03;SAS_AF=0.00000e+00;DP=18507;AA=A|||;VT=SNP"};
- assertEquals("The expected values were not found.", expecteds, data);
- }
-
- @Test
- public void test_2c_variantFromFile_3_WasNotLoaded() {
- String[] data = vmi.findBySingleVariantSpec("14,21616876,G,A", bucketCache);
- String[] expecteds = {};
- assertEquals("The expected values were not found.", expecteds, data);
- }
-
- @Test
- public void test_4_MultipleVariantSpec() {
- List variants = List.of("14,19038291,C,A", "14,21089541,A,G");
- Map expectedResult = Map.of(
- "14,19038291,C,A"
- , new String[]{"AC=14;AF=2.79553e-03;NS=2504;AN=5008;EAS_AF=0.00000e+00;EUR_AF=1.09000e-02;AFR_AF=0.00000e+00;AMR_AF=4.30000e-03;SAS_AF=0.00000e+00;DP=32694;AA=.|||;VT=SNP"}
- ,"14,21089541,A,G"
- ,new String[]{"AC=20;AF=3.99361e-03;NS=2504;AN=5008;EAS_AF=0.00000e+00;EUR_AF=0.00000e+00;AFR_AF=1.44000e-02;AMR_AF=1.40000e-03;SAS_AF=0.00000e+00;DP=18507;AA=A|||;VT=SNP"});
- Map[] data = new Map[] {vmi.findByMultipleVariantSpec(variants)};
-
- assertEquals("Wrong number of records in response.", data[0].size(), 2);
- variants.stream().forEach(variant->{
- assertEquals("The expected values were not found.", expectedResult.get(variant), data[0].get(variant));
- });
-
- Map[] data2 = new Map[] {vmi.findByMultipleVariantSpec(variants.subList(0, 1))};
-
- assertEquals("Wrong number of records in response.", 1, data2[0].size());
- assertEquals("The expected values were not found.", expectedResult.get(variants.get(0)), data2[0].get(variants.get(0)));
-
- }
-
- @Test
- public void testMultipleVariantSpecSamePOS() {
-
- List variants = List.of(spec1, spec4);
- Map expectedResult = Map.of(
- spec1, new String[]{spec1Info},
- spec4, new String[]{spec4Info});
- Map[] data = new Map[] {vmi.findByMultipleVariantSpec(variants)};
-
- assertEquals("Wrong number of records in response.", data[0].size(), 2);
- variants.stream().forEach(variant->{
- assertEquals("The expected values were not found.", expectedResult.get(variant), data[0].get(variant));
- });
- }
-
- @Test
- public void testMultipleVariantSpecSamePOSAndREF() {
- List variants = List.of(spec1, spec5);
- Map expectedResult = Map.of(
- spec1, new String[]{spec1Info},
- spec5, new String[]{spec5Info});
- Map[] data = new Map[] {vmi.findByMultipleVariantSpec(variants)};
-
- assertEquals("Wrong number of records in response.", data[0].size(), 2);
- variants.stream().forEach(variant->{
- assertEquals("The expected values were not found.", expectedResult.get(variant), data[0].get(variant));
- });
- }
-
- @Test
- public void testMultipleVariantSpecSamePOSAndALT() {
- List variants = List.of(spec1, spec2);
- Map expectedResult = Map.of(
- spec1, new String[]{spec1Info},
- spec2, new String[]{spec2Info});
- Map[] data = new Map[] {vmi.findByMultipleVariantSpec(variants)};
-
- assertEquals("Wrong number of records in response.", data[0].size(), 2);
- variants.stream().forEach(variant->{
- assertEquals("The expected values were not found.", expectedResult.get(variant), data[0].get(variant));
- });
- }
-
- /**
- * The google API that we use throws an IllegalStateException on duplicate entries
- */
- @Test
- public void testMultipleVariantSpecSameSpec() {
- assertThrows(IllegalStateException.class, () -> {
- List variants = List.of(spec1, spec1);
- Map expectedResult = Map.of(
- spec1, new String[]{spec1Info});
- Map[] data = new Map[] {vmi.findByMultipleVariantSpec(variants)};
-
- assertEquals("Wrong number of records in response.", data[0].size(), 1);
- variants.stream().forEach(variant->{
- assertEquals("The expected values were not found.", expectedResult.get(variant), data[0].get(variant));
- });
- });
- }
-
- @Test
- public void testVariantSpecMapSorting() {
- Map specMap = Map.of(
- spec1, new String[]{spec1Info},
- spec2, new String[]{spec2Info});
-
- TreeMap metadataSorted = new TreeMap<>((o1, o2) -> {
- return new VariantSpec(o1).compareTo(new VariantSpec(o2));
- });
- metadataSorted.putAll(specMap);
-
- assertEquals("Wrong number of records in response.", metadataSorted.size(), 2);
- assertNotNull("spec1 value not present in the sorted map", metadataSorted.get(spec1));
- assertEquals("Incorrect spec1 value in the sorted map", spec1Info, metadataSorted.get(spec1)[0]);
- assertNotNull("spec2 value not present in the sorted map", metadataSorted.get(spec2));
- assertEquals("Incorrect spec2 value in the sorted map", spec2Info, metadataSorted.get(spec2)[0]);
-
-
- }
-
-
-
-}
diff --git a/etl/src/test/resources/bucketIndexBySampleTest_vcfIndex.tsv b/etl/src/test/resources/bucketIndexBySampleTest_vcfIndex.tsv
deleted file mode 100755
index e5ca3a23..00000000
--- a/etl/src/test/resources/bucketIndexBySampleTest_vcfIndex.tsv
+++ /dev/null
@@ -1,2 +0,0 @@
-filename chromosome annotated gzip sample_ids patient_ids sample_relationship related_sample_ids
-src/test/resources/BucketIndexTestInput.vcf ALL 1 0 HG00096,HG00097,HG00099,HG00100,HG00101,HG00102,HG00103,HG00105,HG00106,HG00107 1,2,3,4,5,6,7,8,9,10
diff --git a/etl/src/test/resources/log4j.properties b/etl/src/test/resources/log4j.properties
deleted file mode 100644
index 5e798180..00000000
--- a/etl/src/test/resources/log4j.properties
+++ /dev/null
@@ -1,9 +0,0 @@
-# Root logger option
-log4j.rootLogger=INFO, stdout
-
-# Direct log messages to stdout
-log4j.appender.stdout=org.apache.log4j.ConsoleAppender
-log4j.appender.stdout.Target=System.out
-log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
-log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
-
\ No newline at end of file
diff --git a/etl/src/test/resources/test.vcf b/etl/src/test/resources/test.vcf
deleted file mode 100644
index f2e23e3b..00000000
--- a/etl/src/test/resources/test.vcf
+++ /dev/null
@@ -1,1254 +0,0 @@
-##fileformat=VCFv4.2
-##hailversion=0.2.18-08ec699f0fd4
-##FILTER=
-##fileDate=20150218
-##reference=ftp://ftp.1000genomes.ebi.ac.uk//vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz
-##source=1000GenomesPhase3Pipeline
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##contig=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=
-##ALT=