From ecc154615e77a5b188518da2b3ff7822d0c645be Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Tue, 6 Feb 2024 10:05:24 -0500 Subject: [PATCH 01/35] ALS-5819: Add logging to determine patients per variant --- .../avillach/hpds/processing/PatientVariantJoinHandler.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java index 5cc66fd3..0749ed27 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java @@ -78,9 +78,9 @@ public BigInteger getPatientIdsForIntersectionOfVariantSets(Set patient BigInteger heteroMask = masks.heterozygousMask == null ? variantService.emptyBitmask() : masks.heterozygousMask; BigInteger homoMask = masks.homozygousMask == null ? variantService.emptyBitmask() : masks.homozygousMask; BigInteger orMasks = heteroMask.or(homoMask); - BigInteger andMasks = orMasks.and(patientsInScopeMask); + log.info("Patients with variant " + variantSpec + ": " + (orMasks.bitCount())); synchronized(matchingPatients) { - matchingPatients[0] = matchingPatients[0].or(andMasks); + matchingPatients[0] = matchingPatients[0].or(orMasks); } }, () -> missingVariants.add(variantSpec)); }); @@ -90,7 +90,7 @@ public BigInteger getPatientIdsForIntersectionOfVariantSets(Set patient } }); } - return matchingPatients[0]; + return matchingPatients[0].and(patientsInScopeMask); }else { log.error("No matches found for info filters."); return createMaskForPatientSet(new HashSet<>()); From a42401dbe8eb94b8f17c9f131e4768becc12407c Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Tue, 6 Feb 2024 10:09:06 -0500 Subject: [PATCH 02/35] ALS-5819: Add logging to determine patients per variant --- .../avillach/hpds/processing/PatientVariantJoinHandler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java index 0749ed27..9870e671 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java @@ -78,7 +78,7 @@ public BigInteger getPatientIdsForIntersectionOfVariantSets(Set patient BigInteger heteroMask = masks.heterozygousMask == null ? variantService.emptyBitmask() : masks.heterozygousMask; BigInteger homoMask = masks.homozygousMask == null ? variantService.emptyBitmask() : masks.homozygousMask; BigInteger orMasks = heteroMask.or(homoMask); - log.info("Patients with variant " + variantSpec + ": " + (orMasks.bitCount())); + log.info("Patients with variant " + variantSpec + ": " + (orMasks.bitCount() - 4)); synchronized(matchingPatients) { matchingPatients[0] = matchingPatients[0].or(orMasks); } From 0f9520c05110bcef6b159488c39528e7adaec31b Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Mon, 12 Feb 2024 09:54:27 -0500 Subject: [PATCH 03/35] In-process work for adding sparse variant masks. NOT TESTED --- .../data/genotype/VariableVariantMasks.java | 265 ++++++++++++++++++ .../hpds/data/genotype/VariantMask.java | 31 ++ .../data/genotype/VariantMaskBitmaskImpl.java | 59 ++++ .../data/genotype/VariantMaskSparseImpl.java | 57 ++++ .../hpds/data/genotype/VariantStore.java | 24 +- .../genotype/VariantMaskPerformanceTest.java | 66 +++++ .../VariantMasksSerializationTest.java | 27 +- .../etl/genotype/GenomicDatasetMerger.java | 71 ++--- 8 files changed, 533 insertions(+), 67 deletions(-) create mode 100644 data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java create mode 100644 data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java create mode 100644 data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java create mode 100644 data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java create mode 100644 data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskPerformanceTest.java diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java new file mode 100644 index 00000000..2968f56c --- /dev/null +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java @@ -0,0 +1,265 @@ +package edu.harvard.hms.dbmi.avillach.hpds.data.genotype; + +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.databind.annotation.JsonSerialize; +import com.fasterxml.jackson.databind.ser.std.ToStringSerializer; + +import java.io.Serializable; +import java.math.BigInteger; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.ConcurrentHashMap; + +public class VariableVariantMasks implements Serializable { + private static final long serialVersionUID = 6225420483804601477L; + private static final String oneone = "11"; + private static final char one = '1'; + private static final char zero = '0'; + private static final String hetero = "0/1"; + private static final String heteroDel = "1/0"; + private static final String heteroPhased = "0|1"; + private static final String heteroPhased2 = "1|0"; + private static final String homo = "1/1"; + private static final String homoPhased = "1|1"; + private static final String homoNoCall = "./."; + private static final String heteroNoCall = "./1"; + + private static Map emptyBitmaskMap = new ConcurrentHashMap<>(); + + private static int SPARSE_VARIANT_THRESHOLD = 5; + + /* + * These indices result from the char values of the 3 characters in a VCF sample + * record summed as integers % 7 + * + * This allows us to not actually do string comparisons, instead we add 3 values, + * do one modulus operation, then use the result as the index into the result array. + */ + + // ./0 = (46 + 47 + 48) % 7 = 1 + // 0|. = (48 + 124 + 46) % 7 = 1 + // .|0 = (46 + 124 + 48) % 7 = 1 + public static final int HETERO_NOCALL_REF_CHAR_INDEX = 1; + + // ./1 = (46 + 47 + 49) % 7 = 2 + // 1|. = (49 + 124 + 46) % 7 = 2 + // .|1 = (46 + 124 + 49) % 7 = 2 + // ./1 = (46 + 47 + 49) % 7 = 2 + // 1|. = (49 + 124 + 46) % 7 = 2 + // .|1 = (46 + 124 + 49) % 7 = 2 + public static final int HETERO_NOCALL_VARIANT_CHAR_INDEX = 2; + + // 0/0 = (48 + 47 + 48) % 7 = 3 + // 0|0 = (48 + 124 + 48) % 7 = 3 + public static final int ZERO_ZERO_CHAR_INDEX = 3; + + // 0/1 = (48 + 47 + 49) % 7 = 4 + // 1|0 = (49 + 124 + 48) % 7 = 4 + // 0|1 = (48 + 124 + 49) % 7 = 4 + public static final int ZERO_ONE_CHAR_INDEX = 4; + + // 1/1 = (49 + 47 + 49) % 7 = 5 + // 1|1 = (49 + 124 + 49) % 7 = 5 + public static final int ONE_ONE_CHAR_INDEX = 5; + + // ./. = (46 + 47 + 46) % 7 = 6 + // .|. = (46 + 124 + 46) % 7 = 6 + public static final int HOMO_NOCALL_CHAR_INDEX = 6; + + public VariableVariantMasks(String[] values) { + char[] homozygousBits = new char[values.length]; + char[] heterozygousBits = new char[values.length]; + char[] homozygousNoCallBits = new char[values.length]; + char[] heterozygousNoCallBits = new char[values.length]; + length = values.length; + boolean hasHetero = false; + boolean hasHomo = false; + boolean hasHeteroNoCall = false; + boolean hasHomoNoCall = false; + + for(int x = 0;x patientIds; + + @JsonProperty("size") + private int size; + + public VariantMaskSparseImpl(@JsonProperty("ids") Set patientIds, @JsonProperty("size") int size) { + this.patientIds = patientIds; + this.size = size; + } + + @Override + public VariantMask intersection(VariantMask variantMask) { + throw new RuntimeException("Not implemented yet"); + } + + @Override + public VariantMask union(VariantMask variantMask) { + if (variantMask instanceof VariantMaskBitmaskImpl) { + return VariantMask.union(this, (VariantMaskBitmaskImpl) variantMask); + } else if (variantMask instanceof VariantMaskSparseImpl) { + return union((VariantMaskSparseImpl) variantMask); + } else { + throw new RuntimeException("Unknown VariantMask implementation"); + } + } + + private VariantMask union(VariantMaskSparseImpl variantMask) { + HashSet union = new HashSet<>(variantMask.patientIds); + union.addAll(this.patientIds); + return new VariantMaskSparseImpl(union, Math.max(variantMask.size, this.size)); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + VariantMaskSparseImpl that = (VariantMaskSparseImpl) o; + return size == that.size && Objects.equals(patientIds, that.patientIds); + } + + @Override + public int hashCode() { + return Objects.hash(patientIds, size); + } +} diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java index 3efdc3df..c91ebf4d 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java @@ -30,13 +30,13 @@ public class VariantStore implements Serializable { private String[] vcfHeaders = new String[24]; - private Map>> variantMaskStorage = new TreeMap<>(); + private Map>> variantMaskStorage = new TreeMap<>(); - public Map>> getVariantMaskStorage() { + public Map>> getVariantMaskStorage() { return variantMaskStorage; } - public void setVariantMaskStorage(Map>> variantMaskStorage) { + public void setVariantMaskStorage(Map>> variantMaskStorage) { this.variantMaskStorage = variantMaskStorage; } @@ -81,24 +81,24 @@ public Map countVariants() { TreeMap counts = new TreeMap<>(); for (String contig : variantMaskStorage.keySet()) { counts.put(contig, new int[5]); - FileBackedJsonIndexStorage> storage = variantMaskStorage + FileBackedJsonIndexStorage> storage = variantMaskStorage .get(contig); storage.keys().stream().forEach((Integer key) -> { int[] contigCounts = counts.get(contig); - Collection values = storage.get(key).values(); - contigCounts[0] += values.stream().collect(Collectors.summingInt((VariantMasks masks) -> { + Collection values = storage.get(key).values(); + contigCounts[0] += values.stream().collect(Collectors.summingInt((VariableVariantMasks masks) -> { return masks.heterozygousMask != null ? 1 : 0; })); - contigCounts[1] += values.stream().collect(Collectors.summingInt((VariantMasks masks) -> { + contigCounts[1] += values.stream().collect(Collectors.summingInt((VariableVariantMasks masks) -> { return masks.homozygousMask != null ? 1 : 0; })); - contigCounts[2] += values.stream().collect(Collectors.summingInt((VariantMasks masks) -> { + contigCounts[2] += values.stream().collect(Collectors.summingInt((VariableVariantMasks masks) -> { return masks.heterozygousNoCallMask != null ? 1 : 0; })); - contigCounts[3] += values.stream().collect(Collectors.summingInt((VariantMasks masks) -> { + contigCounts[3] += values.stream().collect(Collectors.summingInt((VariableVariantMasks masks) -> { return masks.homozygousNoCallMask != null ? 1 : 0; })); - contigCounts[4] += values.stream().collect(Collectors.summingInt((VariantMasks masks) -> { + contigCounts[4] += values.stream().collect(Collectors.summingInt((VariableVariantMasks masks) -> { return masks.heterozygousMask != null || masks.homozygousMask != null || masks.heterozygousNoCallMask != null || masks.homozygousNoCallMask != null ? 1 : 0; })); @@ -115,7 +115,7 @@ public String[] getPatientIds() { return patientIds; } - public Optional getMasks(String variant, VariantBucketHolder bucketCache) throws IOException { + public Optional getMasks(String variant, VariantBucketHolder bucketCache) throws IOException { String[] segments = variant.split(","); if (segments.length < 2) { log.error("Less than 2 segments found in this variant : " + variant); @@ -134,7 +134,7 @@ public Optional getMasks(String variant, VariantBucketHolder> indexedStorage = variantMaskStorage.get(contig); + FileBackedJsonIndexStorage> indexedStorage = variantMaskStorage.get(contig); if (indexedStorage == null) { return Optional.empty(); } diff --git a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskPerformanceTest.java b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskPerformanceTest.java new file mode 100644 index 00000000..803c9b94 --- /dev/null +++ b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskPerformanceTest.java @@ -0,0 +1,66 @@ +package edu.harvard.hms.dbmi.avillach.hpds.data.genotype; + +import org.junit.jupiter.api.Test; + +import java.math.BigInteger; +import java.util.Random; +import java.util.Set; + +public class VariantMaskPerformanceTest { + + /** + * This test shows the ideal maximum size for a sparse variant is between 5-10 items. The slight decrease in performance + * is generally worth the drastic reduction in disks space + */ + //@Test + public void test() { + VariantMaskBitmaskImpl mask100k = new VariantMaskBitmaskImpl(generateRandomBitmask(100_000)); + VariantMaskBitmaskImpl mask100k2 = new VariantMaskBitmaskImpl(generateRandomBitmask(100_000)); + VariantMaskBitmaskImpl mask1m = new VariantMaskBitmaskImpl(generateRandomBitmask(1_000_000)); + VariantMaskBitmaskImpl mask1m2 = new VariantMaskBitmaskImpl(generateRandomBitmask(1_000_000)); + VariantMaskSparseImpl sparseMask100k = new VariantMaskSparseImpl(Set.of(100, 200, 400, 50_000, 90_000), 100_000); + VariantMaskSparseImpl sparseMask100k2 = new VariantMaskSparseImpl(Set.of(100, 101, 200, 300, 400, 1000, 20_000, 30_000, 50_000, 90_000), 100_000); + VariantMaskSparseImpl sparseMask1m = new VariantMaskSparseImpl(Set.of(100, 200, 400, 50_000, 90_000, 300_000, 420_000, 555_555, 867_530, 999_999), 1_000_000); + + long time = System.currentTimeMillis(); + for (int k = 0; k < 1000; k++) { + VariantMask and = mask100k.union(mask100k2); + } + System.out.println(mask100k.getBitmask().bitLength() + " bitmask union completed in " + (System.currentTimeMillis() - time) + " ms"); + + time = System.currentTimeMillis(); + for (int k = 0; k < 1000; k++) { + VariantMask and = sparseMask100k.union(mask100k); + } + System.out.println(mask100k.getBitmask().bitLength() + " bitmask and " + sparseMask100k.patientIds.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); + + time = System.currentTimeMillis(); + for (int k = 0; k < 1000; k++) { + VariantMask and = sparseMask100k2.union(mask100k); + } + System.out.println(mask100k.getBitmask().bitLength() + " bitmask and " + sparseMask100k2.patientIds.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); + + time = System.currentTimeMillis(); + for (int k = 0; k < 1000; k++) { + VariantMask and = mask1m.union(mask1m2); + } + System.out.println(mask1m.getBitmask().bitLength() + " bitmask union completed in " + (System.currentTimeMillis() - time) + " ms"); + + time = System.currentTimeMillis(); + for (int k = 0; k < 1000; k++) { + VariantMask and = sparseMask100k.union(mask1m); + } + System.out.println(mask1m.getBitmask().bitLength() + " bitmask and " + sparseMask100k.patientIds.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); + + time = System.currentTimeMillis(); + for (int k = 0; k < 1000; k++) { + VariantMask and = sparseMask1m.union(mask1m); + } + System.out.println(mask1m.getBitmask().bitLength() + " bitmask and " + sparseMask1m.patientIds.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); + + } + + private BigInteger generateRandomBitmask(int patients) { + return new BigInteger(patients + 4, new Random()); + } +} diff --git a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMasksSerializationTest.java b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMasksSerializationTest.java index d3b12fc2..5879b809 100644 --- a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMasksSerializationTest.java +++ b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMasksSerializationTest.java @@ -5,6 +5,7 @@ import org.junit.jupiter.api.Test; import java.math.BigInteger; +import java.util.Set; import java.util.concurrent.ThreadLocalRandom; import static org.junit.jupiter.api.Assertions.*; @@ -16,25 +17,25 @@ public void testFieldMaxLength() throws JsonProcessingException { VariantMasks variantMasks = new VariantMasks(); StringBuilder homozygousStr = new StringBuilder(); - for (int i = 0 ; i < 5000; i++) { + for (int i = 0 ; i < 50000; i++) { homozygousStr.append(ThreadLocalRandom.current().nextInt(0, 10)); } variantMasks.homozygousMask = new BigInteger(homozygousStr.toString()); StringBuilder heterozygousStr = new StringBuilder(); - for (int i = 0 ; i < 5000; i++) { + for (int i = 0 ; i < 50000; i++) { heterozygousStr.append(ThreadLocalRandom.current().nextInt(0, 10)); } variantMasks.heterozygousMask = new BigInteger(heterozygousStr.toString()); StringBuilder homozygousNoCallStr = new StringBuilder(); - for (int i = 0 ; i < 5000; i++) { + for (int i = 0 ; i < 50000; i++) { homozygousNoCallStr.append(ThreadLocalRandom.current().nextInt(0, 10)); } variantMasks.homozygousNoCallMask = new BigInteger(homozygousNoCallStr.toString()); StringBuilder heterozygousNoCallStr = new StringBuilder(); - for (int i = 0 ; i < 5000; i++) { + for (int i = 0 ; i < 50000; i++) { heterozygousNoCallStr.append(ThreadLocalRandom.current().nextInt(0, 10)); } variantMasks.heterozygousNoCallMask = new BigInteger(heterozygousNoCallStr.toString()); @@ -45,4 +46,22 @@ public void testFieldMaxLength() throws JsonProcessingException { assertEquals(variantMasks, deserialized); } + + + + @Test + public void testVariableVariantMasks() throws JsonProcessingException { + VariableVariantMasks variableVariantMasks = new VariableVariantMasks(); + variableVariantMasks.heterozygousMask = new VariantMaskSparseImpl(Set.of(1, 2, 3), 1000); + variableVariantMasks.homozygousMask = new VariantMaskBitmaskImpl(new BigInteger("1010101010101010")); + variableVariantMasks.heterozygousNoCallMask = new VariantMaskSparseImpl(Set.of(), 1000); + variableVariantMasks.homozygousNoCallMask = null; + + ObjectMapper objectMapper = new ObjectMapper(); + String serialized = objectMapper.writeValueAsString(variableVariantMasks); + System.out.println(serialized); + VariableVariantMasks deserialized = objectMapper.readValue(serialized, VariableVariantMasks.class); + + assertEquals(variableVariantMasks, deserialized); + } } diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index 88b4fd14..d4f51026 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -15,8 +15,6 @@ import java.util.concurrent.ConcurrentSkipListSet; import java.util.stream.Collectors; import java.util.stream.Stream; -import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; public class GenomicDatasetMerger { @@ -55,7 +53,7 @@ private void validate() { } } - public VariantStore mergeVariantStore(Map>> mergedChromosomeMasks) { + public VariantStore mergeVariantStore(Map>> mergedChromosomeMasks) { mergedVariantStore.setVariantMaskStorage(mergedChromosomeMasks); mergedVariantStore.setPatientIds(mergePatientIds()); return mergedVariantStore; @@ -165,8 +163,8 @@ private String[] mergePatientIds() { * For each chromosome, call mergeChromosomeMask to merge the masks * @return */ - public Map>> mergeChromosomeMasks() { - Map>> mergedMaskStorage = new ConcurrentHashMap<>(); + public Map>> mergeChromosomeMasks() { + Map>> mergedMaskStorage = new ConcurrentHashMap<>(); variantStore1.getVariantMaskStorage().keySet().forEach(chromosome -> { try { mergedMaskStorage.put(chromosome, mergeChromosomeMask(chromosome)); @@ -219,77 +217,48 @@ public Map> mergeChromosomeMask(String chromosome) throws FileNotFoundException { - FileBackedJsonIndexStorage> variantMaskStorage1 = variantStore1.getVariantMaskStorage().get(chromosome); - FileBackedJsonIndexStorage> variantMaskStorage2 = variantStore2.getVariantMaskStorage().get(chromosome); + public FileBackedJsonIndexStorage> mergeChromosomeMask(String chromosome) throws FileNotFoundException { + FileBackedJsonIndexStorage> variantMaskStorage1 = variantStore1.getVariantMaskStorage().get(chromosome); + FileBackedJsonIndexStorage> variantMaskStorage2 = variantStore2.getVariantMaskStorage().get(chromosome); - FileBackedJsonIndexStorage> merged = new FileBackedStorageVariantMasksImpl(new File(outputDirectory + chromosome + "masks.bin")); + FileBackedJsonIndexStorage> merged = new FileBackedStorageVariantMasksImpl(new File(outputDirectory + chromosome + "masks.bin")); variantMaskStorage1.keys().parallelStream().forEach(key -> { - Map masks1 = variantMaskStorage1.get(key); - Map masks2 = variantMaskStorage2.get(key); + Map masks1 = variantMaskStorage1.get(key); + Map masks2 = variantMaskStorage2.get(key); if (masks2 == null) { masks2 = Map.of(); } - ConcurrentHashMap mergedMasks = new ConcurrentHashMap<>(); - for (Map.Entry entry : masks1.entrySet()) { - VariantMasks variantMasks2 = masks2.get(entry.getKey()); + ConcurrentHashMap mergedMasks = new ConcurrentHashMap<>(); + for (Map.Entry entry : masks1.entrySet()) { + VariableVariantMasks variantMasks2 = masks2.get(entry.getKey()); if (variantMasks2 == null) { // this will have all null masks, which will result in null when // appended to a null, or be replaced with an empty bitmask otherwise - variantMasks2 = new VariantMasks(); + variantMasks2 = new VariableVariantMasks(); } - mergedMasks.put(entry.getKey(), append(entry.getValue(), variantMasks2)); + mergedMasks.put(entry.getKey(), entry.getValue().append(variantMasks2)); } // Any entry in the second set that is not in the merged set can be merged with an empty variant mask, // if there were a corresponding entry in set 1, it would have been merged in the previous loop - for (Map.Entry entry : masks2.entrySet()) { + for (Map.Entry entry : masks2.entrySet()) { if (!mergedMasks.containsKey(entry.getKey())) { + // todo: store empty variant masks as sparse to avoid this awkward null check mergedMasks.put(entry.getKey(), append(new VariantMasks(), entry.getValue())); } } merged.put(key, mergedMasks); }); - ConcurrentHashMap mergedMasks = new ConcurrentHashMap<>(); + ConcurrentHashMap mergedMasks = new ConcurrentHashMap<>(); variantMaskStorage2.keys().forEach(key -> { - Map masks2 = variantMaskStorage2.get(key); - for (Map.Entry entry : masks2.entrySet()) { + Map masks2 = variantMaskStorage2.get(key); + for (Map.Entry entry : masks2.entrySet()) { if (!mergedMasks.containsKey(entry.getKey())) { - mergedMasks.put(entry.getKey(), append(new VariantMasks(), entry.getValue())); + mergedMasks.put(entry.getKey(), append(new VariableVariantMasks(), entry.getValue())); } } }); return merged; } - - public VariantMasks append(VariantMasks variantMasks1, VariantMasks variantMasks2) { - VariantMasks appendedMasks = new VariantMasks(); - appendedMasks.homozygousMask = appendMask(variantMasks1.homozygousMask, variantMasks2.homozygousMask); - appendedMasks.heterozygousMask = appendMask(variantMasks1.heterozygousMask, variantMasks2.heterozygousMask); - appendedMasks.homozygousNoCallMask = appendMask(variantMasks1.homozygousNoCallMask, variantMasks2.homozygousNoCallMask); - appendedMasks.heterozygousNoCallMask = appendMask(variantMasks1.heterozygousNoCallMask, variantMasks2.heterozygousNoCallMask); - return appendedMasks; - } - - /** - * Appends one mask to another. This assumes the masks are both padded with '11' on each end - * to prevent overflow issues. - */ - public BigInteger appendMask(BigInteger mask1, BigInteger mask2) { - if (mask1 == null && mask2 == null) { - return null; - } - if (mask1 == null) { - mask1 = variantStore1.emptyBitmask(); - } - if (mask2 == null) { - mask2 = variantStore2.emptyBitmask(); - } - String binaryMask1 = mask1.toString(2); - String binaryMask2 = mask2.toString(2); - String appendedString = binaryMask1.substring(0, binaryMask1.length() - 2) + - binaryMask2.substring(2); - return new BigInteger(appendedString, 2); - } } From 8cf10bd9dc3bc1cdf7c65448f70ba0c79510b9c2 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 21 Feb 2024 09:43:53 -0500 Subject: [PATCH 04/35] ALS-5819: Update vcf loader to use variable variant indecies --- .../data/genotype/BucketIndexBySample.java | 20 +- .../data/genotype/VariableVariantMasks.java | 206 ++++++++---------- .../hpds/data/genotype/VariantMask.java | 2 +- .../data/genotype/VariantMaskSparseImpl.java | 35 +-- .../FileBackedStorageVariantMasksImpl.java | 9 +- .../genotype/VariantMaskPerformanceTest.java | 17 +- .../VariantMasksSerializationTest.java | 6 +- .../etl/genotype/GenomicDatasetMerger.java | 28 ++- .../hpds/etl/genotype/NewVCFLoader.java | 49 +++-- 9 files changed, 179 insertions(+), 193 deletions(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java index 341c4794..3b89a329 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java @@ -43,7 +43,7 @@ public BucketIndexBySample(VariantStore variantStore, String storageDir) throws //Create a bucketList, containing keys for all buckets in the variantStore for(String contig: contigSet){ - FileBackedByteIndexedStorage> contigStore = variantStore.getVariantMaskStorage().get(contig); + FileBackedByteIndexedStorage> contigStore = variantStore.getVariantMaskStorage().get(contig); if(contigStore != null && contigStore.keys() != null) { bucketList.addAll(contigStore.keys().stream().map( (Integer bucket)->{ @@ -74,7 +74,7 @@ public BucketIndexBySample(VariantStore variantStore, String storageDir) throws patientBucketCharMasks[x] = emptyBucketMaskChar(); } contigSet.parallelStream().forEach((contig)->{ - FileBackedByteIndexedStorage> contigStore = + FileBackedByteIndexedStorage> contigStore = variantStore.getVariantMaskStorage().get(contig); if(contigStore != null && contigStore.keys() != null) { contigStore.keys().stream().forEach( @@ -82,22 +82,24 @@ public BucketIndexBySample(VariantStore variantStore, String storageDir) throws String bucketKey = contig + ":" + bucket; // Create a bitmask with 1 values for each patient who has any variant in this bucket - BigInteger[] patientMaskForBucket = {variantStore.emptyBitmask()}; - contigStore.get(bucket).values().forEach((VariantMasks masks)->{ + VariantMask[] patientMaskForBucket = {new VariantMaskSparseImpl(Set.of())}; + contigStore.get(bucket).values().forEach((VariableVariantMasks masks)->{ if(masks.heterozygousMask!=null) { - patientMaskForBucket[0] = patientMaskForBucket[0].or(masks.heterozygousMask); + patientMaskForBucket[0] = patientMaskForBucket[0].union(masks.heterozygousMask); } //add hetreo no call bits to mask if(masks.heterozygousNoCallMask!=null) { - patientMaskForBucket[0] = patientMaskForBucket[0].or(masks.heterozygousNoCallMask); + patientMaskForBucket[0] = patientMaskForBucket[0].union(masks.heterozygousNoCallMask); } if(masks.homozygousMask!=null) { - patientMaskForBucket[0] = patientMaskForBucket[0].or(masks.homozygousMask); + patientMaskForBucket[0] = patientMaskForBucket[0].union(masks.homozygousMask); } }); // For each patient set the patientBucketCharMask entry to 0 or 1 if they have a variant in the bucket. - int maxIndex = patientMaskForBucket[0].bitLength() - 1; + + // todo: implement for variant explorer + /*int maxIndex = patientMaskForBucket[0].bitLength() - 1; int indexOfBucket = Collections.binarySearch(bucketList, bucketKey) + 2; //patientBucketCharMasks has bookend bits for(int x = 2;x emptyBitmaskMap = new ConcurrentHashMap<>(); - private static int SPARSE_VARIANT_THRESHOLD = 5; + public static int SPARSE_VARIANT_THRESHOLD = 5; /* * These indices result from the char values of the 3 characters in a VCF sample @@ -68,121 +63,45 @@ public class VariableVariantMasks implements Serializable { // .|. = (46 + 124 + 46) % 7 = 6 public static final int HOMO_NOCALL_CHAR_INDEX = 6; - public VariableVariantMasks(String[] values) { - char[] homozygousBits = new char[values.length]; - char[] heterozygousBits = new char[values.length]; - char[] homozygousNoCallBits = new char[values.length]; - char[] heterozygousNoCallBits = new char[values.length]; - length = values.length; - boolean hasHetero = false; - boolean hasHomo = false; - boolean hasHeteroNoCall = false; - boolean hasHomoNoCall = false; - - for(int x = 0;x VariableVariantMasks.SPARSE_VARIANT_THRESHOLD) { + variantMask = new VariantMaskBitmaskImpl(bitmask); + } else { + Set patientIndexes = new HashSet<>(); + for(int i = 2; i < bitmask.bitLength() - 2; i++) { + if (bitmask.testBit(i)) { + patientIndexes.add(i); + } + } + variantMask = new VariantMaskSparseImpl(patientIndexes); + } + return variantMask; } public VariableVariantMasks() { } + public VariableVariantMasks(int length) { + this.length = length; + } @JsonProperty("ho") public VariantMask homozygousMask; @@ -252,14 +171,69 @@ public BigInteger appendMask(BigInteger mask1, int mask1Length, BigInteger mask2 public VariableVariantMasks append(VariableVariantMasks variantMasks) { VariableVariantMasks appendedMasks = new VariableVariantMasks(); - appendedMasks.homozygousMask = appendMask(this.homozygousMask, variantMasks.homozygousMask); - appendedMasks.heterozygousMask = appendMask(this.heterozygousMask, variantMasks.heterozygousMask); - appendedMasks.homozygousNoCallMask = appendMask(this.homozygousNoCallMask, variantMasks.homozygousNoCallMask); - appendedMasks.heterozygousNoCallMask = appendMask(this.heterozygousNoCallMask, variantMasks.heterozygousNoCallMask); + appendedMasks.homozygousMask = appendMask(this.homozygousMask, variantMasks.homozygousMask, this.length, variantMasks.length); + appendedMasks.heterozygousMask = appendMask(this.heterozygousMask, variantMasks.heterozygousMask, this.length, variantMasks.length); + appendedMasks.homozygousNoCallMask = appendMask(this.homozygousNoCallMask, variantMasks.homozygousNoCallMask, this.length, variantMasks.length); + appendedMasks.heterozygousNoCallMask = appendMask(this.heterozygousNoCallMask, variantMasks.heterozygousNoCallMask, this.length, variantMasks.length); return appendedMasks; } - private VariantMask appendMask(VariantMask homozygousMask, VariantMask homozygousMask1) { - return null; + private VariantMask appendMask(VariantMask variantMask1, VariantMask variantMask2, int length1, int length2) { + if (variantMask1 instanceof VariantMaskSparseImpl) { + if (variantMask2 instanceof VariantMaskSparseImpl) { + return append((VariantMaskSparseImpl) variantMask1, (VariantMaskSparseImpl) variantMask2, length1, length2); + } else if (variantMask2 instanceof VariantMaskBitmaskImpl) { + return append((VariantMaskSparseImpl) variantMask1, (VariantMaskBitmaskImpl) variantMask2, length1, length2); + } else { + throw new RuntimeException("Unknown VariantMask implementation"); + } + } + // todo: bitmask + else { + throw new RuntimeException("Unknown VariantMask implementation"); + } + } + + private VariantMask append(VariantMaskSparseImpl variantMask1, VariantMaskBitmaskImpl variantMask2, int length1, int length2) { + BigInteger mask1 = emptyBitmask(length1); + for (Integer patientId : variantMask1.patientIndexes) { + mask1 = mask1.setBit(patientId); + } + String binaryMask1 = mask1.toString(2); + String binaryMask2 = variantMask2.bitmask.toString(2); + String appendedString = binaryMask1.substring(0, binaryMask1.length() - 2) + + binaryMask2.substring(2); + return new VariantMaskBitmaskImpl(new BigInteger(appendedString, 2)); + } + private VariantMask append(VariantMaskSparseImpl variantMask1, VariantMaskSparseImpl variantMask2, int length1, int length2) { + if (variantMask1.patientIndexes.size() + variantMask2.patientIndexes.size() > SPARSE_VARIANT_THRESHOLD) { + // todo: performance test this vs byte array + BigInteger mask = emptyBitmask(length1 + length2); + for (Integer patientId : variantMask1.patientIndexes) { + mask = mask.setBit(patientId + 2); + } + // todo: explain this. it is not intuitive + for (Integer patientId : variantMask2.patientIndexes) { + mask = mask.setBit(patientId + length1 + 2); + } + return new VariantMaskBitmaskImpl(mask); + } + else { + Set patientIndexSet = new HashSet<>(); + patientIndexSet.addAll(variantMask1.patientIndexes); + patientIndexSet.addAll(variantMask2.patientIndexes); + return new VariantMaskSparseImpl(patientIndexSet); + } + } + +/* if (mask1 == null) { + mask1 = variantStore1.emptyBitmask(); + } + if (mask2 == null) { + mask2 = variantStore2.emptyBitmask(); } + String binaryMask1 = mask1.toString(2); + String binaryMask2 = mask2.toString(2); + String appendedString = binaryMask1.substring(0, binaryMask1.length() - 2) + + binaryMask2.substring(2);*/ } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java index 8379734c..ed828d67 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java @@ -23,7 +23,7 @@ public interface VariantMask { static VariantMask union(VariantMaskSparseImpl variantMaskSparse, VariantMaskBitmaskImpl variantMaskBitmask) { BigInteger union = variantMaskBitmask.bitmask; - for (Integer patientId : variantMaskSparse.patientIds) { + for (Integer patientId : variantMaskSparse.patientIndexes) { union = union.setBit(patientId + 2); } return new VariantMaskBitmaskImpl(union); diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java index b88efc33..8f0af1af 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java @@ -2,22 +2,20 @@ import com.fasterxml.jackson.annotation.JsonProperty; -import java.math.BigInteger; import java.util.HashSet; -import java.util.Objects; import java.util.Set; public class VariantMaskSparseImpl implements VariantMask { - @JsonProperty("ids") - protected Set patientIds; + @JsonProperty("p") + protected Set patientIndexes; - @JsonProperty("size") - private int size; + public VariantMaskSparseImpl(@JsonProperty("p") Set patientIndexes) { + this.patientIndexes = patientIndexes; + } - public VariantMaskSparseImpl(@JsonProperty("ids") Set patientIds, @JsonProperty("size") int size) { - this.patientIds = patientIds; - this.size = size; + public Set getPatientIndexes() { + return patientIndexes; } @Override @@ -37,21 +35,8 @@ public VariantMask union(VariantMask variantMask) { } private VariantMask union(VariantMaskSparseImpl variantMask) { - HashSet union = new HashSet<>(variantMask.patientIds); - union.addAll(this.patientIds); - return new VariantMaskSparseImpl(union, Math.max(variantMask.size, this.size)); - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - VariantMaskSparseImpl that = (VariantMaskSparseImpl) o; - return size == that.size && Objects.equals(patientIds, that.patientIds); - } - - @Override - public int hashCode() { - return Objects.hash(patientIds, size); + HashSet union = new HashSet<>(variantMask.patientIndexes); + union.addAll(this.patientIndexes); + return new VariantMaskSparseImpl(union); } } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/storage/FileBackedStorageVariantMasksImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/storage/FileBackedStorageVariantMasksImpl.java index 9d326a8c..bdd94a51 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/storage/FileBackedStorageVariantMasksImpl.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/storage/FileBackedStorageVariantMasksImpl.java @@ -1,6 +1,7 @@ package edu.harvard.hms.dbmi.avillach.hpds.data.storage; import com.fasterxml.jackson.core.type.TypeReference; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariableVariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJsonIndexStorage; @@ -9,17 +10,17 @@ import java.io.Serializable; import java.util.concurrent.ConcurrentHashMap; -public class FileBackedStorageVariantMasksImpl extends FileBackedJsonIndexStorage> implements Serializable { +public class FileBackedStorageVariantMasksImpl extends FileBackedJsonIndexStorage> implements Serializable { private static final long serialVersionUID = -1086729119489479152L; public FileBackedStorageVariantMasksImpl(File storageFile) throws FileNotFoundException { super(storageFile); } - private static final TypeReference> typeRef - = new TypeReference>() {}; + private static final TypeReference> typeRef + = new TypeReference>() {}; @Override - public TypeReference> getTypeReference() { + public TypeReference> getTypeReference() { return typeRef; } } diff --git a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskPerformanceTest.java b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskPerformanceTest.java index 803c9b94..aa470219 100644 --- a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskPerformanceTest.java +++ b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskPerformanceTest.java @@ -1,7 +1,5 @@ package edu.harvard.hms.dbmi.avillach.hpds.data.genotype; -import org.junit.jupiter.api.Test; - import java.math.BigInteger; import java.util.Random; import java.util.Set; @@ -18,9 +16,9 @@ public void test() { VariantMaskBitmaskImpl mask100k2 = new VariantMaskBitmaskImpl(generateRandomBitmask(100_000)); VariantMaskBitmaskImpl mask1m = new VariantMaskBitmaskImpl(generateRandomBitmask(1_000_000)); VariantMaskBitmaskImpl mask1m2 = new VariantMaskBitmaskImpl(generateRandomBitmask(1_000_000)); - VariantMaskSparseImpl sparseMask100k = new VariantMaskSparseImpl(Set.of(100, 200, 400, 50_000, 90_000), 100_000); - VariantMaskSparseImpl sparseMask100k2 = new VariantMaskSparseImpl(Set.of(100, 101, 200, 300, 400, 1000, 20_000, 30_000, 50_000, 90_000), 100_000); - VariantMaskSparseImpl sparseMask1m = new VariantMaskSparseImpl(Set.of(100, 200, 400, 50_000, 90_000, 300_000, 420_000, 555_555, 867_530, 999_999), 1_000_000); + VariantMaskSparseImpl sparseMask100k = new VariantMaskSparseImpl(Set.of(100, 200, 400, 50_000, 90_000)); + VariantMaskSparseImpl sparseMask100k2 = new VariantMaskSparseImpl(Set.of(100, 101, 200, 300, 400, 1000, 20_000, 30_000, 50_000, 90_000)); + VariantMaskSparseImpl sparseMask1m = new VariantMaskSparseImpl(Set.of(100, 200, 400, 50_000, 90_000, 300_000, 420_000, 555_555, 867_530, 999_999)); long time = System.currentTimeMillis(); for (int k = 0; k < 1000; k++) { @@ -32,13 +30,13 @@ public void test() { for (int k = 0; k < 1000; k++) { VariantMask and = sparseMask100k.union(mask100k); } - System.out.println(mask100k.getBitmask().bitLength() + " bitmask and " + sparseMask100k.patientIds.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); + System.out.println(mask100k.getBitmask().bitLength() + " bitmask and " + sparseMask100k.patientIndexes.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); time = System.currentTimeMillis(); for (int k = 0; k < 1000; k++) { VariantMask and = sparseMask100k2.union(mask100k); } - System.out.println(mask100k.getBitmask().bitLength() + " bitmask and " + sparseMask100k2.patientIds.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); + System.out.println(mask100k.getBitmask().bitLength() + " bitmask and " + sparseMask100k2.patientIndexes.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); time = System.currentTimeMillis(); for (int k = 0; k < 1000; k++) { @@ -50,17 +48,18 @@ public void test() { for (int k = 0; k < 1000; k++) { VariantMask and = sparseMask100k.union(mask1m); } - System.out.println(mask1m.getBitmask().bitLength() + " bitmask and " + sparseMask100k.patientIds.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); + System.out.println(mask1m.getBitmask().bitLength() + " bitmask and " + sparseMask100k.patientIndexes.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); time = System.currentTimeMillis(); for (int k = 0; k < 1000; k++) { VariantMask and = sparseMask1m.union(mask1m); } - System.out.println(mask1m.getBitmask().bitLength() + " bitmask and " + sparseMask1m.patientIds.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); + System.out.println(mask1m.getBitmask().bitLength() + " bitmask and " + sparseMask1m.patientIndexes.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); } private BigInteger generateRandomBitmask(int patients) { return new BigInteger(patients + 4, new Random()); } + } diff --git a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMasksSerializationTest.java b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMasksSerializationTest.java index 5879b809..6bdbb3fc 100644 --- a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMasksSerializationTest.java +++ b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMasksSerializationTest.java @@ -52,9 +52,9 @@ public void testFieldMaxLength() throws JsonProcessingException { @Test public void testVariableVariantMasks() throws JsonProcessingException { VariableVariantMasks variableVariantMasks = new VariableVariantMasks(); - variableVariantMasks.heterozygousMask = new VariantMaskSparseImpl(Set.of(1, 2, 3), 1000); - variableVariantMasks.homozygousMask = new VariantMaskBitmaskImpl(new BigInteger("1010101010101010")); - variableVariantMasks.heterozygousNoCallMask = new VariantMaskSparseImpl(Set.of(), 1000); + variableVariantMasks.heterozygousMask = new VariantMaskSparseImpl(Set.of(1, 2, 3)); + variableVariantMasks.homozygousMask = new VariantMaskBitmaskImpl(new BigInteger("1101010101010101011")); + variableVariantMasks.heterozygousNoCallMask = new VariantMaskSparseImpl(Set.of()); variableVariantMasks.homozygousNoCallMask = null; ObjectMapper objectMapper = new ObjectMapper(); diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index d4f51026..09501345 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -235,7 +235,7 @@ public FileBackedJsonIndexStorage entry : masks2.entrySet()) { if (!mergedMasks.containsKey(entry.getKey())) { - // todo: store empty variant masks as sparse to avoid this awkward null check - mergedMasks.put(entry.getKey(), append(new VariantMasks(), entry.getValue())); + mergedMasks.put(entry.getKey(), new VariableVariantMasks(variantStore1.getPatientIds().length).append(entry.getValue())); } } merged.put(key, mergedMasks); @@ -255,10 +254,31 @@ public FileBackedJsonIndexStorage masks2 = variantMaskStorage2.get(key); for (Map.Entry entry : masks2.entrySet()) { if (!mergedMasks.containsKey(entry.getKey())) { - mergedMasks.put(entry.getKey(), append(new VariableVariantMasks(), entry.getValue())); + mergedMasks.put(entry.getKey(), new VariableVariantMasks(variantStore1.getPatientIds().length).append(entry.getValue())); } } }); return merged; } + + /** + * Appends one mask to another. This assumes the masks are both padded with '11' on each end + * to prevent overflow issues. + */ + /*public BigInteger appendMask(BigInteger mask1, BigInteger mask2) { + if (mask1 == null && mask2 == null) { + return null; + } + if (mask1 == null) { + mask1 = variantStore1.emptyBitmask(); + } + if (mask2 == null) { + mask2 = variantStore2.emptyBitmask(); + } + String binaryMask1 = mask1.toString(2); + String binaryMask2 = mask2.toString(2); + String appendedString = binaryMask1.substring(0, binaryMask1.length() - 2) + + binaryMask2.substring(2); + return new BigInteger(appendedString, 2); + }*/ } diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java index 7727aba2..43f6c351 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java @@ -8,6 +8,7 @@ import java.util.stream.Collectors; import java.util.zip.GZIPOutputStream; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.*; import edu.harvard.hms.dbmi.avillach.hpds.data.storage.FileBackedStorageVariantMasksImpl; import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJsonIndexStorage; import org.apache.commons.csv.CSVFormat; @@ -16,9 +17,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.InfoStore; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantStore; import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedByteIndexedStorage; import htsjdk.samtools.util.BlockCompressedInputStream; @@ -60,7 +58,7 @@ public static void main(String[] args) throws FileNotFoundException, IOException private static HashMap zygosityMaskStrings; - private static TreeMap>> variantMaskStorage = new TreeMap<>(); + private static TreeMap>> variantMaskStorage = new TreeMap<>(); private static long startTime; @@ -180,7 +178,7 @@ private static void loadVCFs(File indexFile) throws IOException { int[] count = { 0 }; for (String contig : store.getVariantMaskStorage().keySet()) { ArrayList chunkIds = new ArrayList<>(); - FileBackedByteIndexedStorage> chromosomeStorage = store.getVariantMaskStorage() + FileBackedByteIndexedStorage> chromosomeStorage = store.getVariantMaskStorage() .get(contig); if (chromosomeStorage != null) { // print out the top and bottom 50 variants in the store (that have masks) @@ -188,11 +186,11 @@ private static void loadVCFs(File indexFile) throws IOException { for (Integer chunkId : chunkIds) { for (String variantSpec : chromosomeStorage.get(chunkId).keySet()) { count[0]++; - VariantMasks variantMasks = chromosomeStorage.get(chunkId).get(variantSpec); + VariableVariantMasks variantMasks = chromosomeStorage.get(chunkId).get(variantSpec); if (variantMasks != null) { - BigInteger heterozygousMask = variantMasks.heterozygousMask; + VariantMask heterozygousMask = variantMasks.heterozygousMask; String heteroIdList = sampleIdsForMask(allSampleIds, heterozygousMask); - BigInteger homozygousMask = variantMasks.homozygousMask; + VariantMask homozygousMask = variantMasks.homozygousMask; String homoIdList = sampleIdsForMask(allSampleIds, homozygousMask); if (!heteroIdList.isEmpty() && heteroIdList.length() < 1000) @@ -210,11 +208,11 @@ private static void loadVCFs(File indexFile) throws IOException { int chunkId = chunkIds.get(x); chromosomeStorage.get(chunkId).keySet().forEach((variantSpec) -> { count[0]++; - VariantMasks variantMasks = chromosomeStorage.get(chunkId).get(variantSpec); + VariableVariantMasks variantMasks = chromosomeStorage.get(chunkId).get(variantSpec); if (variantMasks != null) { - BigInteger heterozygousMask = variantMasks.heterozygousMask; + VariantMask heterozygousMask = variantMasks.heterozygousMask; String heteroIdList = sampleIdsForMask(allSampleIds, heterozygousMask); - BigInteger homozygousMask = variantMasks.homozygousMask; + VariantMask homozygousMask = variantMasks.homozygousMask; String homoIdList = sampleIdsForMask(allSampleIds, homozygousMask); if (!heteroIdList.isEmpty() && heteroIdList.length() < 1000) @@ -234,12 +232,19 @@ private static void loadVCFs(File indexFile) throws IOException { saveVariantStore(store, variantMaskStorage); } - private static String sampleIdsForMask(String[] sampleIds, BigInteger heterozygousMask) { + private static String sampleIdsForMask(String[] sampleIds, VariantMask variantMask) { String idList = ""; - if (heterozygousMask != null) { - for (int x = 2; x < heterozygousMask.bitLength() - 2; x++) { - if (heterozygousMask.testBit(heterozygousMask.bitLength() - 1 - x)) { - idList += sampleIds[x - 2] + ","; + if (variantMask != null) { + if (variantMask instanceof VariantMaskBitmaskImpl) { + BigInteger mask = ((VariantMaskBitmaskImpl) variantMask).getBitmask(); + for (int x = 2; x < mask.bitLength() - 2; x++) { + if (mask.testBit(mask.bitLength() - 1 - x)) { + idList += sampleIds[x - 2] + ","; + } + } + } else if (variantMask instanceof VariantMaskSparseImpl) { + for (Integer patientIndex : ((VariantMaskSparseImpl) variantMask).getPatientIndexes()) { + idList += sampleIds[patientIndex] + ","; } } } @@ -276,7 +281,7 @@ private static void flipChunk(String lastContigProcessed, int lastChunkProcessed } if (!currentContig.contentEquals(lastContigProcessed) || currentChunk > lastChunkProcessed || isLastChunk) { // flip chunk - TreeMap>> variantMaskStorage_f = variantMaskStorage; + TreeMap>> variantMaskStorage_f = variantMaskStorage; HashMap zygosityMaskStrings_f = zygosityMaskStrings; String lastContigProcessed_f = lastContigProcessed; int lastChunkProcessed_f = lastChunkProcessed; @@ -306,10 +311,10 @@ private static void flipChunk(String lastContigProcessed, int lastChunkProcessed } private static void saveVariantStore(VariantStore store, - TreeMap>> variantMaskStorage) + TreeMap>> variantMaskStorage) throws IOException, FileNotFoundException { store.setVariantMaskStorage(variantMaskStorage); - for (FileBackedByteIndexedStorage> storage : variantMaskStorage + for (FileBackedByteIndexedStorage> storage : variantMaskStorage .values()) { if (storage != null) storage.complete(); @@ -361,11 +366,11 @@ private static void shutdownChunkWriteExecutor() { } } - private static ConcurrentHashMap convertLoadingMapToMaskMap( + private static ConcurrentHashMap convertLoadingMapToMaskMap( HashMap zygosityMaskStrings_f) { - ConcurrentHashMap maskMap = new ConcurrentHashMap<>(zygosityMaskStrings_f.size()); + ConcurrentHashMap maskMap = new ConcurrentHashMap<>(zygosityMaskStrings_f.size()); zygosityMaskStrings_f.entrySet().parallelStream().forEach((entry) -> { - maskMap.put(entry.getKey(), new VariantMasks(entry.getValue())); + maskMap.put(entry.getKey(), new VariableVariantMasks(entry.getValue())); }); return maskMap; } From 64905933014754d7378266be7693921d660faba0 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 21 Feb 2024 16:25:31 -0500 Subject: [PATCH 05/35] ALS-5819: Update all references to use new VariantMask --- .../data/genotype/VariableVariantMasks.java | 34 ++++++-- .../hpds/data/genotype/VariantMask.java | 9 ++ .../data/genotype/VariantMaskBitmaskImpl.java | 10 +++ .../data/genotype/VariantMaskSparseImpl.java | 10 +++ .../genotype/GenomicDatasetMergerRunner.java | 3 +- .../genomic/GenomicProcessorController.java | 3 +- .../hpds/processing/AbstractProcessor.java | 6 +- .../hpds/processing/GenomicProcessor.java | 10 ++- .../hpds/processing/GenomicProcessorNoOp.java | 10 ++- .../processing/GenomicProcessorNodeImpl.java | 84 +++++++++---------- .../GenomicProcessorParentImpl.java | 27 ++---- ...omicProcessorPatientMergingParentImpl.java | 61 ++++++++------ .../processing/PatientVariantJoinHandler.java | 32 +++---- .../hpds/processing/QueryProcessor.java | 5 +- .../hpds/processing/VariantListProcessor.java | 41 ++++----- .../hpds/processing/VariantService.java | 3 +- .../genomic/GenomicProcessorRestClient.java | 14 ++-- .../processing/AbstractProcessorTest.java | 5 +- .../GenomicProcessorParentImplTest.java | 12 +-- ...ProcessorPatientMergingParentImplTest.java | 32 +++---- .../PatientVariantJoinHandlerTest.java | 65 +++++++------- .../GenomicProcessorRestClientTest.java | 3 +- 22 files changed, 259 insertions(+), 220 deletions(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java index 69986c20..5d8efd3d 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java @@ -6,6 +6,7 @@ import java.math.BigInteger; import java.util.*; import java.util.concurrent.ConcurrentHashMap; +import java.util.stream.Collectors; public class VariableVariantMasks implements Serializable { private static final long serialVersionUID = 6225420483804601477L; @@ -133,7 +134,7 @@ public int hashCode() { return Objects.hash(homozygousMask, heterozygousMask, homozygousNoCallMask, heterozygousNoCallMask); } - public BigInteger emptyBitmask(int length) { + public static BigInteger emptyBitmask(int length) { BigInteger emptyBitmask = emptyBitmaskMap.get(length); if (emptyBitmask == null) { String emptyVariantMask = ""; @@ -178,7 +179,7 @@ public VariableVariantMasks append(VariableVariantMasks variantMasks) { return appendedMasks; } - private VariantMask appendMask(VariantMask variantMask1, VariantMask variantMask2, int length1, int length2) { + public static VariantMask appendMask(VariantMask variantMask1, VariantMask variantMask2, int length1, int length2) { if (variantMask1 instanceof VariantMaskSparseImpl) { if (variantMask2 instanceof VariantMaskSparseImpl) { return append((VariantMaskSparseImpl) variantMask1, (VariantMaskSparseImpl) variantMask2, length1, length2); @@ -194,18 +195,18 @@ private VariantMask appendMask(VariantMask variantMask1, VariantMask variantMask } } - private VariantMask append(VariantMaskSparseImpl variantMask1, VariantMaskBitmaskImpl variantMask2, int length1, int length2) { + private static VariantMask append(VariantMaskSparseImpl variantMask1, VariantMaskBitmaskImpl variantMask2, int length1, int length2) { BigInteger mask1 = emptyBitmask(length1); for (Integer patientId : variantMask1.patientIndexes) { mask1 = mask1.setBit(patientId); } String binaryMask1 = mask1.toString(2); String binaryMask2 = variantMask2.bitmask.toString(2); - String appendedString = binaryMask1.substring(0, binaryMask1.length() - 2) + - binaryMask2.substring(2); + String appendedString = binaryMask2.substring(0, binaryMask1.length() - 2) + + binaryMask1.substring(2); return new VariantMaskBitmaskImpl(new BigInteger(appendedString, 2)); } - private VariantMask append(VariantMaskSparseImpl variantMask1, VariantMaskSparseImpl variantMask2, int length1, int length2) { + private static VariantMask append(VariantMaskSparseImpl variantMask1, VariantMaskSparseImpl variantMask2, int length1, int length2) { if (variantMask1.patientIndexes.size() + variantMask2.patientIndexes.size() > SPARSE_VARIANT_THRESHOLD) { // todo: performance test this vs byte array BigInteger mask = emptyBitmask(length1 + length2); @@ -226,6 +227,27 @@ private VariantMask append(VariantMaskSparseImpl variantMask1, VariantMaskSparse } } + public static Set patientMaskToPatientIdSet(VariantMask patientMask, List patientIds) { + if (patientMask instanceof VariantMaskBitmaskImpl) { + Set ids = new HashSet<>(); + String bitmaskString = ((VariantMaskBitmaskImpl) patientMask).getBitmask().toString(2); + for(int x = 2;x < bitmaskString.length()-2;x++) { + if('1'==bitmaskString.charAt(x)) { + String patientId = patientIds.get(x-2).trim(); + ids.add(Integer.parseInt(patientId)); + } + } + return ids; + } else if (patientMask instanceof VariantMaskSparseImpl) { + return ((VariantMaskSparseImpl) patientMask).getPatientIndexes().stream() + .map(patientIds::get) + .map(String::trim) + .map(Integer::parseInt) + .collect(Collectors.toSet()); + } + throw new IllegalArgumentException("Unknown VariantMask implementation"); + } + /* if (mask1 == null) { mask1 = variantStore1.emptyBitmask(); } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java index ed828d67..46b24e6a 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java @@ -4,6 +4,7 @@ import com.fasterxml.jackson.annotation.JsonTypeInfo; import java.math.BigInteger; +import java.util.Set; @JsonTypeInfo( use = JsonTypeInfo.Id.NAME, @@ -21,6 +22,14 @@ public interface VariantMask { VariantMask union(VariantMask variantMask); + boolean testBit(int bit); + + int bitCount(); + + static VariantMask emptyInstance() { + return new VariantMaskSparseImpl(Set.of()); + } + static VariantMask union(VariantMaskSparseImpl variantMaskSparse, VariantMaskBitmaskImpl variantMaskBitmask) { BigInteger union = variantMaskBitmask.bitmask; for (Integer patientId : variantMaskSparse.patientIndexes) { diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java index 758b2728..532731bc 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java @@ -40,6 +40,16 @@ public VariantMask union(VariantMask variantMask) { } } + @Override + public boolean testBit(int bit) { + return bitmask.testBit(bit); + } + + @Override + public int bitCount() { + return bitmask.bitCount(); + } + private VariantMask union(VariantMaskBitmaskImpl variantMaskBitmask) { return new VariantMaskBitmaskImpl(variantMaskBitmask.bitmask.and(this.bitmask)); } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java index 8f0af1af..215f5088 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java @@ -34,6 +34,16 @@ public VariantMask union(VariantMask variantMask) { } } + @Override + public boolean testBit(int bit) { + return patientIndexes.contains(bit); + } + + @Override + public int bitCount() { + return patientIndexes.size(); + } + private VariantMask union(VariantMaskSparseImpl variantMask) { HashSet union = new HashSet<>(variantMask.patientIndexes); union.addAll(this.patientIndexes); diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java index 70565730..3559790e 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java @@ -2,6 +2,7 @@ import com.google.common.base.Preconditions; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.FileBackedByteIndexedInfoStore; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariableVariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantStore; import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJsonIndexStorage; @@ -46,7 +47,7 @@ public static void main(String[] args) throws IOException, ClassNotFoundExceptio GenomicDatasetMerger genomicDatasetMerger = new GenomicDatasetMerger(VariantStore.readInstance(genomicDirectory1),VariantStore.readInstance(genomicDirectory2), infoStores1, infoStores2, outputDirectory); - Map>> mergedChromosomeMasks = genomicDatasetMerger.mergeChromosomeMasks(); + Map>> mergedChromosomeMasks = genomicDatasetMerger.mergeChromosomeMasks(); VariantStore mergedVariantStore = genomicDatasetMerger.mergeVariantStore(mergedChromosomeMasks); Map variantIndexes = genomicDatasetMerger.mergeVariantIndexes(); diff --git a/genomic-processor/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/genomic/GenomicProcessorController.java b/genomic-processor/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/genomic/GenomicProcessorController.java index ca08b584..f6033fc7 100644 --- a/genomic-processor/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/genomic/GenomicProcessorController.java +++ b/genomic-processor/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/genomic/GenomicProcessorController.java @@ -1,6 +1,7 @@ package edu.harvard.hms.dbmi.avillach.hpds.genomic; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.InfoColumnMeta; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMask; import edu.harvard.hms.dbmi.avillach.hpds.processing.DistributableQuery; import edu.harvard.hms.dbmi.avillach.hpds.processing.GenomicProcessor; import org.springframework.beans.factory.annotation.Autowired; @@ -24,7 +25,7 @@ public GenomicProcessorController(GenomicProcessor genomicProcessor) { } @PostMapping("/patients") - public Mono queryForPatientMask(@RequestBody DistributableQuery distributableQuery) { + public Mono queryForPatientMask(@RequestBody DistributableQuery distributableQuery) { return genomicProcessor.getPatientMask(distributableQuery); } diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessor.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessor.java index 659aea5a..21bb70ea 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessor.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessor.java @@ -133,7 +133,7 @@ protected Set idSetsForEachFilter(Query query) { // NULL (representing no phenotypic filters, i.e. all patients) or not empty patient ID sets require a genomic query. // Otherwise, short circuit and return no patients if ((distributableQuery.getPatientIds() == null || !distributableQuery.getPatientIds().isEmpty()) && distributableQuery.hasFilters()) { - Mono patientMaskForVariantInfoFilters = genomicProcessor.getPatientMask(distributableQuery); + Mono patientMaskForVariantInfoFilters = genomicProcessor.getPatientMask(distributableQuery); return patientMaskForVariantInfoFilters.map(genomicProcessor::patientMaskToPatientIdSet).block(); } return distributableQuery.getPatientIds(); @@ -360,12 +360,12 @@ public List getPatientIds() { return genomicProcessor.getPatientIds(); } - public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { + public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { return genomicProcessor.getMasks(path, variantMasksVariantBucketHolder); } // todo: handle this locally, we do not want this in the genomic processor - protected BigInteger createMaskForPatientSet(Set patientSubset) { + protected VariantMask createMaskForPatientSet(Set patientSubset) { return genomicProcessor.createMaskForPatientSet(patientSubset); } } diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessor.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessor.java index 754f5ab0..8759338b 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessor.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessor.java @@ -1,6 +1,8 @@ package edu.harvard.hms.dbmi.avillach.hpds.processing; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.InfoColumnMeta; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariableVariantMasks; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMask; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; import reactor.core.publisher.Mono; @@ -12,17 +14,17 @@ import java.util.Set; public interface GenomicProcessor { - Mono getPatientMask(DistributableQuery distributableQuery); + Mono getPatientMask(DistributableQuery distributableQuery); - Set patientMaskToPatientIdSet(BigInteger patientMask); + Set patientMaskToPatientIdSet(VariantMask patientMask); - BigInteger createMaskForPatientSet(Set patientSubset); + VariantMask createMaskForPatientSet(Set patientSubset); Mono> getVariantList(DistributableQuery distributableQuery); List getPatientIds(); - Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder); + Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder); Set getInfoStoreColumns(); diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNoOp.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNoOp.java index 4dce5b37..11013ffd 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNoOp.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNoOp.java @@ -1,6 +1,8 @@ package edu.harvard.hms.dbmi.avillach.hpds.processing; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.InfoColumnMeta; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariableVariantMasks; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMask; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; import reactor.core.publisher.Mono; @@ -13,17 +15,17 @@ public class GenomicProcessorNoOp implements GenomicProcessor { @Override - public Mono getPatientMask(DistributableQuery distributableQuery) { + public Mono getPatientMask(DistributableQuery distributableQuery) { return null; } @Override - public Set patientMaskToPatientIdSet(BigInteger patientMask) { + public Set patientMaskToPatientIdSet(VariantMask patientMask) { return null; } @Override - public BigInteger createMaskForPatientSet(Set patientSubset) { + public VariantMask createMaskForPatientSet(Set patientSubset) { return null; } @@ -38,7 +40,7 @@ public List getPatientIds() { } @Override - public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { + public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { return Optional.empty(); } diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNodeImpl.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNodeImpl.java index 9e1c6d7f..975e993b 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNodeImpl.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNodeImpl.java @@ -3,9 +3,7 @@ import com.google.common.base.Joiner; import com.google.common.collect.Range; import com.google.common.collect.Sets; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.FileBackedByteIndexedInfoStore; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.InfoColumnMeta; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.*; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; import edu.harvard.hms.dbmi.avillach.hpds.data.query.Filter; import edu.harvard.hms.dbmi.avillach.hpds.data.query.Query; @@ -77,10 +75,10 @@ public GenomicProcessorNodeImpl(String genomicDataDirectory) { } @Override - public Mono getPatientMask(DistributableQuery distributableQuery) { + public Mono getPatientMask(DistributableQuery distributableQuery) { return Mono.fromCallable(() -> runGetPatientMask(distributableQuery)).subscribeOn(Schedulers.boundedElastic()); } - public BigInteger runGetPatientMask(DistributableQuery distributableQuery) { + public VariantMask runGetPatientMask(DistributableQuery distributableQuery) { // log.debug("filterdIDSets START size: " + filteredIdSets.size()); /* VARIANT INFO FILTER HANDLING IS MESSY */ if(distributableQuery.hasFilters()) { @@ -103,7 +101,7 @@ public BigInteger runGetPatientMask(DistributableQuery distributableQuery) { // todo: handle empty getVariantInfoFilters() // add filteredIdSet for patients who have matching variants, heterozygous or homozygous for now. - BigInteger patientMask = null; + VariantMask patientMask = null; if (intersectionOfInfoFilters != null ){ patientMask = patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(distributableQuery.getPatientIds(), intersectionOfInfoFilters); } else { @@ -111,24 +109,24 @@ public BigInteger runGetPatientMask(DistributableQuery distributableQuery) { } - VariantBucketHolder variantMasksVariantBucketHolder = new VariantBucketHolder<>(); + VariantBucketHolder variantMasksVariantBucketHolder = new VariantBucketHolder<>(); if (!distributableQuery.getRequiredFields().isEmpty() ) { for (String variantSpec : distributableQuery.getRequiredFields()) { - BigInteger patientsForVariantSpec = getIdSetForVariantSpecCategoryFilter(new String[]{"0/1", "1/1"}, variantSpec, variantMasksVariantBucketHolder); + VariantMask patientsForVariantSpec = getIdSetForVariantSpecCategoryFilter(new String[]{"0/1", "1/1"}, variantSpec, variantMasksVariantBucketHolder); if (patientMask == null) { patientMask = patientsForVariantSpec; } else { - patientMask = patientMask.and(patientsForVariantSpec); + patientMask = patientMask.intersection(patientsForVariantSpec); } } } if (!distributableQuery.getCategoryFilters().isEmpty()) { for (Map.Entry categoryFilterEntry : distributableQuery.getCategoryFilters().entrySet()) { - BigInteger patientsForVariantSpec = getIdSetForVariantSpecCategoryFilter(categoryFilterEntry.getValue(), categoryFilterEntry.getKey(), null); + VariantMask patientsForVariantSpec = getIdSetForVariantSpecCategoryFilter(categoryFilterEntry.getValue(), categoryFilterEntry.getKey(), null); if (patientMask == null) { patientMask = patientsForVariantSpec; } else { - patientMask = patientMask.and(patientsForVariantSpec); + patientMask = patientMask.intersection(patientsForVariantSpec); } } } @@ -140,16 +138,8 @@ public BigInteger runGetPatientMask(DistributableQuery distributableQuery) { } @Override - public Set patientMaskToPatientIdSet(BigInteger patientMask) { - Set ids = new HashSet<>(); - String bitmaskString = patientMask.toString(2); - for(int x = 2;x < bitmaskString.length()-2;x++) { - if('1'==bitmaskString.charAt(x)) { - String patientId = variantService.getPatientIds()[x-2].trim(); - ids.add(Integer.parseInt(patientId)); - } - } - return ids; + public Set patientMaskToPatientIdSet(VariantMask patientMask) { + return VariableVariantMasks.patientMaskToPatientIdSet(patientMask, getPatientIds()); } private List getVariantsMatchingFilters(Query.VariantInfoFilter filter) { @@ -212,8 +202,8 @@ private List filterInfoCategoryKeys(String[] values, FileBackedByteIndex } @Override - public BigInteger createMaskForPatientSet(Set patientSubset) { - return patientVariantJoinHandler.createMaskForPatientSet(patientSubset); + public VariantMask createMaskForPatientSet(Set patientSubset) { + return new VariantMaskBitmaskImpl(patientVariantJoinHandler.createMaskForPatientSet(patientSubset)); } private FileBackedByteIndexedInfoStore getInfoStore(String column) { @@ -265,24 +255,25 @@ public Collection runGetVariantList(DistributableQuery query) { return unionOfInfoFilters.mapToVariantSpec(variantService.getVariantIndex()); } - BigInteger patientMasks = createMaskForPatientSet(patientSubset); + VariantMask patientMasks = createMaskForPatientSet(patientSubset); Set unionOfInfoFiltersVariantSpecs = unionOfInfoFilters.mapToVariantSpec(variantService.getVariantIndex()); Collection variantsInScope = variantService.filterVariantSetForPatientSet(unionOfInfoFiltersVariantSpecs, new ArrayList<>(patientSubset)); //NC - this is the original variant filtering, which checks the patient mask from each variant against the patient mask from the query if(variantsInScope.size()<100000) { - ConcurrentSkipListSet variantsWithPatients = new ConcurrentSkipListSet(); + ConcurrentSkipListSet variantsWithPatients = new ConcurrentSkipListSet<>(); variantsInScope.parallelStream().forEach(variantKey -> { variantService.getMasks(variantKey, new VariantBucketHolder<>()).ifPresent(masks -> { - if ( masks.heterozygousMask != null && masks.heterozygousMask.and(patientMasks).bitCount()>4) { + // todo: implement for variant explorer + /*if ( masks.heterozygousMask != null && masks.heterozygousMask.intersection(patientMasks).bitCount()>4) { variantsWithPatients.add(variantKey); - } else if ( masks.homozygousMask != null && masks.homozygousMask.and(patientMasks).bitCount()>4) { + } else if ( masks.homozygousMask != null && masks.homozygousMask.intersection(patientMasks).bitCount()>4) { variantsWithPatients.add(variantKey); - } else if ( masks.heterozygousNoCallMask != null && masks.heterozygousNoCallMask.and(patientMasks).bitCount()>4) { + } else if ( masks.heterozygousNoCallMask != null && masks.heterozygousNoCallMask.intersection(patientMasks).bitCount()>4) { //so heterozygous no calls we want, homozygous no calls we don't variantsWithPatients.add(variantKey); - } + }*/ }); }); return variantsWithPatients; @@ -293,34 +284,35 @@ public Collection runGetVariantList(DistributableQuery query) { return new ArrayList<>(); } - private BigInteger getIdSetForVariantSpecCategoryFilter(String[] zygosities, String key, VariantBucketHolder bucketCache) { - List variantBitmasks = getBitmasksForVariantSpecCategoryFilter(zygosities, key, bucketCache); + private VariantMask getIdSetForVariantSpecCategoryFilter(String[] zygosities, String key, VariantBucketHolder bucketCache) { + List variantBitmasks = getBitmasksForVariantSpecCategoryFilter(zygosities, key, bucketCache); Set patientIds = new HashSet<>(); if(!variantBitmasks.isEmpty()) { - BigInteger bitmask = variantBitmasks.get(0); + VariantMask variantMask = variantBitmasks.get(0); if(variantBitmasks.size()>1) { for(int x = 1;x()); + return VariantMask.emptyInstance(); } - private ArrayList getBitmasksForVariantSpecCategoryFilter(String[] zygosities, String variantName, VariantBucketHolder bucketCache) { - ArrayList variantBitmasks = new ArrayList<>(); + private ArrayList getBitmasksForVariantSpecCategoryFilter(String[] zygosities, String variantName, VariantBucketHolder bucketCache) { + ArrayList variantBitmasks = new ArrayList<>(); variantName = variantName.replaceAll(",\\d/\\d$", ""); log.debug("looking up mask for : " + variantName); - Optional optionalMasks = variantService.getMasks(variantName, bucketCache); + Optional optionalMasks = variantService.getMasks(variantName, bucketCache); Arrays.stream(zygosities).forEach((zygosity) -> { optionalMasks.ifPresent(masks -> { if(zygosity.equals(HOMOZYGOUS_REFERENCE)) { - BigInteger homozygousReferenceBitmask = calculateIndiscriminateBitmask(masks); + // todo: implement for VariantMask. I don't think this logic was sound previously + /*VariantMask homozygousReferenceBitmask = calculateIndiscriminateBitmask(masks); for(int x = 2;x getBitmasksForVariantSpecCategoryFilter(String[] z } }); if (optionalMasks.isEmpty()) { - variantBitmasks.add(variantService.emptyBitmask()); + variantBitmasks.add(new VariantMaskBitmaskImpl(variantService.emptyBitmask())); } }); @@ -342,16 +334,16 @@ private ArrayList getBitmasksForVariantSpecCategoryFilter(String[] z * @param masks * @return */ - private BigInteger calculateIndiscriminateBitmask(VariantMasks masks) { - BigInteger indiscriminateVariantBitmask = null; + private VariantMask calculateIndiscriminateBitmask(VariableVariantMasks masks) { + VariantMask indiscriminateVariantBitmask = null; if(masks.heterozygousMask == null && masks.homozygousMask != null) { indiscriminateVariantBitmask = masks.homozygousMask; }else if(masks.homozygousMask == null && masks.heterozygousMask != null) { indiscriminateVariantBitmask = masks.heterozygousMask; }else if(masks.homozygousMask != null && masks.heterozygousMask != null) { - indiscriminateVariantBitmask = masks.heterozygousMask.or(masks.homozygousMask); + indiscriminateVariantBitmask = masks.heterozygousMask.intersection(masks.homozygousMask); }else { - indiscriminateVariantBitmask = variantService.emptyBitmask(); + indiscriminateVariantBitmask = new VariantMaskBitmaskImpl(variantService.emptyBitmask()); } return indiscriminateVariantBitmask; } @@ -362,7 +354,7 @@ public List getPatientIds() { } @Override - public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { + public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { return variantService.getMasks(path, variantMasksVariantBucketHolder); } diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImpl.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImpl.java index 0806dff8..28226df7 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImpl.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImpl.java @@ -3,8 +3,7 @@ import com.google.common.cache.CacheBuilder; import com.google.common.cache.CacheLoader; import com.google.common.cache.LoadingCache; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.InfoColumnMeta; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.*; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -48,28 +47,20 @@ public GenomicProcessorParentImpl(List nodes) { } @Override - public Mono getPatientMask(DistributableQuery distributableQuery) { - Mono result = Flux.just(nodes.toArray(GenomicProcessor[]::new)) + public Mono getPatientMask(DistributableQuery distributableQuery) { + Mono result = Flux.just(nodes.toArray(GenomicProcessor[]::new)) .flatMap(node -> node.getPatientMask(distributableQuery)) - .reduce(BigInteger::or); + .reduce(VariantMask::union); return result; } @Override - public Set patientMaskToPatientIdSet(BigInteger patientMask) { - Set ids = new HashSet<>(); - String bitmaskString = patientMask.toString(2); - for(int x = 2;x < bitmaskString.length()-2;x++) { - if('1'==bitmaskString.charAt(x)) { - String patientId = getPatientIds().get(x-2).trim(); - ids.add(Integer.parseInt(patientId)); - } - } - return ids; + public Set patientMaskToPatientIdSet(VariantMask patientMask) { + return VariableVariantMasks.patientMaskToPatientIdSet(patientMask, getPatientIds()); } @Override - public BigInteger createMaskForPatientSet(Set patientSubset) { + public VariantMask createMaskForPatientSet(Set patientSubset) { throw new RuntimeException("Not implemented"); } @@ -111,10 +102,10 @@ private List initializePatientIds() { } @Override - public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { + public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { // TODO: test. only used in variant explorer for (GenomicProcessor node : nodes) { - Optional masks = node.getMasks(path, variantMasksVariantBucketHolder); + Optional masks = node.getMasks(path, variantMasksVariantBucketHolder); if (masks.isPresent()) { return masks; } diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImpl.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImpl.java index 35e80ceb..c13d7ef8 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImpl.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImpl.java @@ -4,8 +4,7 @@ import com.google.common.cache.CacheLoader; import com.google.common.cache.LoadingCache; import com.google.common.collect.ImmutableList; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.InfoColumnMeta; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.*; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,38 +45,50 @@ public GenomicProcessorPatientMergingParentImpl(List nodes) { infoColumnsMeta = initInfoColumnsMeta(); } + private class SizedVariantMask { + private final VariantMask variantMask; + private final int size; + + public VariantMask getVariantMask() { + return variantMask; + } + + public int getSize() { + return size; + } + + public SizedVariantMask(VariantMask variantMask, int size) { + this.variantMask = variantMask; + this.size = size; + } + } + @Override - public Mono getPatientMask(DistributableQuery distributableQuery) { - Mono result = Flux.just(nodes.toArray(GenomicProcessor[]::new)) - .flatMapSequential(node -> node.getPatientMask(distributableQuery)) - .reduce(this::appendMask); + public Mono getPatientMask(DistributableQuery distributableQuery) { + Mono result = Flux.just(nodes.toArray(GenomicProcessor[]::new)) + .flatMapSequential(node -> { + return node.getPatientMask(distributableQuery).map(mask -> { + return new SizedVariantMask(mask, node.getPatientIds().size()); + }); + }) + .reduce(this::appendMask) + .map(SizedVariantMask::getVariantMask); return result; } - public BigInteger appendMask(BigInteger mask1, BigInteger mask2) { - String binaryMask1 = mask1.toString(2); - String binaryMask2 = mask2.toString(2); - String appendedString = binaryMask1.substring(0, binaryMask1.length() - 2) + - binaryMask2.substring(2); - return new BigInteger(appendedString, 2); + /** A little bit of a hack for now since the masks don't have sizes at this point and they are needed to merge + */ + public SizedVariantMask appendMask(SizedVariantMask mask1, SizedVariantMask mask2) { + return new SizedVariantMask(VariableVariantMasks.appendMask(mask1.variantMask, mask2.variantMask, mask1.size, mask2.size), mask1.size + mask2.size); } @Override - public Set patientMaskToPatientIdSet(BigInteger patientMask) { - Set ids = new HashSet<>(); - String bitmaskString = patientMask.toString(2); - List patientIds = getPatientIds(); - for(int x = 2;x < bitmaskString.length()-2;x++) { - if('1'==bitmaskString.charAt(x)) { - String patientId = patientIds.get(x-2).trim(); - ids.add(Integer.parseInt(patientId)); - } - } - return ids; + public Set patientMaskToPatientIdSet(VariantMask patientMask) { + return VariableVariantMasks.patientMaskToPatientIdSet(patientMask, getPatientIds()); } @Override - public BigInteger createMaskForPatientSet(Set patientSubset) { + public VariantMask createMaskForPatientSet(Set patientSubset) { throw new RuntimeException("Not implemented"); } @@ -116,7 +127,7 @@ private List initializePatientIds() { } @Override - public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { + public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { // TODO: implement this. only used in variant explorer throw new RuntimeException("Method not implemented"); } diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java index dac17a4a..01d11150 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java @@ -3,8 +3,7 @@ import com.google.common.base.Joiner; import com.google.common.collect.Lists; import com.google.common.collect.Sets; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantSpec; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.*; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -25,7 +24,7 @@ public PatientVariantJoinHandler(VariantService variantService) { this.variantService = variantService; } - public BigInteger getPatientIdsForIntersectionOfVariantSets(Set patientSubset, + public VariantMask getPatientIdsForIntersectionOfVariantSets(Set patientSubset, VariantIndex intersectionOfInfoFilters) { if(!intersectionOfInfoFilters.isEmpty()) { @@ -45,10 +44,10 @@ public BigInteger getPatientIdsForIntersectionOfVariantSets(Set patient // If genomic data is sharded by studies, it may be possible that the node this is running in does not have genomic // data for any of the patients in the phenotypic query. In which case, we don't have to look for matching patients if (patientsInScope.isEmpty()) { - return variantService.emptyBitmask(); + return new VariantMaskSparseImpl(Set.of()); } - BigInteger[] matchingPatients = new BigInteger[] {variantService.emptyBitmask()}; + VariantMask[] matchingPatients = new VariantMask[] {new VariantMaskSparseImpl(Set.of())}; Set variantsInScope = intersectionOfInfoFilters.mapToVariantSpec(variantService.getVariantIndex()); @@ -71,23 +70,23 @@ public BigInteger getPatientIdsForIntersectionOfVariantSets(Set patient log.info("and partitioned those into " + variantBucketPartitions.size() + " groups"); int patientsInScopeSize = patientsInScope.size(); - BigInteger patientsInScopeMask = createMaskForPatientSet(patientsInScope); + VariantMask patientsInScopeMask = new VariantMaskBitmaskImpl(createMaskForPatientSet(patientsInScope)); for(int x = 0; - x < variantBucketPartitions.size() && matchingPatients[0].bitCount() < patientsInScopeSize + 4; + x < variantBucketPartitions.size() /*&& matchingPatients[0].bitCount() < patientsInScopeSize + 4*/; x++) { List> variantBuckets = variantBucketPartitions.get(x); variantBuckets.parallelStream().forEach(variantBucket -> { - VariantBucketHolder bucketCache = new VariantBucketHolder<>(); + VariantBucketHolder bucketCache = new VariantBucketHolder<>(); List missingVariants = new ArrayList<>(); variantBucket.forEach(variantSpec -> { - Optional variantMask = variantService.getMasks(variantSpec, bucketCache); + Optional variantMask = variantService.getMasks(variantSpec, bucketCache); variantMask.ifPresentOrElse(masks -> { - BigInteger heteroMask = masks.heterozygousMask == null ? variantService.emptyBitmask() : masks.heterozygousMask; - BigInteger homoMask = masks.homozygousMask == null ? variantService.emptyBitmask() : masks.homozygousMask; - BigInteger orMasks = heteroMask.or(homoMask); - log.info("Patients with variant " + variantSpec + ": " + (orMasks.bitCount() - 4)); + VariantMask heteroMask = masks.heterozygousMask; + VariantMask homoMask = masks.homozygousMask; + VariantMask orMasks = heteroMask.union(homoMask); + //log.info("Patients with variant " + variantSpec + ": " + (orMasks.bitCount() - 4)); synchronized(matchingPatients) { - matchingPatients[0] = matchingPatients[0].or(orMasks); + matchingPatients[0] = matchingPatients[0].union(orMasks); } }, () -> missingVariants.add(variantSpec)); }); @@ -97,13 +96,14 @@ public BigInteger getPatientIdsForIntersectionOfVariantSets(Set patient } }); } - return matchingPatients[0].and(patientsInScopeMask); + return matchingPatients[0].intersection(patientsInScopeMask); }else { log.error("No matches found for info filters."); - return createMaskForPatientSet(new HashSet<>()); + return new VariantMaskSparseImpl(Set.of()); } } + // todo: return VariantMask public BigInteger createMaskForPatientSet(Set patientSubset) { StringBuilder builder = new StringBuilder("11"); //variant bitmasks are bookended with '11' for(String patientId : variantService.getPatientIds()) { diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/QueryProcessor.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/QueryProcessor.java index 18a30531..08d83bc6 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/QueryProcessor.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/QueryProcessor.java @@ -4,6 +4,7 @@ import java.util.*; import java.util.stream.Collectors; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariableVariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.ColumnMeta; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -106,7 +107,7 @@ private void processColumn(List paths, TreeSet ids, ResultStore String path = paths.get(x-1); if(VariantUtils.pathIsVariantSpec(path)) { // todo: confirm this entire if block is even used. I don't think it is - Optional masks = abstractProcessor.getMasks(path, new VariantBucketHolder<>()); + Optional masks = abstractProcessor.getMasks(path, new VariantBucketHolder<>()); List patientIds = abstractProcessor.getPatientIds(); int idPointer = 0; @@ -162,7 +163,7 @@ private void writeVariantNullResultField(ResultStore results, Integer x, ByteBuf results.writeField(x,idInSubsetPointer, valueBuffer); } - private int writeVariantResultField(ResultStore results, Integer x, Optional variantMasks, int idPointer, + private int writeVariantResultField(ResultStore results, Integer x, Optional variantMasks, int idPointer, int idInSubsetPointer) { byte[] valueBuffer = variantMasks.map(masks -> { if(masks.heterozygousMask != null && masks.heterozygousMask.testBit(idPointer)) { diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantListProcessor.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantListProcessor.java index 925a2394..7689d750 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantListProcessor.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantListProcessor.java @@ -6,12 +6,10 @@ import java.util.*; import java.util.stream.Collectors; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMetadataIndex; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantSpec; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.PhenoCube; import edu.harvard.hms.dbmi.avillach.hpds.data.query.Query; @@ -185,7 +183,7 @@ public String runVcfExcerptQuery(Query query, boolean includePatientData) throws Set patientSubset = abstractProcessor.getPatientSubsetForQuery(query); log.debug("identified " + patientSubset.size() + " patients from query"); Map patientIndexMap = new LinkedHashMap(); //keep a map for quick index lookups - BigInteger patientMasks = abstractProcessor.createMaskForPatientSet(patientSubset); + VariantMask patientMasks = abstractProcessor.createMaskForPatientSet(patientSubset); int index = 2; //variant bitmasks are bookended with '11' @@ -215,7 +213,7 @@ public String runVcfExcerptQuery(Query query, boolean includePatientData) throws } //End of headers builder.append("\n"); - VariantBucketHolder variantMaskBucketHolder = new VariantBucketHolder(); + VariantBucketHolder variantMaskBucketHolder = new VariantBucketHolder(); //loop over the variants identified, and build an output row metadata.forEach((String variantSpec, String[] variantMetadata)->{ @@ -262,33 +260,22 @@ public String runVcfExcerptQuery(Query query, boolean includePatientData) throws } // todo: deal with empty return - VariantMasks masks = abstractProcessor.getMasks(variantSpec, variantMaskBucketHolder).get(); + VariableVariantMasks masks = abstractProcessor.getMasks(variantSpec, variantMaskBucketHolder).get(); //make strings of 000100 so we can just check 'char at' //so heterozygous no calls we want, homozygous no calls we don't - BigInteger heteroMask = masks.heterozygousMask != null? masks.heterozygousMask : masks.heterozygousNoCallMask != null ? masks.heterozygousNoCallMask : null; - BigInteger homoMask = masks.homozygousMask != null? masks.homozygousMask : null; - - - String heteroMaskString = heteroMask != null ? heteroMask.toString(2) : null; - String homoMaskString = homoMask != null ? homoMask.toString(2) : null; + VariantMask heteroMask = masks.heterozygousMask != null? masks.heterozygousMask : masks.heterozygousNoCallMask != null ? masks.heterozygousNoCallMask : null; + VariantMask homoMask = masks.homozygousMask != null? masks.homozygousMask : null; // Patient count = (hetero mask | homo mask) & patient mask - BigInteger heteroOrHomoMask = orNullableMasks(heteroMask, homoMask); - int patientCount = heteroOrHomoMask == null ? 0 : (heteroOrHomoMask.and(patientMasks).bitCount() - 4); + VariantMask heteroOrHomoMask = orNullableMasks(heteroMask, homoMask); + int patientCount = heteroOrHomoMask == null ? 0 : (heteroOrHomoMask.intersection(patientMasks).bitCount() - 4); int bitCount = masks.heterozygousMask == null? 0 : (masks.heterozygousMask.bitCount() - 4); bitCount += masks.homozygousMask == null? 0 : (masks.homozygousMask.bitCount() - 4); //count how many patients have genomic data available - Integer patientsWithVariantsCount = null; - if(heteroMaskString != null) { - patientsWithVariantsCount = heteroMaskString.length() - 4; - } else if (homoMaskString != null ) { - patientsWithVariantsCount = homoMaskString.length() - 4; - } else { - patientsWithVariantsCount = -1; - } + Integer patientsWithVariantsCount = patientMasks.bitCount() - 4; // (patients with/total) in subset \t (patients with/total) out of subset. @@ -299,15 +286,15 @@ public String runVcfExcerptQuery(Query query, boolean includePatientData) throws StringBuilder patientListBuilder = new StringBuilder(); for(Integer patientIndex : patientIndexMap.values()) { - if(heteroMaskString != null && '1' == heteroMaskString.charAt(patientIndex)) { + if(heteroMask != null && heteroMask.testBit(patientIndex)) { patientListBuilder.append("\t0/1"); - }else if(homoMaskString != null && '1' == homoMaskString.charAt(patientIndex)) { + }else if(homoMask != null && homoMask.testBit(patientIndex)) { patientListBuilder.append("\t1/1"); }else { patientListBuilder.append("\t0/0"); } } - builder.append(patientListBuilder.toString()); + builder.append(patientListBuilder); } builder.append("\n"); @@ -317,10 +304,10 @@ public String runVcfExcerptQuery(Query query, boolean includePatientData) throws return builder.toString(); } - private BigInteger orNullableMasks(BigInteger heteroMask, BigInteger homoMask) { + private VariantMask orNullableMasks(VariantMask heteroMask, VariantMask homoMask) { if (heteroMask != null) { if (homoMask != null) { - return heteroMask.or(homoMask); + return heteroMask.union(homoMask); } return heteroMask; } else { diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java index 46093b8b..eab9dc44 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java @@ -1,6 +1,7 @@ package edu.harvard.hms.dbmi.avillach.hpds.processing; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.BucketIndexBySample; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariableVariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantStore; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; @@ -189,7 +190,7 @@ public String[] getPatientIds() { return variantStore.getPatientIds(); } - public Optional getMasks(String variantName, VariantBucketHolder bucketCache) { + public Optional getMasks(String variantName, VariantBucketHolder bucketCache) { try { return variantStore.getMasks(variantName, bucketCache); } catch (IOException e) { diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/genomic/GenomicProcessorRestClient.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/genomic/GenomicProcessorRestClient.java index 33c6a54b..de12dea0 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/genomic/GenomicProcessorRestClient.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/genomic/GenomicProcessorRestClient.java @@ -1,6 +1,8 @@ package edu.harvard.hms.dbmi.avillach.hpds.processing.genomic; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.InfoColumnMeta; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariableVariantMasks; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMask; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; import edu.harvard.hms.dbmi.avillach.hpds.processing.DistributableQuery; @@ -34,23 +36,23 @@ public GenomicProcessorRestClient(String serviceUrl) { } @Override - public Mono getPatientMask(DistributableQuery distributableQuery) { - Mono result = webClient.post() + public Mono getPatientMask(DistributableQuery distributableQuery) { + Mono result = webClient.post() .uri("/patients") .contentType(MediaType.APPLICATION_JSON) .body(Mono.just(distributableQuery), DistributableQuery.class) .retrieve() - .bodyToMono(BigInteger.class); + .bodyToMono(VariantMask.class); return result; } @Override - public Set patientMaskToPatientIdSet(BigInteger patientMask) { + public Set patientMaskToPatientIdSet(VariantMask patientMask) { throw new RuntimeException("Not Implemented"); } @Override - public BigInteger createMaskForPatientSet(Set patientSubset) { + public VariantMask createMaskForPatientSet(Set patientSubset) { throw new RuntimeException("Not Implemented"); } @@ -77,7 +79,7 @@ public List getPatientIds() { } @Override - public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { + public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { throw new RuntimeException("Not Implemented"); } diff --git a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessorTest.java b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessorTest.java index 39cb32e9..229a9957 100644 --- a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessorTest.java +++ b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessorTest.java @@ -2,6 +2,7 @@ import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.FileBackedByteIndexedInfoStore; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMaskBitmaskImpl; import edu.harvard.hms.dbmi.avillach.hpds.data.query.Query; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -58,8 +59,8 @@ public void getPatientSubsetForQuery_oneVariantCategoryFilter_indexFound() { Query.VariantInfoFilter variantInfoFilter = new Query.VariantInfoFilter(); variantInfoFilter.categoryVariantInfoFilters = categoryVariantInfoFilters; - when(genomicProcessor.getPatientMask(isA(DistributableQuery.class))).thenReturn(Mono.just(new BigInteger("1100110011"))); - when(genomicProcessor.patientMaskToPatientIdSet(eq(new BigInteger("1100110011")))).thenReturn(Set.of(42, 99)); + when(genomicProcessor.getPatientMask(isA(DistributableQuery.class))).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("1100110011")))); + when(genomicProcessor.patientMaskToPatientIdSet(eq(new VariantMaskBitmaskImpl(new BigInteger("1100110011"))))).thenReturn(Set.of(42, 99)); List variantInfoFilters = List.of(variantInfoFilter); diff --git a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImplTest.java b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImplTest.java index 0ab4a743..77ecff44 100644 --- a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImplTest.java +++ b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImplTest.java @@ -1,5 +1,7 @@ package edu.harvard.hms.dbmi.avillach.hpds.processing; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMask; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMaskBitmaskImpl; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; @@ -50,15 +52,15 @@ public void patientIdInit_patientsDiffer_exception() { @Test public void getPatientMask_validResponses_returnMerged() { DistributableQuery distributableQuery = new DistributableQuery(); - when(mockProcessor1.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("110110000011", 2))); - when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("110001100011", 2))); - when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("110000000111", 2))); + when(mockProcessor1.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("110110000011", 2)))); + when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("110001100011", 2)))); + when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("110000000111", 2)))); parentProcessor = new GenomicProcessorParentImpl(List.of( mockProcessor1, mockProcessor2, mockProcessor3 )); - BigInteger patientMask = parentProcessor.getPatientMask(distributableQuery).block(); - BigInteger expectedPatientMask = new BigInteger("110111100111", 2); + VariantMask patientMask = parentProcessor.getPatientMask(distributableQuery).block(); + VariantMask expectedPatientMask = new VariantMaskBitmaskImpl(new BigInteger("110111100111", 2)); assertEquals(expectedPatientMask, patientMask); } } \ No newline at end of file diff --git a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImplTest.java b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImplTest.java index 06fe6f81..7a64d718 100644 --- a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImplTest.java +++ b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImplTest.java @@ -1,5 +1,7 @@ package edu.harvard.hms.dbmi.avillach.hpds.processing; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMask; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMaskBitmaskImpl; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; @@ -37,33 +39,33 @@ public void setup() { @Test public void getPatientMask_validResponses_returnMerged() { DistributableQuery distributableQuery = new DistributableQuery(); - when(mockProcessor1.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("11011011", 2))); - when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("110001100011", 2))); - when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("11000111", 2))); - BigInteger patientMask = patientMergingParent.getPatientMask(distributableQuery).block(); - BigInteger expectedPatientMask = new BigInteger("11011000011000000111", 2); + when(mockProcessor1.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("11011011", 2)))); + when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("110001100011", 2)))); + when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("11000111", 2)))); + VariantMask patientMask = patientMergingParent.getPatientMask(distributableQuery).block(); + VariantMask expectedPatientMask = new VariantMaskBitmaskImpl(new BigInteger("11011000011000000111", 2)); assertEquals(expectedPatientMask, patientMask); } @Test public void getPatientMask_noPatientResponses_returnMerged() { DistributableQuery distributableQuery = new DistributableQuery(); - when(mockProcessor1.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("11011011", 2))); - when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("110000000011", 2))); - when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("11000011", 2))); - BigInteger patientMask = patientMergingParent.getPatientMask(distributableQuery).block(); - BigInteger expectedPatientMask = new BigInteger("11011000000000000011", 2); + when(mockProcessor1.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("11011011", 2)))); + when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("110000000011", 2)))); + when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("11000011", 2)))); + VariantMask patientMask = patientMergingParent.getPatientMask(distributableQuery).block(); + VariantMask expectedPatientMask = new VariantMaskBitmaskImpl(new BigInteger("11011000000000000011", 2)); assertEquals(expectedPatientMask, patientMask); } @Test public void getPatientMask_emptyResponses_returnMerged() { DistributableQuery distributableQuery = new DistributableQuery(); - when(mockProcessor1.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("11011011", 2))); - when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("1111", 2))); - when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("11000111", 2))); - BigInteger patientMask = patientMergingParent.getPatientMask(distributableQuery).block(); - BigInteger expectedPatientMask = new BigInteger("110110000111", 2); + when(mockProcessor1.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("11011011", 2)))); + when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("1111", 2)))); + when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("11000111", 2)))); + VariantMask patientMask = patientMergingParent.getPatientMask(distributableQuery).block(); + VariantMask expectedPatientMask = new VariantMaskBitmaskImpl(new BigInteger("110110000111", 2)); assertEquals(expectedPatientMask, patientMask); } diff --git a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandlerTest.java b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandlerTest.java index c2d5f252..7b1b745c 100644 --- a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandlerTest.java +++ b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandlerTest.java @@ -1,5 +1,8 @@ package edu.harvard.hms.dbmi.avillach.hpds.processing; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariableVariantMasks; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMask; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMaskBitmaskImpl; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; @@ -8,6 +11,7 @@ import org.springframework.test.context.event.annotation.BeforeTestClass; import java.math.BigInteger; +import java.util.List; import java.util.Optional; import java.util.Set; import java.util.TreeSet; @@ -39,18 +43,18 @@ public void getPatientIdsForIntersectionOfVariantSets_allPatientsMatchOneVariant when(variantService.getPatientIds()).thenReturn(PATIENT_IDS); when(variantService.emptyBitmask()).thenReturn(emptyBitmask(PATIENT_IDS)); - BigInteger maskForAllPatients = patientVariantJoinHandler.createMaskForPatientSet(PATIENT_IDS_INTEGERS); - BigInteger maskForNoPatients = patientVariantJoinHandler.createMaskForPatientSet(Set.of()); + VariantMask maskForAllPatients = new VariantMaskBitmaskImpl(patientVariantJoinHandler.createMaskForPatientSet(PATIENT_IDS_INTEGERS)); + VariantMask maskForNoPatients = VariantMask.emptyInstance(); - VariantMasks variantMasks = new VariantMasks(new String[0]); + VariableVariantMasks variantMasks = new VariableVariantMasks(); variantMasks.heterozygousMask = maskForAllPatients; - VariantMasks emptyVariantMasks = new VariantMasks(new String[0]); + VariableVariantMasks emptyVariantMasks = new VariableVariantMasks(); emptyVariantMasks.heterozygousMask = maskForNoPatients; when(variantService.getMasks(eq(VARIANT_INDEX[0]), any())).thenReturn(Optional.of(variantMasks)); when(variantService.getMasks(eq(VARIANT_INDEX[2]), any())).thenReturn(Optional.of(emptyVariantMasks)); when(variantService.getMasks(eq(VARIANT_INDEX[4]), any())).thenReturn(Optional.of(emptyVariantMasks)); - Set patientIdsForIntersectionOfVariantSets = patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters)); + Set patientIdsForIntersectionOfVariantSets = VariableVariantMasks.patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters), List.of(PATIENT_IDS)); // this should be all patients, as all patients match one of the variants assertEquals(PATIENT_IDS_INTEGERS, patientIdsForIntersectionOfVariantSets); } @@ -61,18 +65,18 @@ public void getPatientIdsForIntersectionOfVariantSets_allPatientsMatchOneVariant when(variantService.getPatientIds()).thenReturn(PATIENT_IDS); when(variantService.emptyBitmask()).thenReturn(emptyBitmask(PATIENT_IDS)); - BigInteger maskForAllPatients = patientVariantJoinHandler.createMaskForPatientSet(PATIENT_IDS_INTEGERS); - BigInteger maskForNoPatients = patientVariantJoinHandler.createMaskForPatientSet(Set.of()); + VariantMask maskForAllPatients = new VariantMaskBitmaskImpl(patientVariantJoinHandler.createMaskForPatientSet(PATIENT_IDS_INTEGERS)); + VariantMask maskForNoPatients = VariantMask.emptyInstance(); - VariantMasks variantMasks = new VariantMasks(new String[0]); + VariableVariantMasks variantMasks = new VariableVariantMasks(); variantMasks.heterozygousMask = maskForAllPatients; - VariantMasks emptyVariantMasks = new VariantMasks(new String[0]); + VariableVariantMasks emptyVariantMasks = new VariableVariantMasks(); emptyVariantMasks.heterozygousMask = maskForNoPatients; when(variantService.getMasks(eq(VARIANT_INDEX[0]), any())).thenReturn(Optional.of(variantMasks)); when(variantService.getMasks(eq(VARIANT_INDEX[2]), any())).thenReturn(Optional.empty()); when(variantService.getMasks(eq(VARIANT_INDEX[4]), any())).thenReturn(Optional.empty()); - Set patientIdsForIntersectionOfVariantSets = patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters)); + Set patientIdsForIntersectionOfVariantSets = VariableVariantMasks.patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters), List.of(PATIENT_IDS)); // this should be all patients, as all patients match one of the variants assertEquals(PATIENT_IDS_INTEGERS, patientIdsForIntersectionOfVariantSets); } @@ -83,14 +87,14 @@ public void getPatientIdsForIntersectionOfVariantSets_noPatientsMatchVariants() when(variantService.getPatientIds()).thenReturn(PATIENT_IDS); when(variantService.emptyBitmask()).thenReturn(emptyBitmask(PATIENT_IDS)); - BigInteger maskForNoPatients = patientVariantJoinHandler.createMaskForPatientSet(Set.of()); - VariantMasks emptyVariantMasks = new VariantMasks(new String[0]); + VariantMask maskForNoPatients = VariantMask.emptyInstance(); + VariableVariantMasks emptyVariantMasks = new VariableVariantMasks(); emptyVariantMasks.heterozygousMask = maskForNoPatients; when(variantService.getMasks(eq(VARIANT_INDEX[0]), any())).thenReturn(Optional.of(emptyVariantMasks)); when(variantService.getMasks(eq(VARIANT_INDEX[2]), any())).thenReturn(Optional.of(emptyVariantMasks)); when(variantService.getMasks(eq(VARIANT_INDEX[4]), any())).thenReturn(Optional.of(emptyVariantMasks)); - Set patientIdsForIntersectionOfVariantSets = patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(Set.of(), intersectionOfInfoFilters)); + Set patientIdsForIntersectionOfVariantSets = VariableVariantMasks.patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(Set.of(), intersectionOfInfoFilters), List.of(PATIENT_IDS)); // this should be empty because all variants masks have no matching patients assertEquals(Set.of(), patientIdsForIntersectionOfVariantSets); } @@ -101,17 +105,16 @@ public void getPatientIdsForIntersectionOfVariantSets_somePatientsMatchVariants( when(variantService.getPatientIds()).thenReturn(PATIENT_IDS); when(variantService.emptyBitmask()).thenReturn(emptyBitmask(PATIENT_IDS)); - - BigInteger maskForPatients1 = patientVariantJoinHandler.createMaskForPatientSet(Set.of(101, 103)); - BigInteger maskForPatients2 = patientVariantJoinHandler.createMaskForPatientSet(Set.of(103, 105)); - VariantMasks variantMasks = new VariantMasks(new String[0]); + VariantMask maskForPatients1 = new VariantMaskBitmaskImpl(patientVariantJoinHandler.createMaskForPatientSet(Set.of(101, 103))); + VariantMask maskForPatients2 = new VariantMaskBitmaskImpl(patientVariantJoinHandler.createMaskForPatientSet(Set.of(103, 105))); + VariableVariantMasks variantMasks = new VariableVariantMasks(); variantMasks.heterozygousMask = maskForPatients1; - VariantMasks variantMasks2 = new VariantMasks(new String[0]); + VariableVariantMasks variantMasks2 = new VariableVariantMasks(); variantMasks2.heterozygousMask = maskForPatients2; when(variantService.getMasks(eq(VARIANT_INDEX[0]), any())).thenReturn(Optional.of(variantMasks)); when(variantService.getMasks(eq(VARIANT_INDEX[2]), any())).thenReturn(Optional.of(variantMasks2)); - Set patientIdsForIntersectionOfVariantSets = patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters)); + Set patientIdsForIntersectionOfVariantSets = VariableVariantMasks.patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters), List.of(PATIENT_IDS)); // this should be all patients who match at least one variant assertEquals(Set.of(101, 103, 105), patientIdsForIntersectionOfVariantSets); } @@ -121,7 +124,7 @@ public void getPatientIdsForIntersectionOfVariantSets_noVariants() { VariantIndex intersectionOfInfoFilters = VariantIndex.empty(); when(variantService.getPatientIds()).thenReturn(PATIENT_IDS); - Set patientIdsForIntersectionOfVariantSets = patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters)); + Set patientIdsForIntersectionOfVariantSets = VariableVariantMasks.patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters), List.of(PATIENT_IDS)); // this should be empty, as there are no variants assertEquals(Set.of(), patientIdsForIntersectionOfVariantSets); } @@ -132,16 +135,16 @@ public void getPatientIdsForIntersectionOfVariantSets_patientSubsetPassed() { when(variantService.getPatientIds()).thenReturn(PATIENT_IDS); when(variantService.emptyBitmask()).thenReturn(emptyBitmask(PATIENT_IDS)); - BigInteger maskForPatients1 = patientVariantJoinHandler.createMaskForPatientSet(Set.of(101, 103, 105)); - BigInteger maskForPatients2 = patientVariantJoinHandler.createMaskForPatientSet(Set.of(103, 105, 107)); - VariantMasks variantMasks = new VariantMasks(new String[0]); + VariantMask maskForPatients1 = new VariantMaskBitmaskImpl(patientVariantJoinHandler.createMaskForPatientSet(Set.of(101, 103, 105))); + VariantMask maskForPatients2 = new VariantMaskBitmaskImpl(patientVariantJoinHandler.createMaskForPatientSet(Set.of(103, 105, 107))); + VariableVariantMasks variantMasks = new VariableVariantMasks(); variantMasks.heterozygousMask = maskForPatients1; - VariantMasks variantMasks2 = new VariantMasks(new String[0]); + VariableVariantMasks variantMasks2 = new VariableVariantMasks(); variantMasks2.heterozygousMask = maskForPatients2; when(variantService.getMasks(eq(VARIANT_INDEX[0]), any())).thenReturn(Optional.of(variantMasks)); when(variantService.getMasks(eq(VARIANT_INDEX[2]), any())).thenReturn(Optional.of(variantMasks2)); - Set patientIdsForIntersectionOfVariantSets = patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(Set.of(102, 103, 104, 105, 106), intersectionOfInfoFilters)); + Set patientIdsForIntersectionOfVariantSets = VariableVariantMasks.patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(Set.of(102, 103, 104, 105, 106), intersectionOfInfoFilters), List.of(PATIENT_IDS)); // this should be the union of patients matching variants (101, 103, 105, 107), intersected with the patient subset parameter (103, 104, 105) which is (103, 105) assertEquals(Set.of(103, 105), patientIdsForIntersectionOfVariantSets); } @@ -153,16 +156,4 @@ public BigInteger emptyBitmask(String[] patientIds) { } return new BigInteger("11" + emptyVariantMask + "11", 2); } - - public Set patientMaskToPatientIdSet(BigInteger patientMask) { - Set ids = new TreeSet(); - String bitmaskString = patientMask.toString(2); - for(int x = 2;x < bitmaskString.length()-2;x++) { - if('1'==bitmaskString.charAt(x)) { - String patientId = variantService.getPatientIds()[x-2].trim(); - ids.add(Integer.parseInt(patientId)); - } - } - return ids; - } } \ No newline at end of file diff --git a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/genomic/GenomicProcessorRestClientTest.java b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/genomic/GenomicProcessorRestClientTest.java index 9381ac76..5ded227e 100644 --- a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/genomic/GenomicProcessorRestClientTest.java +++ b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/genomic/GenomicProcessorRestClientTest.java @@ -1,6 +1,7 @@ package edu.harvard.hms.dbmi.avillach.hpds.processing.genomic; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.InfoColumnMeta; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMask; import edu.harvard.hms.dbmi.avillach.hpds.data.query.Query; import edu.harvard.hms.dbmi.avillach.hpds.processing.DistributableQuery; import org.junit.jupiter.api.Disabled; @@ -36,7 +37,7 @@ public void simpleTest() { Set patientIds = IntStream.range(53000, 53635).boxed().collect(Collectors.toSet()); distributableQuery.setPatientIds(patientIds); - BigInteger patientMaskForVariantInfoFilters = genomicProcessorRestClient.getPatientMask(distributableQuery).block(); + VariantMask patientMaskForVariantInfoFilters = genomicProcessorRestClient.getPatientMask(distributableQuery).block(); System.out.println(patientMaskForVariantInfoFilters); } From 7e4a3f1027c2842c04448c233d9ef7fdcc375570 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 21 Feb 2024 21:23:40 -0500 Subject: [PATCH 06/35] ALS-5819: Save empty variant masks as null --- .../dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java index 5d8efd3d..09ecb30f 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java @@ -78,7 +78,7 @@ public VariableVariantMasks(char[][] maskValues) { private VariantMask variantMaskFromRawString(String maskStringRaw) { if (!maskStringRaw.contains("1")) { - return new VariantMaskSparseImpl(Set.of()); + return null; } VariantMask variantMask; From c9a75792d5b65ff1c07af03d82c6881ae0da6b12 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 21 Feb 2024 21:51:42 -0500 Subject: [PATCH 07/35] ALS-5819: Do not serialize empty variant masks --- .../data/genotype/VariableVariantMasks.java | 22 ++++++++----------- .../data/genotype/VariantMaskSparseImpl.java | 14 ++++++++++++ 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java index 09ecb30f..ea259f49 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java @@ -1,5 +1,6 @@ package edu.harvard.hms.dbmi.avillach.hpds.data.genotype; +import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; import java.io.Serializable; @@ -105,33 +106,36 @@ public VariableVariantMasks(int length) { } @JsonProperty("ho") + @JsonInclude(JsonInclude.Include.NON_NULL) public VariantMask homozygousMask; @JsonProperty("he") + @JsonInclude(JsonInclude.Include.NON_NULL) public VariantMask heterozygousMask; @JsonProperty("hon") + @JsonInclude(JsonInclude.Include.NON_NULL) public VariantMask homozygousNoCallMask; @JsonProperty("hen") + @JsonInclude(JsonInclude.Include.NON_NULL) public VariantMask heterozygousNoCallMask; @JsonProperty("l") public int length; - @Override public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; VariableVariantMasks that = (VariableVariantMasks) o; - return Objects.equals(homozygousMask, that.homozygousMask) && Objects.equals(heterozygousMask, that.heterozygousMask) && Objects.equals(homozygousNoCallMask, that.homozygousNoCallMask) && Objects.equals(heterozygousNoCallMask, that.heterozygousNoCallMask); + return length == that.length && Objects.equals(homozygousMask, that.homozygousMask) && Objects.equals(heterozygousMask, that.heterozygousMask) && Objects.equals(homozygousNoCallMask, that.homozygousNoCallMask) && Objects.equals(heterozygousNoCallMask, that.heterozygousNoCallMask); } @Override public int hashCode() { - return Objects.hash(homozygousMask, heterozygousMask, homozygousNoCallMask, heterozygousNoCallMask); + return Objects.hash(homozygousMask, heterozygousMask, homozygousNoCallMask, heterozygousNoCallMask, length); } public static BigInteger emptyBitmask(int length) { @@ -248,14 +252,6 @@ public static Set patientMaskToPatientIdSet(VariantMask patientMask, Li throw new IllegalArgumentException("Unknown VariantMask implementation"); } -/* if (mask1 == null) { - mask1 = variantStore1.emptyBitmask(); - } - if (mask2 == null) { - mask2 = variantStore2.emptyBitmask(); - } - String binaryMask1 = mask1.toString(2); - String binaryMask2 = mask2.toString(2); - String appendedString = binaryMask1.substring(0, binaryMask1.length() - 2) + - binaryMask2.substring(2);*/ + + } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java index 215f5088..0c6c5756 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java @@ -3,6 +3,7 @@ import com.fasterxml.jackson.annotation.JsonProperty; import java.util.HashSet; +import java.util.Objects; import java.util.Set; public class VariantMaskSparseImpl implements VariantMask { @@ -49,4 +50,17 @@ private VariantMask union(VariantMaskSparseImpl variantMask) { union.addAll(this.patientIndexes); return new VariantMaskSparseImpl(union); } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + VariantMaskSparseImpl that = (VariantMaskSparseImpl) o; + return Objects.equals(patientIndexes, that.patientIndexes); + } + + @Override + public int hashCode() { + return Objects.hash(patientIndexes); + } } From bdfb093c08cf1e40ce4cee2de14cc6e19e5eb12d Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Mon, 26 Feb 2024 09:24:13 -0500 Subject: [PATCH 08/35] ALS-5819: Implement intersection. Some refactoring and unit testing too --- .../data/genotype/VariableVariantMasks.java | 57 +++++++------ .../hpds/data/genotype/VariantMask.java | 9 +- .../data/genotype/VariantMaskBitmaskImpl.java | 17 +++- .../data/genotype/VariantMaskSparseImpl.java | 16 +++- .../hpds/data/genotype/VariantMaskTest.java | 83 +++++++++++++++++++ 5 files changed, 145 insertions(+), 37 deletions(-) create mode 100644 data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskTest.java diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java index ea259f49..a5cf446f 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java @@ -152,28 +152,6 @@ public static BigInteger emptyBitmask(int length) { return emptyBitmask; } - /** - * Appends one mask to another. This assumes the masks are both padded with '11' on each end - * to prevent overflow issues. - */ - public BigInteger appendMask(BigInteger mask1, int mask1Length, BigInteger mask2, int mask2length) { - if (mask1 == null && mask2 == null) { - return null; - } - if (mask1 == null) { - // todo: unit test this funcitonality - mask1 = emptyBitmask(mask1Length); - } - if (mask2 == null) { - mask2 = emptyBitmask(mask2length); - } - String binaryMask1 = mask1.toString(2); - String binaryMask2 = mask2.toString(2); - String appendedString = binaryMask1.substring(0, binaryMask1.length() - 2) + - binaryMask2.substring(2); - return new BigInteger(appendedString, 2); - } - public VariableVariantMasks append(VariableVariantMasks variantMasks) { VariableVariantMasks appendedMasks = new VariableVariantMasks(); appendedMasks.homozygousMask = appendMask(this.homozygousMask, variantMasks.homozygousMask, this.length, variantMasks.length); @@ -193,7 +171,15 @@ public static VariantMask appendMask(VariantMask variantMask1, VariantMask varia throw new RuntimeException("Unknown VariantMask implementation"); } } - // todo: bitmask + else if (variantMask1 instanceof VariantMaskBitmaskImpl) { + if (variantMask2 instanceof VariantMaskSparseImpl) { + return append((VariantMaskBitmaskImpl) variantMask1, (VariantMaskSparseImpl) variantMask2, length1, length2); + } else if (variantMask2 instanceof VariantMaskBitmaskImpl) { + return append((VariantMaskBitmaskImpl) variantMask1, (VariantMaskBitmaskImpl) variantMask2, length1, length2); + } else { + throw new RuntimeException("Unknown VariantMask implementation"); + } + } else { throw new RuntimeException("Unknown VariantMask implementation"); } @@ -210,9 +196,32 @@ private static VariantMask append(VariantMaskSparseImpl variantMask1, VariantMas binaryMask1.substring(2); return new VariantMaskBitmaskImpl(new BigInteger(appendedString, 2)); } + + private static VariantMask append(VariantMaskBitmaskImpl variantMask1, VariantMaskSparseImpl variantMask2, int length1, int length2) { + String binaryMask1 = variantMask1.bitmask.toString(2); + + BigInteger mask2 = emptyBitmask(length2); + for (Integer patientId : variantMask2.patientIndexes) { + mask2 = mask2.setBit(patientId); + } + String binaryMask2 = mask2.toString(2); + + String appendedString = binaryMask2.substring(0, binaryMask1.length() - 2) + + binaryMask1.substring(2); + return new VariantMaskBitmaskImpl(new BigInteger(appendedString, 2)); + } + + private static VariantMask append(VariantMaskBitmaskImpl variantMask1, VariantMaskBitmaskImpl variantMask2, int length1, int length2) { + String binaryMask1 = variantMask1.bitmask.toString(2); + String binaryMask2 = variantMask2.bitmask.toString(2); + + String appendedString = binaryMask2.substring(0, binaryMask1.length() - 2) + + binaryMask1.substring(2); + return new VariantMaskBitmaskImpl(new BigInteger(appendedString, 2)); + } + private static VariantMask append(VariantMaskSparseImpl variantMask1, VariantMaskSparseImpl variantMask2, int length1, int length2) { if (variantMask1.patientIndexes.size() + variantMask2.patientIndexes.size() > SPARSE_VARIANT_THRESHOLD) { - // todo: performance test this vs byte array BigInteger mask = emptyBitmask(length1 + length2); for (Integer patientId : variantMask1.patientIndexes) { mask = mask.setBit(patientId + 2); diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java index 46b24e6a..c170dce6 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java @@ -5,6 +5,7 @@ import java.math.BigInteger; import java.util.Set; +import java.util.stream.Collectors; @JsonTypeInfo( use = JsonTypeInfo.Id.NAME, @@ -29,12 +30,4 @@ public interface VariantMask { static VariantMask emptyInstance() { return new VariantMaskSparseImpl(Set.of()); } - - static VariantMask union(VariantMaskSparseImpl variantMaskSparse, VariantMaskBitmaskImpl variantMaskBitmask) { - BigInteger union = variantMaskBitmask.bitmask; - for (Integer patientId : variantMaskSparse.patientIndexes) { - union = union.setBit(patientId + 2); - } - return new VariantMaskBitmaskImpl(union); - } } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java index 532731bc..465528a5 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java @@ -26,7 +26,13 @@ public VariantMaskBitmaskImpl(@JsonProperty("mask") BigInteger bitmask) { @Override public VariantMask intersection(VariantMask variantMask) { - throw new RuntimeException("Not implemented yet"); + if (variantMask instanceof VariantMaskBitmaskImpl) { + return intersection((VariantMaskBitmaskImpl) variantMask); + } else if (variantMask instanceof VariantMaskSparseImpl) { + return variantMask.intersection(this); + } else { + throw new RuntimeException("Unknown VariantMask implementation"); + } } @Override @@ -34,7 +40,7 @@ public VariantMask union(VariantMask variantMask) { if (variantMask instanceof VariantMaskBitmaskImpl) { return union((VariantMaskBitmaskImpl) variantMask); } else if (variantMask instanceof VariantMaskSparseImpl) { - return VariantMask.union((VariantMaskSparseImpl) variantMask, this); + return variantMask.union(this); } else { throw new RuntimeException("Unknown VariantMask implementation"); } @@ -42,7 +48,7 @@ public VariantMask union(VariantMask variantMask) { @Override public boolean testBit(int bit) { - return bitmask.testBit(bit); + return bitmask.testBit(bit + 2); } @Override @@ -51,6 +57,11 @@ public int bitCount() { } private VariantMask union(VariantMaskBitmaskImpl variantMaskBitmask) { + return new VariantMaskBitmaskImpl(variantMaskBitmask.bitmask.or(this.bitmask)); + } + private VariantMask intersection(VariantMaskBitmaskImpl variantMaskBitmask) { + // we could consider using a sparse variant index here if we are ever going to be storing the + // result of this anywhere return new VariantMaskBitmaskImpl(variantMaskBitmask.bitmask.and(this.bitmask)); } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java index 0c6c5756..0b72c63a 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java @@ -2,9 +2,11 @@ import com.fasterxml.jackson.annotation.JsonProperty; +import java.math.BigInteger; import java.util.HashSet; import java.util.Objects; import java.util.Set; +import java.util.stream.Collectors; public class VariantMaskSparseImpl implements VariantMask { @@ -21,13 +23,15 @@ public Set getPatientIndexes() { @Override public VariantMask intersection(VariantMask variantMask) { - throw new RuntimeException("Not implemented yet"); + return new VariantMaskSparseImpl(this.patientIndexes.stream() + .filter(variantMask::testBit) + .collect(Collectors.toSet())); } @Override public VariantMask union(VariantMask variantMask) { if (variantMask instanceof VariantMaskBitmaskImpl) { - return VariantMask.union(this, (VariantMaskBitmaskImpl) variantMask); + return union((VariantMaskBitmaskImpl) variantMask); } else if (variantMask instanceof VariantMaskSparseImpl) { return union((VariantMaskSparseImpl) variantMask); } else { @@ -51,6 +55,14 @@ private VariantMask union(VariantMaskSparseImpl variantMask) { return new VariantMaskSparseImpl(union); } + private VariantMask union(VariantMaskBitmaskImpl variantMaskBitmask) { + BigInteger union = variantMaskBitmask.bitmask; + for (Integer patientId : this.patientIndexes) { + union = union.setBit(patientId + 2); + } + return new VariantMaskBitmaskImpl(union); + } + @Override public boolean equals(Object o) { if (this == o) return true; diff --git a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskTest.java b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskTest.java new file mode 100644 index 00000000..d6ad8b4a --- /dev/null +++ b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskTest.java @@ -0,0 +1,83 @@ +package edu.harvard.hms.dbmi.avillach.hpds.data.genotype; + +import org.junit.jupiter.api.Test; + +import java.math.BigInteger; +import java.util.Set; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class VariantMaskTest { + + @Test + public void intersection_bitmaskVsBitmask() { + VariantMask mask1 = new VariantMaskBitmaskImpl(new BigInteger("111001100011", 2)); + VariantMask mask2 = new VariantMaskBitmaskImpl(new BigInteger("111010010011", 2)); + VariantMask expected = new VariantMaskBitmaskImpl(new BigInteger("111000000011", 2)); + + assertEquals(expected, mask1.intersection(mask2)); + } + @Test + public void intersection_bitmaskVsSparse() { + // this is essentially a mask for patients 0, 2, 3, 7 (there is 11 padding on both ends) + VariantMask mask1 = new VariantMaskBitmaskImpl(new BigInteger("111000110111", 2)); + VariantMask mask2 = new VariantMaskSparseImpl(Set.of(0, 3, 6)); + VariantMask expected = new VariantMaskSparseImpl(Set.of(0, 3)); + + assertEquals(expected, mask1.intersection(mask2)); + } + @Test + public void intersection_sparseVsBitmask() { + VariantMask mask1 = new VariantMaskSparseImpl(Set.of(4, 7)); + VariantMask mask2 = new VariantMaskBitmaskImpl(new BigInteger("110111110111", 2)); + VariantMask expected = new VariantMaskSparseImpl(Set.of(4)); + + assertEquals(expected, mask1.intersection(mask2)); + } + + @Test + public void intersection_sparseVsSparse() { + VariantMask mask1 = new VariantMaskSparseImpl(Set.of(0, 2, 4, 6)); + VariantMask mask2 = new VariantMaskSparseImpl(Set.of(0, 1, 3, 5, 7)); + VariantMask expected = new VariantMaskSparseImpl(Set.of(0)); + + assertEquals(expected, mask1.intersection(mask2)); + } + + @Test + public void union_bitmaskVsBitmask() { + VariantMask mask1 = new VariantMaskBitmaskImpl(new BigInteger("111001100011", 2)); + VariantMask mask2 = new VariantMaskBitmaskImpl(new BigInteger("111010010011", 2)); + VariantMask expected = new VariantMaskBitmaskImpl(new BigInteger("111011110011", 2)); + + assertEquals(expected, mask1.union(mask2)); + } + + @Test + public void union_bitmaskVsSparse() { + // this is essentially a mask for patients 0, 2, 3, 7 (there is 11 padding on both ends) + VariantMask mask1 = new VariantMaskBitmaskImpl(new BigInteger("111000110111", 2)); + VariantMask mask2 = new VariantMaskSparseImpl(Set.of(0, 3, 6)); + VariantMask expected = new VariantMaskBitmaskImpl(new BigInteger("111100110111", 2)); + + assertEquals(expected, mask1.union(mask2)); + } + + @Test + public void union_sparseVsBitmask() { + VariantMask mask1 = new VariantMaskSparseImpl(Set.of(4, 7)); + VariantMask mask2 = new VariantMaskBitmaskImpl(new BigInteger("110111110111", 2)); + VariantMask expected = new VariantMaskBitmaskImpl(new BigInteger("111111110111", 2)); + + assertEquals(expected, mask1.union(mask2)); + } + + @Test + public void union_sparseVsSparse() { + VariantMask mask1 = new VariantMaskSparseImpl(Set.of(0, 2, 4, 6)); + VariantMask mask2 = new VariantMaskSparseImpl(Set.of(1, 5, 7)); + VariantMask expected = new VariantMaskSparseImpl(Set.of(0, 1, 2, 4, 5, 6, 7)); + + assertEquals(expected, mask1.union(mask2)); + } +} From 2d6d3e79f4170d1807a8db4fa40298ff7f93ed6f Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Mon, 26 Feb 2024 10:46:17 -0500 Subject: [PATCH 09/35] ALS-5981: Add gene and consequence to variant spec --- .../hpds/data/genotype/VariantSpec.java | 62 +++++++++++++++++-- .../hpds/etl/genotype/NewVCFLoader.java | 13 +++- 2 files changed, 68 insertions(+), 7 deletions(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantSpec.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantSpec.java index b244e407..83359789 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantSpec.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantSpec.java @@ -1,6 +1,7 @@ package edu.harvard.hms.dbmi.avillach.hpds.data.genotype; import java.io.Serializable; +import java.util.Objects; import org.apache.commons.csv.CSVRecord; @@ -13,7 +14,22 @@ public class VariantCoords implements Serializable { public String ref; public String alt; public int qual; - public String format; + public String format; + public String gene; + public String consequence; + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + VariantCoords that = (VariantCoords) o; + return qual == that.qual && Objects.equals(chromosome, that.chromosome) && Objects.equals(offset, that.offset) && Objects.equals(name, that.name) && Objects.equals(ref, that.ref) && Objects.equals(alt, that.alt) && Objects.equals(format, that.format) && Objects.equals(gene, that.gene) && Objects.equals(consequence, that.consequence); + } + + @Override + public int hashCode() { + return Objects.hash(chromosome, offset, name, ref, alt, qual, format, gene, consequence); + } } public static int CHR = 0, OFF = 1, NAME = 2, REF = 3, ALT = 4, QUAL = 5, FILTER = 6, INFO = 7, FORMAT = 8, DATA = 9; public long heteroOffset; @@ -32,6 +48,20 @@ public VariantSpec(CSVRecord r) { }catch(NumberFormatException e) { this.metadata.qual = -1; } + + String[] variantInfo = r.get(INFO).split("[=;]"); + String gene = "NULL"; + String consequence = "NULL"; + for (int i = 0; i < variantInfo.length; i = i + 2) { + if ("Gene_with_variant".equals(variantInfo[i])) { + gene = variantInfo[i + 1]; + } + if ("Variant_consequence_calculated".equals(variantInfo[i])) { + consequence = variantInfo[i + 1]; + } + } + this.metadata.gene = gene; + this.metadata.consequence = consequence; } public VariantSpec(String variant) { @@ -43,28 +73,48 @@ public VariantSpec(String variant) { this.metadata.ref = segments[2]; this.metadata.alt = segments[3]; this.metadata.qual = -1; + this.metadata.gene = segments[4]; + this.metadata.consequence = segments[5]; } public String specNotation() { return this.metadata.chromosome + "," + this.metadata.offset + "," + - this.metadata.ref + "," + this.metadata.alt; + this.metadata.ref + "," + this.metadata.alt + "," + this.metadata.gene + "," + this.metadata.consequence; } - @Override public int compareTo(VariantSpec o) { int ret = 0; ret = this.metadata.chromosome.compareTo(o.metadata.chromosome); - if(ret == 0) { + if (ret == 0) { ret = this.metadata.offset.compareTo(o.metadata.offset); } - if(ret == 0) { + if (ret == 0) { ret = this.metadata.ref.compareTo(o.metadata.ref); } - if(ret == 0) { + if (ret == 0) { ret = this.metadata.alt.compareTo(o.metadata.alt); } + if (ret == 0) { + ret = this.metadata.gene.compareTo(o.metadata.gene); + } + if (ret == 0) { + ret = this.metadata.consequence.compareTo(o.metadata.consequence); + } + return ret; } + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + VariantSpec that = (VariantSpec) o; + return heteroOffset == that.heteroOffset && homoOffset == that.homoOffset && Objects.equals(metadata, that.metadata); + } + + @Override + public int hashCode() { + return Objects.hash(heteroOffset, homoOffset, metadata); + } } \ No newline at end of file diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java index 43f6c351..996c4078 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java @@ -470,8 +470,19 @@ private void setMasksForSample(char[][] zygosityMaskStrings, int index, int star } private String currentSpecNotation() { + String[] variantInfo = currentLineSplit[7].split("[=;]"); + String gene = "NULL"; + String consequence = "NULL"; + for (int i = 0; i < variantInfo.length; i = i + 2) { + if ("Gene_with_variant".equals(variantInfo[i])) { + gene = variantInfo[i + 1]; + } + if ("Variant_consequence_calculated".equals(variantInfo[i])) { + consequence = variantInfo[i + 1]; + } + } return currentLineSplit[0] + "," + currentLineSplit[1] + "," + currentLineSplit[3] + "," - + currentLineSplit[4]; + + currentLineSplit[4] + "," + gene + "," + consequence; } public void readHeaders(ConcurrentHashMap infoStores) throws IOException { From eb83a2cc7bc59597f2659b34d5f0b4da654075fe Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 28 Feb 2024 14:48:03 -0500 Subject: [PATCH 10/35] ALS-5981: Handle null variant masks in merge0 --- .../avillach/hpds/data/genotype/VariableVariantMasks.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java index a5cf446f..0bfc8d59 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java @@ -162,6 +162,12 @@ public VariableVariantMasks append(VariableVariantMasks variantMasks) { } public static VariantMask appendMask(VariantMask variantMask1, VariantMask variantMask2, int length1, int length2) { + if (variantMask1 == null) { + variantMask1 = VariantMask.emptyInstance(); + } + if (variantMask2 == null) { + variantMask2 = VariantMask.emptyInstance(); + } if (variantMask1 instanceof VariantMaskSparseImpl) { if (variantMask2 instanceof VariantMaskSparseImpl) { return append((VariantMaskSparseImpl) variantMask1, (VariantMaskSparseImpl) variantMask2, length1, length2); From 5a3b64aa6069334f99e29feabd0450a7c6ef89bc Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 28 Feb 2024 15:06:59 -0500 Subject: [PATCH 11/35] ALS-5981: Fix refactor typo --- .../avillach/hpds/data/genotype/VariableVariantMasks.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java index 0bfc8d59..0bec6b6d 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java @@ -198,7 +198,7 @@ private static VariantMask append(VariantMaskSparseImpl variantMask1, VariantMas } String binaryMask1 = mask1.toString(2); String binaryMask2 = variantMask2.bitmask.toString(2); - String appendedString = binaryMask2.substring(0, binaryMask1.length() - 2) + + String appendedString = binaryMask2.substring(0, binaryMask2.length() - 2) + binaryMask1.substring(2); return new VariantMaskBitmaskImpl(new BigInteger(appendedString, 2)); } @@ -212,7 +212,7 @@ private static VariantMask append(VariantMaskBitmaskImpl variantMask1, VariantMa } String binaryMask2 = mask2.toString(2); - String appendedString = binaryMask2.substring(0, binaryMask1.length() - 2) + + String appendedString = binaryMask2.substring(0, binaryMask2.length() - 2) + binaryMask1.substring(2); return new VariantMaskBitmaskImpl(new BigInteger(appendedString, 2)); } @@ -221,7 +221,7 @@ private static VariantMask append(VariantMaskBitmaskImpl variantMask1, VariantMa String binaryMask1 = variantMask1.bitmask.toString(2); String binaryMask2 = variantMask2.bitmask.toString(2); - String appendedString = binaryMask2.substring(0, binaryMask1.length() - 2) + + String appendedString = binaryMask2.substring(0, binaryMask2.length() - 2) + binaryMask1.substring(2); return new VariantMaskBitmaskImpl(new BigInteger(appendedString, 2)); } From 2a928e2a5d494978e42e5f4bf529acccbe7aaaab Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 28 Feb 2024 20:25:07 -0500 Subject: [PATCH 12/35] Set genomic config --- service/src/main/resources/application-development.properties | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/service/src/main/resources/application-development.properties b/service/src/main/resources/application-development.properties index 860f5d64..16c335d2 100644 --- a/service/src/main/resources/application-development.properties +++ b/service/src/main/resources/application-development.properties @@ -2,5 +2,5 @@ SMALL_JOB_LIMIT = 100 SMALL_TASK_THREADS = 1 LARGE_TASK_THREADS = 1 -hpds.genomicProcessor.impl=localPatientDistributed +hpds.genomicProcessor.impl=localDistributed HPDS_GENOMIC_DATA_DIRECTORY=/opt/local/hpds/all/ \ No newline at end of file From 2b4b29ea441d1998f69ab8e8fe55bf51a9033180 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Thu, 29 Feb 2024 10:49:20 -0500 Subject: [PATCH 13/35] ALS-5981: Remove parallelization that may be causing a race condition losing data --- .../dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index 09501345..29dcaf15 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -117,7 +117,7 @@ public Map mergeVariantIndexes() throws throw new IllegalStateException("Info stores do not match"); } //for (Map.Entry infoStores1Entry : infoStores1.entrySet()) { - infoStores1.entrySet().parallelStream().forEach(infoStores1Entry -> { + infoStores1.entrySet().stream().forEach(infoStores1Entry -> { FileBackedByteIndexedInfoStore infoStore2 = infoStores2.get(infoStores1Entry.getKey()); FileBackedByteIndexedStorage allValuesStore1 = infoStores1Entry.getValue().getAllValues(); @@ -222,7 +222,7 @@ public FileBackedJsonIndexStorage> variantMaskStorage2 = variantStore2.getVariantMaskStorage().get(chromosome); FileBackedJsonIndexStorage> merged = new FileBackedStorageVariantMasksImpl(new File(outputDirectory + chromosome + "masks.bin")); - variantMaskStorage1.keys().parallelStream().forEach(key -> { + variantMaskStorage1.keys().stream().forEach(key -> { Map masks1 = variantMaskStorage1.get(key); Map masks2 = variantMaskStorage2.get(key); if (masks2 == null) { From 3ef57d05bfdfeb75d58c0b65123752c729ab8976 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Thu, 29 Feb 2024 15:21:53 -0500 Subject: [PATCH 14/35] ALS-5981: Fix possible bug where small amount of data is being dropped --- .../hpds/etl/genotype/GenomicDatasetMerger.java | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index 29dcaf15..ea94f7c7 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -249,13 +249,16 @@ public FileBackedJsonIndexStorage mergedMasks = new ConcurrentHashMap<>(); variantMaskStorage2.keys().forEach(key -> { - Map masks2 = variantMaskStorage2.get(key); - for (Map.Entry entry : masks2.entrySet()) { - if (!mergedMasks.containsKey(entry.getKey())) { - mergedMasks.put(entry.getKey(), new VariableVariantMasks(variantStore1.getPatientIds().length).append(entry.getValue())); + if (variantMaskStorage1.get(key) == null) { + ConcurrentHashMap mergedMasks = new ConcurrentHashMap<>(); + Map masks2 = variantMaskStorage2.get(key); + for (Map.Entry entry : masks2.entrySet()) { + if (!mergedMasks.containsKey(entry.getKey())) { + mergedMasks.put(entry.getKey(), new VariableVariantMasks(variantStore1.getPatientIds().length).append(entry.getValue())); + } } + merged.put(key, mergedMasks); } }); return merged; From c0a944508c1b051d29428530fe2ffe67f2cf5b11 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Fri, 1 Mar 2024 06:38:36 -0500 Subject: [PATCH 15/35] Add loging --- .../dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index ea94f7c7..ae2c580a 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -226,6 +226,7 @@ public FileBackedJsonIndexStorage masks1 = variantMaskStorage1.get(key); Map masks2 = variantMaskStorage2.get(key); if (masks2 == null) { + log.info("Key " + key + " not found in dataset 2"); masks2 = Map.of(); } @@ -251,6 +252,7 @@ public FileBackedJsonIndexStorage { if (variantMaskStorage1.get(key) == null) { + log.info("Key " + key + " not found in dataset 1"); ConcurrentHashMap mergedMasks = new ConcurrentHashMap<>(); Map masks2 = variantMaskStorage2.get(key); for (Map.Entry entry : masks2.entrySet()) { From 90dcb1e94342a88f3287a1606712f52d0155dd1e Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Fri, 1 Mar 2024 10:34:37 -0500 Subject: [PATCH 16/35] ALS-5981: Fix bigint deserialization issue --- .../avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java index 465528a5..7e5fc81a 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java @@ -20,7 +20,11 @@ public BigInteger getBitmask() { } @JsonCreator - public VariantMaskBitmaskImpl(@JsonProperty("mask") BigInteger bitmask) { + public VariantMaskBitmaskImpl(@JsonProperty("mask") String stringMask) { + this.bitmask = new BigInteger(stringMask); + } + + public VariantMaskBitmaskImpl(BigInteger bitmask) { this.bitmask = bitmask; } From 7a82cd5564282a1d3fede87c7cf9bd4a557f3aa1 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 13 Mar 2024 16:41:10 -0300 Subject: [PATCH 17/35] Add logging --- .../hpds/etl/genotype/GenomicDatasetMerger.java | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index ae2c580a..e86da052 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -226,7 +226,6 @@ public FileBackedJsonIndexStorage masks1 = variantMaskStorage1.get(key); Map masks2 = variantMaskStorage2.get(key); if (masks2 == null) { - log.info("Key " + key + " not found in dataset 2"); masks2 = Map.of(); } @@ -247,12 +246,15 @@ public FileBackedJsonIndexStorage { if (variantMaskStorage1.get(key) == null) { - log.info("Key " + key + " not found in dataset 1"); ConcurrentHashMap mergedMasks = new ConcurrentHashMap<>(); Map masks2 = variantMaskStorage2.get(key); for (Map.Entry entry : masks2.entrySet()) { @@ -260,7 +262,11 @@ public FileBackedJsonIndexStorage Date: Thu, 14 Mar 2024 15:06:08 -0300 Subject: [PATCH 18/35] Add logging to debug missing variants --- .../avillach/hpds/etl/genotype/GenomicDatasetMerger.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index e86da052..a7768b49 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -1,5 +1,6 @@ package edu.harvard.hms.dbmi.avillach.hpds.etl.genotype; +import com.google.common.base.Joiner; import com.google.common.collect.Sets; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.*; import edu.harvard.hms.dbmi.avillach.hpds.data.storage.FileBackedStorageVariantMasksImpl; @@ -249,6 +250,9 @@ public FileBackedJsonIndexStorage Date: Thu, 14 Mar 2024 15:11:49 -0300 Subject: [PATCH 19/35] Save empty variant masks as null --- .../avillach/hpds/data/genotype/VariableVariantMasks.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java index 0bec6b6d..6d831223 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java @@ -1,5 +1,6 @@ package edu.harvard.hms.dbmi.avillach.hpds.data.genotype; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; @@ -121,6 +122,7 @@ public VariableVariantMasks(int length) { @JsonInclude(JsonInclude.Include.NON_NULL) public VariantMask heterozygousNoCallMask; + @JsonIgnore @JsonProperty("l") public int length; @@ -168,6 +170,10 @@ public static VariantMask appendMask(VariantMask variantMask1, VariantMask varia if (variantMask2 == null) { variantMask2 = VariantMask.emptyInstance(); } + if (variantMask1.equals(VariantMask.emptyInstance()) && variantMask2.equals(VariantMask.emptyInstance())) { + return null; + } + if (variantMask1 instanceof VariantMaskSparseImpl) { if (variantMask2 instanceof VariantMaskSparseImpl) { return append((VariantMaskSparseImpl) variantMask1, (VariantMaskSparseImpl) variantMask2, length1, length2); From 3cd01c31da594ac80b34924d2ba869a55837b225 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Sat, 16 Mar 2024 10:28:05 -0300 Subject: [PATCH 20/35] Handle null variant masks --- .../processing/PatientVariantJoinHandler.java | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java index 01d11150..5c504b4d 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java @@ -81,12 +81,18 @@ public VariantMask getPatientIdsForIntersectionOfVariantSets(Set patien variantBucket.forEach(variantSpec -> { Optional variantMask = variantService.getMasks(variantSpec, bucketCache); variantMask.ifPresentOrElse(masks -> { - VariantMask heteroMask = masks.heterozygousMask; - VariantMask homoMask = masks.homozygousMask; - VariantMask orMasks = heteroMask.union(homoMask); + VariantMask heterozygousMask = masks.heterozygousMask; + VariantMask homozygousMask = masks.homozygousMask; //log.info("Patients with variant " + variantSpec + ": " + (orMasks.bitCount() - 4)); - synchronized(matchingPatients) { - matchingPatients[0] = matchingPatients[0].union(orMasks); + if (heterozygousMask != null) { + synchronized(matchingPatients) { + matchingPatients[0] = matchingPatients[0].union(heterozygousMask); + } + } + if (homozygousMask != null) { + synchronized(matchingPatients) { + matchingPatients[0] = matchingPatients[0].union(homozygousMask); + } } }, () -> missingVariants.add(variantSpec)); }); From 60b16babdc5d7f7c1268bf8bf0583559919fab13 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Fri, 22 Mar 2024 19:26:15 -0400 Subject: [PATCH 21/35] Fix bug where set based masks were not getting offest when merged --- .../data/genotype/VariableVariantMasks.java | 17 ++++++++-- .../etl/genotype/GenomicDatasetMerger.java | 32 +++++++++++++------ 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java index 6d831223..17bfeea2 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java @@ -154,13 +154,22 @@ public static BigInteger emptyBitmask(int length) { return emptyBitmask; } - public VariableVariantMasks append(VariableVariantMasks variantMasks) { + /*public VariableVariantMasks append(VariableVariantMasks variantMasks) { VariableVariantMasks appendedMasks = new VariableVariantMasks(); appendedMasks.homozygousMask = appendMask(this.homozygousMask, variantMasks.homozygousMask, this.length, variantMasks.length); appendedMasks.heterozygousMask = appendMask(this.heterozygousMask, variantMasks.heterozygousMask, this.length, variantMasks.length); appendedMasks.homozygousNoCallMask = appendMask(this.homozygousNoCallMask, variantMasks.homozygousNoCallMask, this.length, variantMasks.length); appendedMasks.heterozygousNoCallMask = appendMask(this.heterozygousNoCallMask, variantMasks.heterozygousNoCallMask, this.length, variantMasks.length); return appendedMasks; + }*/ + + public static VariableVariantMasks append(VariableVariantMasks masks1, int length1, VariableVariantMasks masks2, int length2) { + VariableVariantMasks appendedMasks = new VariableVariantMasks(); + appendedMasks.homozygousMask = appendMask(masks1.homozygousMask, masks2.homozygousMask, length1, length2); + appendedMasks.heterozygousMask = appendMask(masks1.heterozygousMask, masks2.heterozygousMask, length1, length2); + appendedMasks.homozygousNoCallMask = appendMask(masks1.homozygousNoCallMask, masks2.homozygousNoCallMask, length1, length2); + appendedMasks.heterozygousNoCallMask = appendMask(masks1.heterozygousNoCallMask, masks2.heterozygousNoCallMask, length1, length2); + return appendedMasks; } public static VariantMask appendMask(VariantMask variantMask1, VariantMask variantMask2, int length1, int length2) { @@ -238,7 +247,7 @@ private static VariantMask append(VariantMaskSparseImpl variantMask1, VariantMas for (Integer patientId : variantMask1.patientIndexes) { mask = mask.setBit(patientId + 2); } - // todo: explain this. it is not intuitive + // We start writing mask 2 where mask 1 ends. So the 0th index of mask 2 is now following the last bit of mask 1 for (Integer patientId : variantMask2.patientIndexes) { mask = mask.setBit(patientId + length1 + 2); } @@ -247,7 +256,9 @@ private static VariantMask append(VariantMaskSparseImpl variantMask1, VariantMas else { Set patientIndexSet = new HashSet<>(); patientIndexSet.addAll(variantMask1.patientIndexes); - patientIndexSet.addAll(variantMask2.patientIndexes); + // The indexes for mask 2 are shifted by the length of mask 1, corresponding to the corresponding patient id array + // for mask 2 being appended to those of mask 1 + patientIndexSet.addAll(variantMask2.patientIndexes.stream().map(i -> i + length1).collect(Collectors.toSet())); return new VariantMaskSparseImpl(patientIndexSet); } } diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index a7768b49..7cc8aa7e 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -10,7 +10,6 @@ import org.slf4j.LoggerFactory; import java.io.*; -import java.math.BigInteger; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentSkipListSet; @@ -24,6 +23,9 @@ public class GenomicDatasetMerger { private final VariantStore variantStore1; private final VariantStore variantStore2; + private final int variantStore1PatientCount; + private final int variantStore2PatientCount; + private final Map infoStores1; private final Map infoStores2; @@ -33,7 +35,9 @@ public class GenomicDatasetMerger { public GenomicDatasetMerger(VariantStore variantStore1, VariantStore variantStore2, Map infoStores1, Map infoStores2, String outputDirectory) { this.variantStore1 = variantStore1; + this.variantStore1PatientCount = variantStore1.getPatientIds().length; this.variantStore2 = variantStore2; + this.variantStore2PatientCount = variantStore2.getPatientIds().length; this.mergedVariantStore = new VariantStore(); this.infoStores1 = infoStores1; this.infoStores2 = infoStores2; @@ -238,21 +242,21 @@ public FileBackedJsonIndexStorage entry : masks2.entrySet()) { if (!mergedMasks.containsKey(entry.getKey())) { - mergedMasks.put(entry.getKey(), new VariableVariantMasks(variantStore1.getPatientIds().length).append(entry.getValue())); + VariableVariantMasks appendedMasks = VariableVariantMasks.append(new VariableVariantMasks(), variantStore1PatientCount, entry.getValue(), variantStore2PatientCount); + mergedMasks.put(entry.getKey(), appendedMasks); } } if (merged.keys().contains(key)) { log.warn("Merged already contains key: " + key); } else { - if (key == 61713) { - log.info("Loop 1 adding masks to key 61713: " + Joiner.on(",").join(mergedMasks.keySet())); - } merged.put(key, mergedMasks); } }); @@ -263,19 +267,27 @@ public FileBackedJsonIndexStorage masks2 = variantMaskStorage2.get(key); for (Map.Entry entry : masks2.entrySet()) { if (!mergedMasks.containsKey(entry.getKey())) { - mergedMasks.put(entry.getKey(), new VariableVariantMasks(variantStore1.getPatientIds().length).append(entry.getValue())); + VariableVariantMasks appendedMasks = VariableVariantMasks.append(new VariableVariantMasks(), variantStore1PatientCount, entry.getValue(), variantStore2PatientCount); + mergedMasks.put(entry.getKey(), appendedMasks); } } if (merged.keys().contains(key)) { log.warn("Second loop: merged already contains key: " + key); } else { - if (key == 61713) { - log.info("Loop 2 adding masks to key 61713: " + Joiner.on(",").join(mergedMasks.keySet())); - } merged.put(key, mergedMasks); } } }); + + merged.keys().stream().sorted().limit(3).forEach(key -> { + ConcurrentHashMap maskMap = merged.get(key); + maskMap.keySet().stream().sorted().limit(5).forEach(variantSpec -> { + VariableVariantMasks variableVariantMasks = maskMap.get(variantSpec); + Set patientsWithVariant = VariableVariantMasks.patientMaskToPatientIdSet(variableVariantMasks.heterozygousMask.union(variableVariantMasks.homozygousMask), Arrays.asList(mergedVariantStore.getPatientIds())); + log.info("Patients with variant [" + variantSpec + "]: " + Joiner.on(",").join(patientsWithVariant)); + }); + + }); return merged; } From da5a4b182c40a4b766f447a92729612ac59c1ae0 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Fri, 22 Mar 2024 19:39:59 -0400 Subject: [PATCH 22/35] Fix NPE in logging --- .../dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index 7cc8aa7e..e9d90cb4 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -283,7 +283,8 @@ public FileBackedJsonIndexStorage maskMap = merged.get(key); maskMap.keySet().stream().sorted().limit(5).forEach(variantSpec -> { VariableVariantMasks variableVariantMasks = maskMap.get(variantSpec); - Set patientsWithVariant = VariableVariantMasks.patientMaskToPatientIdSet(variableVariantMasks.heterozygousMask.union(variableVariantMasks.homozygousMask), Arrays.asList(mergedVariantStore.getPatientIds())); + VariantMask union = Optional.ofNullable(variableVariantMasks.heterozygousMask).orElse(VariantMask.emptyInstance()).union(Optional.ofNullable(variableVariantMasks.homozygousMask).orElse(VariantMask.emptyInstance())); + Set patientsWithVariant = VariableVariantMasks.patientMaskToPatientIdSet(union, Arrays.asList(mergedVariantStore.getPatientIds())); log.info("Patients with variant [" + variantSpec + "]: " + Joiner.on(",").join(patientsWithVariant)); }); From 1f167767fcc220522d7dbbbef9aa3b56d2370a93 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Fri, 22 Mar 2024 19:47:15 -0400 Subject: [PATCH 23/35] Fix NPE in logging --- .../dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java | 4 +++- .../hpds/etl/genotype/GenomicDatasetMergerRunner.java | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index e9d90cb4..15fd89eb 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -60,9 +60,11 @@ private void validate() { public VariantStore mergeVariantStore(Map>> mergedChromosomeMasks) { mergedVariantStore.setVariantMaskStorage(mergedChromosomeMasks); - mergedVariantStore.setPatientIds(mergePatientIds()); return mergedVariantStore; } + public void mergePatients() { + mergedVariantStore.setPatientIds(mergePatientIds()); + } /** * Since both sets of variant indexes reference a different variant spec list, we need to re-index diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java index 3559790e..ba3b2936 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java @@ -46,6 +46,7 @@ public static void main(String[] args) throws IOException, ClassNotFoundExceptio Map infoStores2 = loadInfoStores(genomicDirectory2); GenomicDatasetMerger genomicDatasetMerger = new GenomicDatasetMerger(VariantStore.readInstance(genomicDirectory1),VariantStore.readInstance(genomicDirectory2), infoStores1, infoStores2, outputDirectory); + genomicDatasetMerger.mergePatients(); Map>> mergedChromosomeMasks = genomicDatasetMerger.mergeChromosomeMasks(); VariantStore mergedVariantStore = genomicDatasetMerger.mergeVariantStore(mergedChromosomeMasks); From 0ac2ce0f3cdc83f2f9f4cfe76b0887ed321abf7a Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Fri, 22 Mar 2024 21:40:55 -0400 Subject: [PATCH 24/35] Add logging for specific use case --- .../data/genotype/VariableVariantMasks.java | 5 +++++ .../data/genotype/VariantMaskBitmaskImpl.java | 7 +++++++ .../etl/genotype/GenomicDatasetMerger.java | 18 ++++++++++++++++-- 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java index 17bfeea2..3133caf2 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java @@ -3,6 +3,8 @@ import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.Serializable; import java.math.BigInteger; @@ -11,6 +13,9 @@ import java.util.stream.Collectors; public class VariableVariantMasks implements Serializable { + + private static Logger log = LoggerFactory.getLogger(VariableVariantMasks.class); + private static final long serialVersionUID = 6225420483804601477L; private static final String oneone = "11"; private static final char one = '1'; diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java index 7e5fc81a..316f9e9c 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java @@ -81,4 +81,11 @@ public boolean equals(Object o) { public int hashCode() { return Objects.hash(bitmask); } + + @Override + public String toString() { + return "VariantMaskBitmaskImpl{" + + "bitmask=" + bitmask.toString() + + '}'; + } } diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index 15fd89eb..e9bc26e2 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -246,6 +246,20 @@ public FileBackedJsonIndexStorage patientIds1 = VariableVariantMasks.patientMaskToPatientIdSet(Optional.ofNullable(entry.getValue().heterozygousMask).orElse(VariantMask.emptyInstance()), Arrays.asList(variantStore1.getPatientIds())); + log.info("Patients 1: " + Joiner.on(",").join(patientIds1)); + + Set patientIds2 = VariableVariantMasks.patientMaskToPatientIdSet(Optional.ofNullable(variantMasks2.heterozygousMask).orElse(VariantMask.emptyInstance()), Arrays.asList(variantStore2.getPatientIds())); + log.info("Patients 2: " + Joiner.on(",").join(patientIds2)); + + Set mergedPatientIds = VariableVariantMasks.patientMaskToPatientIdSet(Optional.ofNullable(mergeResult.heterozygousMask).orElse(VariantMask.emptyInstance()), Arrays.asList(mergedVariantStore.getPatientIds())); + log.info("Merged patient ids: " + Joiner.on(",").join(mergedPatientIds)); + } mergedMasks.put(entry.getKey(), mergeResult); } // Any entry in the second set that is not in the merged set can be merged with an empty variant mask, @@ -285,8 +299,8 @@ public FileBackedJsonIndexStorage maskMap = merged.get(key); maskMap.keySet().stream().sorted().limit(5).forEach(variantSpec -> { VariableVariantMasks variableVariantMasks = maskMap.get(variantSpec); - VariantMask union = Optional.ofNullable(variableVariantMasks.heterozygousMask).orElse(VariantMask.emptyInstance()).union(Optional.ofNullable(variableVariantMasks.homozygousMask).orElse(VariantMask.emptyInstance())); - Set patientsWithVariant = VariableVariantMasks.patientMaskToPatientIdSet(union, Arrays.asList(mergedVariantStore.getPatientIds())); + VariantMask heteroMask = Optional.ofNullable(variableVariantMasks.heterozygousMask).orElse(VariantMask.emptyInstance()); + Set patientsWithVariant = VariableVariantMasks.patientMaskToPatientIdSet(heteroMask, Arrays.asList(mergedVariantStore.getPatientIds())); log.info("Patients with variant [" + variantSpec + "]: " + Joiner.on(",").join(patientsWithVariant)); }); From 2194943905216ade88202ea96561b83624427ff4 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Sun, 24 Mar 2024 22:40:49 -0400 Subject: [PATCH 25/35] Fix converting bitmask to patient id bug --- .../data/genotype/VariableVariantMasks.java | 8 +++---- .../genotype/VariableVariantMasksTest.java | 22 +++++++++++++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) create mode 100644 data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasksTest.java diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java index 3133caf2..19c6107e 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java @@ -271,10 +271,10 @@ private static VariantMask append(VariantMaskSparseImpl variantMask1, VariantMas public static Set patientMaskToPatientIdSet(VariantMask patientMask, List patientIds) { if (patientMask instanceof VariantMaskBitmaskImpl) { Set ids = new HashSet<>(); - String bitmaskString = ((VariantMaskBitmaskImpl) patientMask).getBitmask().toString(2); - for(int x = 2;x < bitmaskString.length()-2;x++) { - if('1'==bitmaskString.charAt(x)) { - String patientId = patientIds.get(x-2).trim(); + BigInteger bigInteger = ((VariantMaskBitmaskImpl) patientMask).getBitmask(); + for(int x = 0;x < bigInteger.bitLength()-4;x++) { + if(patientMask.testBit(x)) { + String patientId = patientIds.get(x).trim(); ids.add(Integer.parseInt(patientId)); } } diff --git a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasksTest.java b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasksTest.java new file mode 100644 index 00000000..6b0c0011 --- /dev/null +++ b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasksTest.java @@ -0,0 +1,22 @@ +package edu.harvard.hms.dbmi.avillach.hpds.data.genotype; + +import org.junit.jupiter.api.Test; + +import java.math.BigInteger; + +import static org.junit.jupiter.api.Assertions.*; + +class VariableVariantMasksTest { + + @Test + void appendMask() { + BigInteger bitmask1 = new BigInteger("141128266537977664750676276621047930360647583370951385898970922432171641921591683601425661017874251620531446414001258481539117879583763813143907296662202568713257464719895142475222938754588184582833582945368733841208297077207159012919514593576142503835920156756519924916168242089239884330492690734761017788145472426679068150988914755013607752960816306592042734726724967292489382922774340753414934719473579642086577795740766564256929075326123036793589965882761953873677505969126336030099232419417531157749845223962002657158936982730111297974575206081858525359109608528262092083167977823315667519770531233235502931655993522963196732342798609807510543882273123542784003837220002062547832711681922161218010724226003150184777238854967623299970205873235909960398972219305810619458430315886318451687126866584390511424196256376003247717440484524668930504599413091194844245946142655802347109092922312106655798606759916931458730259588797621736772069847398957694392864416289605234581063232645315000563794292589544477569850783743632035729362609737922579552116678396740867191143283379987783383546964540333832022555110531730782339840166704631278663393147496643156685747417468473443887224102514355150110201463928311558935840684668441340082266117306282514075566574248767903320202249460451453420775463375086967869642395460246215345716051454824012618762098616749770441010729935100690497726813080659572063353154599964806728241328892738034442472863741591198849392657539785115946383059858121434147389809099727185407701653069953116828987459279642700621389676861517273949473071786975738343769782464350475716221573322857987917669867491560145456280976454077958400126376423873081869441464609193216436634914151839730185428390533788824490041131307997850622739795832813220163889874229711053167483400593799762457890098932032534121358716659875839129363445294836150352418351095797845291006220927734160887236196809213101473795"); + VariantMask mask1 = new VariantMaskBitmaskImpl(bitmask1); + BigInteger bitmask2 = new BigInteger("30986998537042903080350519548408047200882037374471443267447678987916097706689940814664693588471382927433911322922595655683"); + VariantMask mask2 = new VariantMaskBitmaskImpl(bitmask2); + BigInteger expected = new BigInteger("364428449062309565827240887371711117987574656019257508470944944605332301145049360947562613021825062352285550919581217473685332023988542714833984833584283993391279871691386067405572061472644828810015780133411925154335939452908415258964249206427971383684062861395477403040282307619178797392994855011010859741065713642937722555386553472167057412349499978682577859360114784650082282703011601887907648540462571623667509259030122346885116531561265495702316349933315144109402391832141090339015610768872781857606433548246400953322494469603219947770538558126916689855755477351331412849701993441724311536424509298383050084506206211767917732037267909332303742403672014600658318302593423194411973298402659688783532220840192809967726287753517722498547930875524876017589557370530361981799944335286739781131150443454009586308444310074980028948209472950776329093810066130759876455221806685349198404676070598652587630369814761497643023695920944554335463560556407863991874650393356359742532476840044738380631950001000889884841365536456755318891308758448828432322546827241786950989285280451703830359048521243704376576346418597615317052525572950500220422542100551941365526524572447973251506942320379483402120949518538372004537133062107782710822000749441206410146350956094013529342546401955361956752745578930129151472060056501664176391306323733630924234020572479814310832686356211870629168524382442619274880885230138170761194502147197237097179031772802871639774718532255582374871483153552524216278948609829889298505566944209429410943844015411379772232511986448333683276938151761604643176820833900087800015896793760190738573447504231241253241247411492029282990960833205840126804844971397564707229362865231720213379844987578324282466174232636950775929742330163059024377711272017786328444659381735736216404131832568759328992585474109193095809959345744181054394252160618715444166335074662636279289524790651946664188322635565248044921668500523867959008028963420119948417112017996816651287475428019005246560397815063629879127215220996440067"); + VariantMask merged = VariableVariantMasks.appendMask(mask1, mask2, 0, 0); + + assertEquals(expected, ((VariantMaskBitmaskImpl)merged).getBitmask()); + } +} \ No newline at end of file From b0699f182580a38cecdd2d063d61b40995db0d85 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Mon, 25 Mar 2024 09:42:21 -0400 Subject: [PATCH 26/35] Cleanup logging --- .../etl/genotype/GenomicDatasetMerger.java | 40 +++++-------------- 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index e9bc26e2..f78e5450 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -295,37 +295,17 @@ public FileBackedJsonIndexStorage { - ConcurrentHashMap maskMap = merged.get(key); - maskMap.keySet().stream().sorted().limit(5).forEach(variantSpec -> { - VariableVariantMasks variableVariantMasks = maskMap.get(variantSpec); - VariantMask heteroMask = Optional.ofNullable(variableVariantMasks.heterozygousMask).orElse(VariantMask.emptyInstance()); - Set patientsWithVariant = VariableVariantMasks.patientMaskToPatientIdSet(heteroMask, Arrays.asList(mergedVariantStore.getPatientIds())); - log.info("Patients with variant [" + variantSpec + "]: " + Joiner.on(",").join(patientsWithVariant)); + if (log.isDebugEnabled()) { + merged.keys().stream().sorted().limit(3).forEach(key -> { + ConcurrentHashMap maskMap = merged.get(key); + maskMap.keySet().stream().sorted().limit(5).forEach(variantSpec -> { + VariableVariantMasks variableVariantMasks = maskMap.get(variantSpec); + VariantMask heteroMask = Optional.ofNullable(variableVariantMasks.heterozygousMask).orElse(VariantMask.emptyInstance()); + Set patientsWithVariant = VariableVariantMasks.patientMaskToPatientIdSet(heteroMask, Arrays.asList(mergedVariantStore.getPatientIds())); + log.debug("Patients with variant [" + variantSpec + "]: " + Joiner.on(",").join(patientsWithVariant)); + }); }); - - }); + } return merged; } - - /** - * Appends one mask to another. This assumes the masks are both padded with '11' on each end - * to prevent overflow issues. - */ - /*public BigInteger appendMask(BigInteger mask1, BigInteger mask2) { - if (mask1 == null && mask2 == null) { - return null; - } - if (mask1 == null) { - mask1 = variantStore1.emptyBitmask(); - } - if (mask2 == null) { - mask2 = variantStore2.emptyBitmask(); - } - String binaryMask1 = mask1.toString(2); - String binaryMask2 = mask2.toString(2); - String appendedString = binaryMask1.substring(0, binaryMask1.length() - 2) + - binaryMask2.substring(2); - return new BigInteger(appendedString, 2); - }*/ } From a9afcd466d8ace469034dfe28b55b7d0e72dbc1c Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Mon, 25 Mar 2024 13:26:48 -0400 Subject: [PATCH 27/35] ALS-5981: Fix off by one error in sparse mask creation --- .../avillach/hpds/data/genotype/VariableVariantMasks.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java index 19c6107e..bd096091 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java @@ -94,8 +94,9 @@ private VariantMask variantMaskFromRawString(String maskStringRaw) { variantMask = new VariantMaskBitmaskImpl(bitmask); } else { Set patientIndexes = new HashSet<>(); - for(int i = 2; i < bitmask.bitLength() - 2; i++) { - if (bitmask.testBit(i)) { + for(int i = 0; i < bitmask.bitLength() - 4; i++) { + // i + 2 because the mask is padded with 2 bits on each end + if (bitmask.testBit(i + 2)) { patientIndexes.add(i); } } From 643c26f069b3d85136524bb1a523d7b5cb386477 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Mon, 25 Mar 2024 13:41:05 -0400 Subject: [PATCH 28/35] Cleanup unused fields --- .../hpds/data/genotype/VariableVariantMasks.java | 14 ++------------ .../hpds/etl/genotype/GenomicDatasetMerger.java | 2 +- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java index bd096091..563df590 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java @@ -1,6 +1,5 @@ package edu.harvard.hms.dbmi.avillach.hpds.data.genotype; -import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; import org.slf4j.Logger; @@ -108,10 +107,6 @@ private VariantMask variantMaskFromRawString(String maskStringRaw) { public VariableVariantMasks() { } - public VariableVariantMasks(int length) { - this.length = length; - } - @JsonProperty("ho") @JsonInclude(JsonInclude.Include.NON_NULL) public VariantMask homozygousMask; @@ -128,22 +123,17 @@ public VariableVariantMasks(int length) { @JsonInclude(JsonInclude.Include.NON_NULL) public VariantMask heterozygousNoCallMask; - @JsonIgnore - @JsonProperty("l") - public int length; - - @Override public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; VariableVariantMasks that = (VariableVariantMasks) o; - return length == that.length && Objects.equals(homozygousMask, that.homozygousMask) && Objects.equals(heterozygousMask, that.heterozygousMask) && Objects.equals(homozygousNoCallMask, that.homozygousNoCallMask) && Objects.equals(heterozygousNoCallMask, that.heterozygousNoCallMask); + return Objects.equals(homozygousMask, that.homozygousMask) && Objects.equals(heterozygousMask, that.heterozygousMask) && Objects.equals(homozygousNoCallMask, that.homozygousNoCallMask) && Objects.equals(heterozygousNoCallMask, that.heterozygousNoCallMask); } @Override public int hashCode() { - return Objects.hash(homozygousMask, heterozygousMask, homozygousNoCallMask, heterozygousNoCallMask, length); + return Objects.hash(homozygousMask, heterozygousMask, homozygousNoCallMask, heterozygousNoCallMask); } public static BigInteger emptyBitmask(int length) { diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index f78e5450..90549194 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -242,7 +242,7 @@ public FileBackedJsonIndexStorage Date: Mon, 25 Mar 2024 15:12:12 -0400 Subject: [PATCH 29/35] Remove logging --- .../dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index 90549194..5ae18bdf 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -295,7 +295,7 @@ public FileBackedJsonIndexStorage { ConcurrentHashMap maskMap = merged.get(key); maskMap.keySet().stream().sorted().limit(5).forEach(variantSpec -> { @@ -305,7 +305,7 @@ public FileBackedJsonIndexStorage Date: Wed, 27 Mar 2024 10:37:21 -0400 Subject: [PATCH 30/35] Add tests, fix bug found adding tests --- .../data/genotype/VariableVariantMasks.java | 56 +++++--------- .../hpds/data/genotype/VariantMask.java | 3 + .../data/genotype/VariantMaskBitmaskImpl.java | 15 ++++ .../data/genotype/VariantMaskSparseImpl.java | 10 +++ .../genotype/VariableVariantMasksTest.java | 77 ++++++++++++++++++- .../etl/genotype/GenomicDatasetMerger.java | 6 +- .../hpds/etl/genotype/NewVCFLoader.java | 12 +-- 7 files changed, 132 insertions(+), 47 deletions(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java index 563df590..6afb62f1 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java @@ -150,15 +150,6 @@ public static BigInteger emptyBitmask(int length) { return emptyBitmask; } - /*public VariableVariantMasks append(VariableVariantMasks variantMasks) { - VariableVariantMasks appendedMasks = new VariableVariantMasks(); - appendedMasks.homozygousMask = appendMask(this.homozygousMask, variantMasks.homozygousMask, this.length, variantMasks.length); - appendedMasks.heterozygousMask = appendMask(this.heterozygousMask, variantMasks.heterozygousMask, this.length, variantMasks.length); - appendedMasks.homozygousNoCallMask = appendMask(this.homozygousNoCallMask, variantMasks.homozygousNoCallMask, this.length, variantMasks.length); - appendedMasks.heterozygousNoCallMask = appendMask(this.heterozygousNoCallMask, variantMasks.heterozygousNoCallMask, this.length, variantMasks.length); - return appendedMasks; - }*/ - public static VariableVariantMasks append(VariableVariantMasks masks1, int length1, VariableVariantMasks masks2, int length2) { VariableVariantMasks appendedMasks = new VariableVariantMasks(); appendedMasks.homozygousMask = appendMask(masks1.homozygousMask, masks2.homozygousMask, length1, length2); @@ -204,11 +195,16 @@ else if (variantMask1 instanceof VariantMaskBitmaskImpl) { private static VariantMask append(VariantMaskSparseImpl variantMask1, VariantMaskBitmaskImpl variantMask2, int length1, int length2) { BigInteger mask1 = emptyBitmask(length1); - for (Integer patientId : variantMask1.patientIndexes) { - mask1 = mask1.setBit(patientId); + for (Integer patientIndex : variantMask1.patientIndexes) { + mask1 = mask1.setBit(patientIndex + 2); } String binaryMask1 = mask1.toString(2); + String binaryMask2 = variantMask2.bitmask.toString(2); + if (binaryMask2.length() - 4 != length2) { + throw new IllegalArgumentException("Bitmask does not match length (" + length2 + "): " + variantMask2.bitmask); + } + String appendedString = binaryMask2.substring(0, binaryMask2.length() - 2) + binaryMask1.substring(2); return new VariantMaskBitmaskImpl(new BigInteger(appendedString, 2)); @@ -216,10 +212,13 @@ private static VariantMask append(VariantMaskSparseImpl variantMask1, VariantMas private static VariantMask append(VariantMaskBitmaskImpl variantMask1, VariantMaskSparseImpl variantMask2, int length1, int length2) { String binaryMask1 = variantMask1.bitmask.toString(2); + if (binaryMask1.length() - 4 != length1) { + throw new IllegalArgumentException("Bitmask does not match length (" + length1 + "): " + variantMask1.bitmask); + } BigInteger mask2 = emptyBitmask(length2); for (Integer patientId : variantMask2.patientIndexes) { - mask2 = mask2.setBit(patientId); + mask2 = mask2.setBit(patientId + 2); } String binaryMask2 = mask2.toString(2); @@ -232,9 +231,17 @@ private static VariantMask append(VariantMaskBitmaskImpl variantMask1, VariantMa String binaryMask1 = variantMask1.bitmask.toString(2); String binaryMask2 = variantMask2.bitmask.toString(2); + if (binaryMask1.length() - 4 != length1) { + throw new IllegalArgumentException("Bitmask does not match length (" + length1 + "): " + variantMask1.bitmask); + } + if (binaryMask2.length() - 4 != length2) { + throw new IllegalArgumentException("Bitmask does not match length (" + length2 + "): " + variantMask2.bitmask); + } + String appendedString = binaryMask2.substring(0, binaryMask2.length() - 2) + binaryMask1.substring(2); - return new VariantMaskBitmaskImpl(new BigInteger(appendedString, 2)); + BigInteger bitmask = new BigInteger(appendedString, 2); + return new VariantMaskBitmaskImpl(bitmask); } private static VariantMask append(VariantMaskSparseImpl variantMask1, VariantMaskSparseImpl variantMask2, int length1, int length2) { @@ -259,27 +266,4 @@ private static VariantMask append(VariantMaskSparseImpl variantMask1, VariantMas } } - public static Set patientMaskToPatientIdSet(VariantMask patientMask, List patientIds) { - if (patientMask instanceof VariantMaskBitmaskImpl) { - Set ids = new HashSet<>(); - BigInteger bigInteger = ((VariantMaskBitmaskImpl) patientMask).getBitmask(); - for(int x = 0;x < bigInteger.bitLength()-4;x++) { - if(patientMask.testBit(x)) { - String patientId = patientIds.get(x).trim(); - ids.add(Integer.parseInt(patientId)); - } - } - return ids; - } else if (patientMask instanceof VariantMaskSparseImpl) { - return ((VariantMaskSparseImpl) patientMask).getPatientIndexes().stream() - .map(patientIds::get) - .map(String::trim) - .map(Integer::parseInt) - .collect(Collectors.toSet()); - } - throw new IllegalArgumentException("Unknown VariantMask implementation"); - } - - - } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java index c170dce6..86d06472 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java @@ -4,6 +4,7 @@ import com.fasterxml.jackson.annotation.JsonTypeInfo; import java.math.BigInteger; +import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -30,4 +31,6 @@ public interface VariantMask { static VariantMask emptyInstance() { return new VariantMaskSparseImpl(Set.of()); } + + Set patientMaskToPatientIdSet(List patientIds); } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java index 316f9e9c..3df430bc 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java @@ -6,7 +6,10 @@ import com.fasterxml.jackson.databind.ser.std.ToStringSerializer; import java.math.BigInteger; +import java.util.HashSet; +import java.util.List; import java.util.Objects; +import java.util.Set; public class VariantMaskBitmaskImpl implements VariantMask { @@ -60,6 +63,18 @@ public int bitCount() { return bitmask.bitCount(); } + @Override + public Set patientMaskToPatientIdSet(List patientIds) { + Set ids = new HashSet<>(); + for(int x = 0;x < bitmask.bitLength()-4;x++) { + if(testBit(x + 2)) { + String patientId = patientIds.get(x).trim(); + ids.add(Integer.parseInt(patientId)); + } + } + return ids; + } + private VariantMask union(VariantMaskBitmaskImpl variantMaskBitmask) { return new VariantMaskBitmaskImpl(variantMaskBitmask.bitmask.or(this.bitmask)); } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java index 0b72c63a..fb5e2a6a 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java @@ -4,6 +4,7 @@ import java.math.BigInteger; import java.util.HashSet; +import java.util.List; import java.util.Objects; import java.util.Set; import java.util.stream.Collectors; @@ -49,6 +50,15 @@ public int bitCount() { return patientIndexes.size(); } + @Override + public Set patientMaskToPatientIdSet(List patientIds) { + return patientIndexes.stream() + .map(patientIds::get) + .map(String::trim) + .map(Integer::parseInt) + .collect(Collectors.toSet()); + } + private VariantMask union(VariantMaskSparseImpl variantMask) { HashSet union = new HashSet<>(variantMask.patientIndexes); union.addAll(this.patientIndexes); diff --git a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasksTest.java b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasksTest.java index 6b0c0011..6d00f538 100644 --- a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasksTest.java +++ b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasksTest.java @@ -3,20 +3,93 @@ import org.junit.jupiter.api.Test; import java.math.BigInteger; +import java.util.Set; import static org.junit.jupiter.api.Assertions.*; class VariableVariantMasksTest { @Test - void appendMask() { + void appendMask_bitmaskImpl() { BigInteger bitmask1 = new BigInteger("141128266537977664750676276621047930360647583370951385898970922432171641921591683601425661017874251620531446414001258481539117879583763813143907296662202568713257464719895142475222938754588184582833582945368733841208297077207159012919514593576142503835920156756519924916168242089239884330492690734761017788145472426679068150988914755013607752960816306592042734726724967292489382922774340753414934719473579642086577795740766564256929075326123036793589965882761953873677505969126336030099232419417531157749845223962002657158936982730111297974575206081858525359109608528262092083167977823315667519770531233235502931655993522963196732342798609807510543882273123542784003837220002062547832711681922161218010724226003150184777238854967623299970205873235909960398972219305810619458430315886318451687126866584390511424196256376003247717440484524668930504599413091194844245946142655802347109092922312106655798606759916931458730259588797621736772069847398957694392864416289605234581063232645315000563794292589544477569850783743632035729362609737922579552116678396740867191143283379987783383546964540333832022555110531730782339840166704631278663393147496643156685747417468473443887224102514355150110201463928311558935840684668441340082266117306282514075566574248767903320202249460451453420775463375086967869642395460246215345716051454824012618762098616749770441010729935100690497726813080659572063353154599964806728241328892738034442472863741591198849392657539785115946383059858121434147389809099727185407701653069953116828987459279642700621389676861517273949473071786975738343769782464350475716221573322857987917669867491560145456280976454077958400126376423873081869441464609193216436634914151839730185428390533788824490041131307997850622739795832813220163889874229711053167483400593799762457890098932032534121358716659875839129363445294836150352418351095797845291006220927734160887236196809213101473795"); VariantMask mask1 = new VariantMaskBitmaskImpl(bitmask1); BigInteger bitmask2 = new BigInteger("30986998537042903080350519548408047200882037374471443267447678987916097706689940814664693588471382927433911322922595655683"); VariantMask mask2 = new VariantMaskBitmaskImpl(bitmask2); BigInteger expected = new BigInteger("364428449062309565827240887371711117987574656019257508470944944605332301145049360947562613021825062352285550919581217473685332023988542714833984833584283993391279871691386067405572061472644828810015780133411925154335939452908415258964249206427971383684062861395477403040282307619178797392994855011010859741065713642937722555386553472167057412349499978682577859360114784650082282703011601887907648540462571623667509259030122346885116531561265495702316349933315144109402391832141090339015610768872781857606433548246400953322494469603219947770538558126916689855755477351331412849701993441724311536424509298383050084506206211767917732037267909332303742403672014600658318302593423194411973298402659688783532220840192809967726287753517722498547930875524876017589557370530361981799944335286739781131150443454009586308444310074980028948209472950776329093810066130759876455221806685349198404676070598652587630369814761497643023695920944554335463560556407863991874650393356359742532476840044738380631950001000889884841365536456755318891308758448828432322546827241786950989285280451703830359048521243704376576346418597615317052525572950500220422542100551941365526524572447973251506942320379483402120949518538372004537133062107782710822000749441206410146350956094013529342546401955361956752745578930129151472060056501664176391306323733630924234020572479814310832686356211870629168524382442619274880885230138170761194502147197237097179031772802871639774718532255582374871483153552524216278948609829889298505566944209429410943844015411379772232511986448333683276938151761604643176820833900087800015896793760190738573447504231241253241247411492029282990960833205840126804844971397564707229362865231720213379844987578324282466174232636950775929742330163059024377711272017786328444659381735736216404131832568759328992585474109193095809959345744181054394252160618715444166335074662636279289524790651946664188322635565248044921668500523867959008028963420119948417112017996816651287475428019005246560397815063629879127215220996440067"); - VariantMask merged = VariableVariantMasks.appendMask(mask1, mask2, 0, 0); + // length should equal the length of the binary representation - 4. Bitmasks are padded with 11 on each end + VariantMask merged = VariableVariantMasks.appendMask(mask1, mask2, 6282, 400); assertEquals(expected, ((VariantMaskBitmaskImpl)merged).getBitmask()); } + @Test + void appendMask_bitmaskImpl_invalidLength() { + BigInteger bitmask1 = new BigInteger("141128266537977664750676276621047930360647583370951385898970922432171641921591683601425661017874251620531446414001258481539117879583763813143907296662202568713257464719895142475222938754588184582833582945368733841208297077207159012919514593576142503835920156756519924916168242089239884330492690734761017788145472426679068150988914755013607752960816306592042734726724967292489382922774340753414934719473579642086577795740766564256929075326123036793589965882761953873677505969126336030099232419417531157749845223962002657158936982730111297974575206081858525359109608528262092083167977823315667519770531233235502931655993522963196732342798609807510543882273123542784003837220002062547832711681922161218010724226003150184777238854967623299970205873235909960398972219305810619458430315886318451687126866584390511424196256376003247717440484524668930504599413091194844245946142655802347109092922312106655798606759916931458730259588797621736772069847398957694392864416289605234581063232645315000563794292589544477569850783743632035729362609737922579552116678396740867191143283379987783383546964540333832022555110531730782339840166704631278663393147496643156685747417468473443887224102514355150110201463928311558935840684668441340082266117306282514075566574248767903320202249460451453420775463375086967869642395460246215345716051454824012618762098616749770441010729935100690497726813080659572063353154599964806728241328892738034442472863741591198849392657539785115946383059858121434147389809099727185407701653069953116828987459279642700621389676861517273949473071786975738343769782464350475716221573322857987917669867491560145456280976454077958400126376423873081869441464609193216436634914151839730185428390533788824490041131307997850622739795832813220163889874229711053167483400593799762457890098932032534121358716659875839129363445294836150352418351095797845291006220927734160887236196809213101473795"); + VariantMask mask1 = new VariantMaskBitmaskImpl(bitmask1); + BigInteger bitmask2 = new BigInteger("30986998537042903080350519548408047200882037374471443267447678987916097706689940814664693588471382927433911322922595655683"); + VariantMask mask2 = new VariantMaskBitmaskImpl(bitmask2); + BigInteger expected = new BigInteger("364428449062309565827240887371711117987574656019257508470944944605332301145049360947562613021825062352285550919581217473685332023988542714833984833584283993391279871691386067405572061472644828810015780133411925154335939452908415258964249206427971383684062861395477403040282307619178797392994855011010859741065713642937722555386553472167057412349499978682577859360114784650082282703011601887907648540462571623667509259030122346885116531561265495702316349933315144109402391832141090339015610768872781857606433548246400953322494469603219947770538558126916689855755477351331412849701993441724311536424509298383050084506206211767917732037267909332303742403672014600658318302593423194411973298402659688783532220840192809967726287753517722498547930875524876017589557370530361981799944335286739781131150443454009586308444310074980028948209472950776329093810066130759876455221806685349198404676070598652587630369814761497643023695920944554335463560556407863991874650393356359742532476840044738380631950001000889884841365536456755318891308758448828432322546827241786950989285280451703830359048521243704376576346418597615317052525572950500220422542100551941365526524572447973251506942320379483402120949518538372004537133062107782710822000749441206410146350956094013529342546401955361956752745578930129151472060056501664176391306323733630924234020572479814310832686356211870629168524382442619274880885230138170761194502147197237097179031772802871639774718532255582374871483153552524216278948609829889298505566944209429410943844015411379772232511986448333683276938151761604643176820833900087800015896793760190738573447504231241253241247411492029282990960833205840126804844971397564707229362865231720213379844987578324282466174232636950775929742330163059024377711272017786328444659381735736216404131832568759328992585474109193095809959345744181054394252160618715444166335074662636279289524790651946664188322635565248044921668500523867959008028963420119948417112017996816651287475428019005246560397815063629879127215220996440067"); + // length should equal the length of the binary representation - 4. Bitmasks are padded with 11 on each end + VariantMask merged = VariableVariantMasks.appendMask(mask1, mask2, 6282, 400); + + assertThrows( + IllegalArgumentException.class, + () -> VariableVariantMasks.appendMask(mask1, mask2, 6283, 400) + ); + assertThrows( + IllegalArgumentException.class, + () -> VariableVariantMasks.appendMask(mask1, mask2, 6282, 399) + ); + } + + @Test + void appendMask_sparseImpl() { + VariantMask mask1 = new VariantMaskSparseImpl(Set.of(1, 10, 99)); + VariantMask mask2 = new VariantMaskSparseImpl(Set.of(1, 99)); + VariantMask merged = VariableVariantMasks.appendMask(mask1, mask2, 100, 100); + + VariantMask expected = new VariantMaskSparseImpl(Set.of(1, 10, 99, 101, 199)); + assertEquals(expected, merged); + } + @Test + void appendMask_sparseImplOneEmpty() { + VariantMask mask1 = new VariantMaskSparseImpl(Set.of()); + VariantMask mask2 = new VariantMaskSparseImpl(Set.of(1, 99)); + VariantMask merged = VariableVariantMasks.appendMask(mask1, mask2, 100, 100); + + VariantMask expected = new VariantMaskSparseImpl(Set.of(101, 199)); + assertEquals(expected, merged); + + mask1 = new VariantMaskSparseImpl(Set.of(1, 10, 99)); + mask2 = new VariantMaskSparseImpl(Set.of()); + merged = VariableVariantMasks.appendMask(mask1, mask2, 100, 100); + + expected = new VariantMaskSparseImpl(Set.of(1, 10, 99)); + assertEquals(expected, merged); + } + + @Test + void appendMask_sparseBitmask() { + VariantMask mask1 = new VariantMaskSparseImpl(Set.of(1, 9)); + BigInteger bitmask1 = new BigInteger("11100000010011", 2); + VariantMask mask2 = new VariantMaskBitmaskImpl(bitmask1); + VariantMask merged = VariableVariantMasks.appendMask(mask1, mask2, 10, 10); + + BigInteger bitmask = new BigInteger("111000000100" + "100000001011", 2); + VariantMask expected = new VariantMaskBitmaskImpl(bitmask); + System.out.println(((VariantMaskBitmaskImpl) merged).getBitmask().toString(2)); + assertEquals(expected, merged); + } + + @Test + void appendMask_bitmaskSparse() { + BigInteger bitmask1 = new BigInteger("11100000010011", 2); + VariantMask mask1 = new VariantMaskBitmaskImpl(bitmask1); + VariantMask mask2 = new VariantMaskSparseImpl(Set.of(1, 9)); + VariantMask merged = VariableVariantMasks.appendMask(mask1, mask2, 10, 10); + + BigInteger bitmask = new BigInteger("111000000010" + "100000010011", 2); + VariantMask expected = new VariantMaskBitmaskImpl(bitmask); + System.out.println(((VariantMaskBitmaskImpl) merged).getBitmask().toString(2)); + assertEquals(expected, merged); + } } \ No newline at end of file diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index 5ae18bdf..00577e90 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -295,17 +295,17 @@ public FileBackedJsonIndexStorage { ConcurrentHashMap maskMap = merged.get(key); maskMap.keySet().stream().sorted().limit(5).forEach(variantSpec -> { VariableVariantMasks variableVariantMasks = maskMap.get(variantSpec); VariantMask heteroMask = Optional.ofNullable(variableVariantMasks.heterozygousMask).orElse(VariantMask.emptyInstance()); Set patientsWithVariant = VariableVariantMasks.patientMaskToPatientIdSet(heteroMask, Arrays.asList(mergedVariantStore.getPatientIds())); - log.debug("Patients with variant [" + variantSpec + "]: " + Joiner.on(",").join(patientsWithVariant)); + log.trace("Patients with variant [" + variantSpec + "]: " + Joiner.on(",").join(patientsWithVariant)); }); }); - }*/ + } return merged; } } diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java index 996c4078..1aed473e 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java @@ -233,22 +233,22 @@ private static void loadVCFs(File indexFile) throws IOException { } private static String sampleIdsForMask(String[] sampleIds, VariantMask variantMask) { - String idList = ""; + StringBuilder idList = new StringBuilder(); if (variantMask != null) { if (variantMask instanceof VariantMaskBitmaskImpl) { BigInteger mask = ((VariantMaskBitmaskImpl) variantMask).getBitmask(); - for (int x = 2; x < mask.bitLength() - 2; x++) { - if (mask.testBit(mask.bitLength() - 1 - x)) { - idList += sampleIds[x - 2] + ","; + for (int x = 0; x < mask.bitLength() - 4; x++) { + if (variantMask.testBit(x)) { + idList.append(sampleIds[x]).append(","); } } } else if (variantMask instanceof VariantMaskSparseImpl) { for (Integer patientIndex : ((VariantMaskSparseImpl) variantMask).getPatientIndexes()) { - idList += sampleIds[patientIndex] + ","; + idList.append(sampleIds[patientIndex]).append(","); } } } - return idList; + return idList.toString(); } private static void flipChunk(String lastContigProcessed, int lastChunkProcessed, int currentChunk, From acec7bca374cdc207203cc1f8c6502233426c47f Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 27 Mar 2024 11:02:34 -0400 Subject: [PATCH 31/35] Fix tests, compilation issue --- .../hpds/etl/genotype/GenomicDatasetMerger.java | 10 +++++----- .../hpds/processing/GenomicProcessorNodeImpl.java | 2 +- .../processing/GenomicProcessorParentImpl.java | 2 +- .../GenomicProcessorPatientMergingParentImpl.java | 2 +- ...nomicProcessorPatientMergingParentImplTest.java | 6 +++--- .../processing/PatientVariantJoinHandlerTest.java | 14 +++++++------- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index 00577e90..760516cb 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -93,7 +93,7 @@ public Map mergeVariantIndexes() throws String[] variantIndex2 = variantStore2.getVariantSpecIndex(); Map variantSpecToIndexMap = new HashMap<>(); - LinkedList variantSpecList = new LinkedList<>(Arrays.asList(variantIndex1)); + LinkedList variantSpecList = new LinkedList<>(List.of(variantIndex1)); for (int i = 0; i < variantIndex1.length; i++) { variantSpecToIndexMap.put(variantIndex1[i], i); } @@ -251,13 +251,13 @@ public FileBackedJsonIndexStorage patientIds1 = VariableVariantMasks.patientMaskToPatientIdSet(Optional.ofNullable(entry.getValue().heterozygousMask).orElse(VariantMask.emptyInstance()), Arrays.asList(variantStore1.getPatientIds())); + Set patientIds1 = Optional.ofNullable(entry.getValue().heterozygousMask).orElse(VariantMask.emptyInstance()).patientMaskToPatientIdSet(List.of(variantStore1.getPatientIds())); log.info("Patients 1: " + Joiner.on(",").join(patientIds1)); - Set patientIds2 = VariableVariantMasks.patientMaskToPatientIdSet(Optional.ofNullable(variantMasks2.heterozygousMask).orElse(VariantMask.emptyInstance()), Arrays.asList(variantStore2.getPatientIds())); + Set patientIds2 = Optional.ofNullable(variantMasks2.heterozygousMask).orElse(VariantMask.emptyInstance()).patientMaskToPatientIdSet(List.of(variantStore2.getPatientIds())); log.info("Patients 2: " + Joiner.on(",").join(patientIds2)); - Set mergedPatientIds = VariableVariantMasks.patientMaskToPatientIdSet(Optional.ofNullable(mergeResult.heterozygousMask).orElse(VariantMask.emptyInstance()), Arrays.asList(mergedVariantStore.getPatientIds())); + Set mergedPatientIds = Optional.ofNullable(mergeResult.heterozygousMask).orElse(VariantMask.emptyInstance()).patientMaskToPatientIdSet(List.of(mergedVariantStore.getPatientIds())); log.info("Merged patient ids: " + Joiner.on(",").join(mergedPatientIds)); } mergedMasks.put(entry.getKey(), mergeResult); @@ -301,7 +301,7 @@ public FileBackedJsonIndexStorage { VariableVariantMasks variableVariantMasks = maskMap.get(variantSpec); VariantMask heteroMask = Optional.ofNullable(variableVariantMasks.heterozygousMask).orElse(VariantMask.emptyInstance()); - Set patientsWithVariant = VariableVariantMasks.patientMaskToPatientIdSet(heteroMask, Arrays.asList(mergedVariantStore.getPatientIds())); + Set patientsWithVariant = heteroMask.patientMaskToPatientIdSet(List.of(mergedVariantStore.getPatientIds())); log.trace("Patients with variant [" + variantSpec + "]: " + Joiner.on(",").join(patientsWithVariant)); }); }); diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNodeImpl.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNodeImpl.java index 975e993b..a030fde0 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNodeImpl.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNodeImpl.java @@ -139,7 +139,7 @@ public VariantMask runGetPatientMask(DistributableQuery distributableQuery) { @Override public Set patientMaskToPatientIdSet(VariantMask patientMask) { - return VariableVariantMasks.patientMaskToPatientIdSet(patientMask, getPatientIds()); + return patientMask.patientMaskToPatientIdSet(getPatientIds()); } private List getVariantsMatchingFilters(Query.VariantInfoFilter filter) { diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImpl.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImpl.java index 28226df7..ea9ea853 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImpl.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImpl.java @@ -56,7 +56,7 @@ public Mono getPatientMask(DistributableQuery distributableQuery) { @Override public Set patientMaskToPatientIdSet(VariantMask patientMask) { - return VariableVariantMasks.patientMaskToPatientIdSet(patientMask, getPatientIds()); + return patientMask.patientMaskToPatientIdSet(getPatientIds()); } @Override diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImpl.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImpl.java index c13d7ef8..253ee874 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImpl.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImpl.java @@ -84,7 +84,7 @@ public SizedVariantMask appendMask(SizedVariantMask mask1, SizedVariantMask mask @Override public Set patientMaskToPatientIdSet(VariantMask patientMask) { - return VariableVariantMasks.patientMaskToPatientIdSet(patientMask, getPatientIds()); + return patientMask.patientMaskToPatientIdSet(getPatientIds()); } @Override diff --git a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImplTest.java b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImplTest.java index 7a64d718..fdd747cb 100644 --- a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImplTest.java +++ b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImplTest.java @@ -43,7 +43,7 @@ public void getPatientMask_validResponses_returnMerged() { when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("110001100011", 2)))); when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("11000111", 2)))); VariantMask patientMask = patientMergingParent.getPatientMask(distributableQuery).block(); - VariantMask expectedPatientMask = new VariantMaskBitmaskImpl(new BigInteger("11011000011000000111", 2)); + VariantMask expectedPatientMask = new VariantMaskBitmaskImpl(new BigInteger("11000100011000011011", 2)); assertEquals(expectedPatientMask, patientMask); } @@ -54,7 +54,7 @@ public void getPatientMask_noPatientResponses_returnMerged() { when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("110000000011", 2)))); when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("11000011", 2)))); VariantMask patientMask = patientMergingParent.getPatientMask(distributableQuery).block(); - VariantMask expectedPatientMask = new VariantMaskBitmaskImpl(new BigInteger("11011000000000000011", 2)); + VariantMask expectedPatientMask = new VariantMaskBitmaskImpl(new BigInteger("11000000000000011011", 2)); assertEquals(expectedPatientMask, patientMask); } @@ -65,7 +65,7 @@ public void getPatientMask_emptyResponses_returnMerged() { when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("1111", 2)))); when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("11000111", 2)))); VariantMask patientMask = patientMergingParent.getPatientMask(distributableQuery).block(); - VariantMask expectedPatientMask = new VariantMaskBitmaskImpl(new BigInteger("110110000111", 2)); + VariantMask expectedPatientMask = new VariantMaskBitmaskImpl(new BigInteger("110001011011", 2)); assertEquals(expectedPatientMask, patientMask); } diff --git a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandlerTest.java b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandlerTest.java index 7b1b745c..1ab10a7b 100644 --- a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandlerTest.java +++ b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandlerTest.java @@ -29,7 +29,7 @@ public class PatientVariantJoinHandlerTest { public static final String[] PATIENT_IDS = {"101", "102", "103", "104", "105", "106", "107", "108"}; public static final Set PATIENT_IDS_INTEGERS = Set.of(PATIENT_IDS).stream().map(Integer::parseInt).collect(Collectors.toSet()); - public static final String[] VARIANT_INDEX = {"16,61642243,A,T", "16,61642252,A,G", "16,61642256,C,T", "16,61642257,G,A", "16,61642258,G,A", "16,61642259,G,A", "16,61642260,G,A", "16,61642261,G,A"}; + public static final String[] VARIANT_INDEX = {"16,61642243,A,T,ABC,consequence1", "16,61642252,A,G,ABC,consequence1", "16,61642256,C,T,ABC,consequence2", "16,61642257,G,A,ABC,consequence3", "16,61642258,G,A,ABC,consequence3", "16,61642259,G,A,ABC,consequence1", "16,61642260,G,A,ABC,consequence1", "16,61642261,G,A,ABC,consequence1"}; public PatientVariantJoinHandlerTest(@Mock VariantService variantService) { this.variantService = variantService; @@ -54,7 +54,7 @@ public void getPatientIdsForIntersectionOfVariantSets_allPatientsMatchOneVariant when(variantService.getMasks(eq(VARIANT_INDEX[2]), any())).thenReturn(Optional.of(emptyVariantMasks)); when(variantService.getMasks(eq(VARIANT_INDEX[4]), any())).thenReturn(Optional.of(emptyVariantMasks)); - Set patientIdsForIntersectionOfVariantSets = VariableVariantMasks.patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters), List.of(PATIENT_IDS)); + Set patientIdsForIntersectionOfVariantSets = patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters).patientMaskToPatientIdSet(List.of(PATIENT_IDS)); // this should be all patients, as all patients match one of the variants assertEquals(PATIENT_IDS_INTEGERS, patientIdsForIntersectionOfVariantSets); } @@ -76,7 +76,7 @@ public void getPatientIdsForIntersectionOfVariantSets_allPatientsMatchOneVariant when(variantService.getMasks(eq(VARIANT_INDEX[2]), any())).thenReturn(Optional.empty()); when(variantService.getMasks(eq(VARIANT_INDEX[4]), any())).thenReturn(Optional.empty()); - Set patientIdsForIntersectionOfVariantSets = VariableVariantMasks.patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters), List.of(PATIENT_IDS)); + Set patientIdsForIntersectionOfVariantSets = patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters).patientMaskToPatientIdSet(List.of(PATIENT_IDS)); // this should be all patients, as all patients match one of the variants assertEquals(PATIENT_IDS_INTEGERS, patientIdsForIntersectionOfVariantSets); } @@ -94,7 +94,7 @@ public void getPatientIdsForIntersectionOfVariantSets_noPatientsMatchVariants() when(variantService.getMasks(eq(VARIANT_INDEX[2]), any())).thenReturn(Optional.of(emptyVariantMasks)); when(variantService.getMasks(eq(VARIANT_INDEX[4]), any())).thenReturn(Optional.of(emptyVariantMasks)); - Set patientIdsForIntersectionOfVariantSets = VariableVariantMasks.patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(Set.of(), intersectionOfInfoFilters), List.of(PATIENT_IDS)); + Set patientIdsForIntersectionOfVariantSets = patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(Set.of(), intersectionOfInfoFilters).patientMaskToPatientIdSet(List.of(PATIENT_IDS)); // this should be empty because all variants masks have no matching patients assertEquals(Set.of(), patientIdsForIntersectionOfVariantSets); } @@ -114,7 +114,7 @@ public void getPatientIdsForIntersectionOfVariantSets_somePatientsMatchVariants( when(variantService.getMasks(eq(VARIANT_INDEX[0]), any())).thenReturn(Optional.of(variantMasks)); when(variantService.getMasks(eq(VARIANT_INDEX[2]), any())).thenReturn(Optional.of(variantMasks2)); - Set patientIdsForIntersectionOfVariantSets = VariableVariantMasks.patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters), List.of(PATIENT_IDS)); + Set patientIdsForIntersectionOfVariantSets = patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters).patientMaskToPatientIdSet(List.of(PATIENT_IDS)); // this should be all patients who match at least one variant assertEquals(Set.of(101, 103, 105), patientIdsForIntersectionOfVariantSets); } @@ -124,7 +124,7 @@ public void getPatientIdsForIntersectionOfVariantSets_noVariants() { VariantIndex intersectionOfInfoFilters = VariantIndex.empty(); when(variantService.getPatientIds()).thenReturn(PATIENT_IDS); - Set patientIdsForIntersectionOfVariantSets = VariableVariantMasks.patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters), List.of(PATIENT_IDS)); + Set patientIdsForIntersectionOfVariantSets = patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters).patientMaskToPatientIdSet(List.of(PATIENT_IDS)); // this should be empty, as there are no variants assertEquals(Set.of(), patientIdsForIntersectionOfVariantSets); } @@ -144,7 +144,7 @@ public void getPatientIdsForIntersectionOfVariantSets_patientSubsetPassed() { when(variantService.getMasks(eq(VARIANT_INDEX[0]), any())).thenReturn(Optional.of(variantMasks)); when(variantService.getMasks(eq(VARIANT_INDEX[2]), any())).thenReturn(Optional.of(variantMasks2)); - Set patientIdsForIntersectionOfVariantSets = VariableVariantMasks.patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(Set.of(102, 103, 104, 105, 106), intersectionOfInfoFilters), List.of(PATIENT_IDS)); + Set patientIdsForIntersectionOfVariantSets = patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(Set.of(102, 103, 104, 105, 106), intersectionOfInfoFilters).patientMaskToPatientIdSet(List.of(PATIENT_IDS)); // this should be the union of patients matching variants (101, 103, 105, 107), intersected with the patient subset parameter (103, 104, 105) which is (103, 105) assertEquals(Set.of(103, 105), patientIdsForIntersectionOfVariantSets); } From 02a8506c7f5bbc8494f6c8ae52dd83a876368881 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 27 Mar 2024 14:16:35 -0400 Subject: [PATCH 32/35] Fix tests --- .../hpds/data/genotype/VariantMaskBitmaskImpl.java | 2 +- .../hpds/processing/PatientVariantJoinHandler.java | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java index 3df430bc..6195abde 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java @@ -67,7 +67,7 @@ public int bitCount() { public Set patientMaskToPatientIdSet(List patientIds) { Set ids = new HashSet<>(); for(int x = 0;x < bitmask.bitLength()-4;x++) { - if(testBit(x + 2)) { + if(testBit(x)) { String patientId = patientIds.get(x).trim(); ids.add(Integer.parseInt(patientId)); } diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java index 5c504b4d..e5c824fb 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java @@ -112,8 +112,9 @@ public VariantMask getPatientIdsForIntersectionOfVariantSets(Set patien // todo: return VariantMask public BigInteger createMaskForPatientSet(Set patientSubset) { StringBuilder builder = new StringBuilder("11"); //variant bitmasks are bookended with '11' - for(String patientId : variantService.getPatientIds()) { - Integer idInt = Integer.parseInt(patientId); + + for (int i = variantService.getPatientIds().length - 1; i >= 0; i--) { + Integer idInt = Integer.parseInt(variantService.getPatientIds()[i]); if(patientSubset.contains(idInt)){ builder.append("1"); } else { @@ -122,8 +123,6 @@ public BigInteger createMaskForPatientSet(Set patientSubset) { } builder.append("11"); // masks are bookended with '11' set this so we don't count those -// log.debug("PATIENT MASK: " + builder.toString()); - BigInteger patientMasks = new BigInteger(builder.toString(), 2); return patientMasks; } From 5325ac028423b8be2972ad2b2599897f1f0be56c Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 27 Mar 2024 15:01:35 -0400 Subject: [PATCH 33/35] Fix Genomic processor tests --- .../GenomicProcessorPatientMergingParentImplTest.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImplTest.java b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImplTest.java index fdd747cb..193696fe 100644 --- a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImplTest.java +++ b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImplTest.java @@ -40,8 +40,11 @@ public void setup() { public void getPatientMask_validResponses_returnMerged() { DistributableQuery distributableQuery = new DistributableQuery(); when(mockProcessor1.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("11011011", 2)))); + when(mockProcessor1.getPatientIds()).thenReturn(List.of("1", "2", "3", "4")); when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("110001100011", 2)))); + when(mockProcessor2.getPatientIds()).thenReturn(List.of("5", "6", "7", "8", "9", "10", "11", "12")); when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("11000111", 2)))); + when(mockProcessor3.getPatientIds()).thenReturn(List.of("15", "16", "17", "18")); VariantMask patientMask = patientMergingParent.getPatientMask(distributableQuery).block(); VariantMask expectedPatientMask = new VariantMaskBitmaskImpl(new BigInteger("11000100011000011011", 2)); assertEquals(expectedPatientMask, patientMask); @@ -51,8 +54,11 @@ public void getPatientMask_validResponses_returnMerged() { public void getPatientMask_noPatientResponses_returnMerged() { DistributableQuery distributableQuery = new DistributableQuery(); when(mockProcessor1.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("11011011", 2)))); + when(mockProcessor1.getPatientIds()).thenReturn(List.of("1", "2", "3", "4")); when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("110000000011", 2)))); + when(mockProcessor2.getPatientIds()).thenReturn(List.of("5", "6", "7", "8", "9", "10", "11", "12")); when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("11000011", 2)))); + when(mockProcessor3.getPatientIds()).thenReturn(List.of("15", "16", "17", "18")); VariantMask patientMask = patientMergingParent.getPatientMask(distributableQuery).block(); VariantMask expectedPatientMask = new VariantMaskBitmaskImpl(new BigInteger("11000000000000011011", 2)); assertEquals(expectedPatientMask, patientMask); @@ -62,8 +68,11 @@ public void getPatientMask_noPatientResponses_returnMerged() { public void getPatientMask_emptyResponses_returnMerged() { DistributableQuery distributableQuery = new DistributableQuery(); when(mockProcessor1.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("11011011", 2)))); + when(mockProcessor1.getPatientIds()).thenReturn(List.of("1", "2", "3", "4")); when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("1111", 2)))); + when(mockProcessor2.getPatientIds()).thenReturn(List.of()); when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("11000111", 2)))); + when(mockProcessor3.getPatientIds()).thenReturn(List.of("5", "6", "7", "8")); VariantMask patientMask = patientMergingParent.getPatientMask(distributableQuery).block(); VariantMask expectedPatientMask = new VariantMaskBitmaskImpl(new BigInteger("110001011011", 2)); assertEquals(expectedPatientMask, patientMask); From 887dad20f6fb383e7b186473849538362274ee15 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 27 Mar 2024 15:44:22 -0400 Subject: [PATCH 34/35] Fix bucket index by sample --- .../hpds/data/genotype/BucketIndexBySample.java | 11 +++++------ .../hpds/data/genotype/VariableVariantMasks.java | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java index 3b89a329..52b0da04 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java @@ -99,15 +99,14 @@ public BucketIndexBySample(VariantStore variantStore, String storageDir) throws // For each patient set the patientBucketCharMask entry to 0 or 1 if they have a variant in the bucket. // todo: implement for variant explorer - /*int maxIndex = patientMaskForBucket[0].bitLength() - 1; int indexOfBucket = Collections.binarySearch(bucketList, bucketKey) + 2; //patientBucketCharMasks has bookend bits - for(int x = 2;x VariableVariantMasks.SPARSE_VARIANT_THRESHOLD) { variantMask = new VariantMaskBitmaskImpl(bitmask); } else { From 4bd5787abc779dd66a9f5ba93be05aea46f2dd28 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Tue, 2 Apr 2024 08:58:39 -0400 Subject: [PATCH 35/35] Remove extraneous logging, disable variant explorer related unit test --- .../hpds/etl/genotype/GenomicDatasetMerger.java | 14 -------------- .../data/genotype/VariantMetadataIndexTest.java | 3 +++ 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index 760516cb..ae3a06e6 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -246,20 +246,6 @@ public FileBackedJsonIndexStorage patientIds1 = Optional.ofNullable(entry.getValue().heterozygousMask).orElse(VariantMask.emptyInstance()).patientMaskToPatientIdSet(List.of(variantStore1.getPatientIds())); - log.info("Patients 1: " + Joiner.on(",").join(patientIds1)); - - Set patientIds2 = Optional.ofNullable(variantMasks2.heterozygousMask).orElse(VariantMask.emptyInstance()).patientMaskToPatientIdSet(List.of(variantStore2.getPatientIds())); - log.info("Patients 2: " + Joiner.on(",").join(patientIds2)); - - Set mergedPatientIds = Optional.ofNullable(mergeResult.heterozygousMask).orElse(VariantMask.emptyInstance()).patientMaskToPatientIdSet(List.of(mergedVariantStore.getPatientIds())); - log.info("Merged patient ids: " + Joiner.on(",").join(mergedPatientIds)); - } mergedMasks.put(entry.getKey(), mergeResult); } // Any entry in the second set that is not in the merged set can be merged with an empty variant mask, diff --git a/etl/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndexTest.java b/etl/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndexTest.java index df340f84..58e1d48a 100644 --- a/etl/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndexTest.java +++ b/etl/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndexTest.java @@ -9,6 +9,7 @@ import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; import edu.harvard.hms.dbmi.avillach.hpds.etl.genotype.VariantMetadataLoader; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.springframework.test.context.event.annotation.BeforeTestClass; @@ -16,6 +17,8 @@ import static org.springframework.test.util.AssertionErrors.assertEquals; import static org.springframework.test.util.AssertionErrors.assertNotNull; +// todo: enable when variant explorer is implemented +@Disabled public class VariantMetadataIndexTest { //From file 1 14 19038291 rs550062154 C A