diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java index 341c4794..52b0da04 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java @@ -43,7 +43,7 @@ public BucketIndexBySample(VariantStore variantStore, String storageDir) throws //Create a bucketList, containing keys for all buckets in the variantStore for(String contig: contigSet){ - FileBackedByteIndexedStorage> contigStore = variantStore.getVariantMaskStorage().get(contig); + FileBackedByteIndexedStorage> contigStore = variantStore.getVariantMaskStorage().get(contig); if(contigStore != null && contigStore.keys() != null) { bucketList.addAll(contigStore.keys().stream().map( (Integer bucket)->{ @@ -74,7 +74,7 @@ public BucketIndexBySample(VariantStore variantStore, String storageDir) throws patientBucketCharMasks[x] = emptyBucketMaskChar(); } contigSet.parallelStream().forEach((contig)->{ - FileBackedByteIndexedStorage> contigStore = + FileBackedByteIndexedStorage> contigStore = variantStore.getVariantMaskStorage().get(contig); if(contigStore != null && contigStore.keys() != null) { contigStore.keys().stream().forEach( @@ -82,28 +82,29 @@ public BucketIndexBySample(VariantStore variantStore, String storageDir) throws String bucketKey = contig + ":" + bucket; // Create a bitmask with 1 values for each patient who has any variant in this bucket - BigInteger[] patientMaskForBucket = {variantStore.emptyBitmask()}; - contigStore.get(bucket).values().forEach((VariantMasks masks)->{ + VariantMask[] patientMaskForBucket = {new VariantMaskSparseImpl(Set.of())}; + contigStore.get(bucket).values().forEach((VariableVariantMasks masks)->{ if(masks.heterozygousMask!=null) { - patientMaskForBucket[0] = patientMaskForBucket[0].or(masks.heterozygousMask); + patientMaskForBucket[0] = patientMaskForBucket[0].union(masks.heterozygousMask); } //add hetreo no call bits to mask if(masks.heterozygousNoCallMask!=null) { - patientMaskForBucket[0] = patientMaskForBucket[0].or(masks.heterozygousNoCallMask); + patientMaskForBucket[0] = patientMaskForBucket[0].union(masks.heterozygousNoCallMask); } if(masks.homozygousMask!=null) { - patientMaskForBucket[0] = patientMaskForBucket[0].or(masks.homozygousMask); + patientMaskForBucket[0] = patientMaskForBucket[0].union(masks.homozygousMask); } }); // For each patient set the patientBucketCharMask entry to 0 or 1 if they have a variant in the bucket. - int maxIndex = patientMaskForBucket[0].bitLength() - 1; + + // todo: implement for variant explorer int indexOfBucket = Collections.binarySearch(bucketList, bucketKey) + 2; //patientBucketCharMasks has bookend bits - for(int x = 2;x emptyBitmaskMap = new ConcurrentHashMap<>(); + + public static int SPARSE_VARIANT_THRESHOLD = 5; + + /* + * These indices result from the char values of the 3 characters in a VCF sample + * record summed as integers % 7 + * + * This allows us to not actually do string comparisons, instead we add 3 values, + * do one modulus operation, then use the result as the index into the result array. + */ + + // ./0 = (46 + 47 + 48) % 7 = 1 + // 0|. = (48 + 124 + 46) % 7 = 1 + // .|0 = (46 + 124 + 48) % 7 = 1 + public static final int HETERO_NOCALL_REF_CHAR_INDEX = 1; + + // ./1 = (46 + 47 + 49) % 7 = 2 + // 1|. = (49 + 124 + 46) % 7 = 2 + // .|1 = (46 + 124 + 49) % 7 = 2 + // ./1 = (46 + 47 + 49) % 7 = 2 + // 1|. = (49 + 124 + 46) % 7 = 2 + // .|1 = (46 + 124 + 49) % 7 = 2 + public static final int HETERO_NOCALL_VARIANT_CHAR_INDEX = 2; + + // 0/0 = (48 + 47 + 48) % 7 = 3 + // 0|0 = (48 + 124 + 48) % 7 = 3 + public static final int ZERO_ZERO_CHAR_INDEX = 3; + + // 0/1 = (48 + 47 + 49) % 7 = 4 + // 1|0 = (49 + 124 + 48) % 7 = 4 + // 0|1 = (48 + 124 + 49) % 7 = 4 + public static final int ZERO_ONE_CHAR_INDEX = 4; + + // 1/1 = (49 + 47 + 49) % 7 = 5 + // 1|1 = (49 + 124 + 49) % 7 = 5 + public static final int ONE_ONE_CHAR_INDEX = 5; + + // ./. = (46 + 47 + 46) % 7 = 6 + // .|. = (46 + 124 + 46) % 7 = 6 + public static final int HOMO_NOCALL_CHAR_INDEX = 6; + + public VariableVariantMasks(char[][] maskValues) { + String heteroMaskStringRaw = new String(maskValues[ZERO_ONE_CHAR_INDEX]); + String homoMaskStringRaw = new String(maskValues[ONE_ONE_CHAR_INDEX]); + String heteroNoCallMaskStringRaw = new String(maskValues[HETERO_NOCALL_VARIANT_CHAR_INDEX]); + String homoNoCallMaskStringRaw = new String(maskValues[HOMO_NOCALL_CHAR_INDEX]); + + heterozygousMask = variantMaskFromRawString(heteroMaskStringRaw); + homozygousMask = variantMaskFromRawString(homoMaskStringRaw); + heterozygousNoCallMask = variantMaskFromRawString(heteroNoCallMaskStringRaw); + homozygousNoCallMask = variantMaskFromRawString(homoNoCallMaskStringRaw); + } + + private VariantMask variantMaskFromRawString(String maskStringRaw) { + if (!maskStringRaw.contains("1")) { + return null; + } + + VariantMask variantMask; + BigInteger bitmask = new BigInteger(oneone + new StringBuilder(maskStringRaw).reverse() + oneone, 2); + if (bitmask.bitCount() - 4 > VariableVariantMasks.SPARSE_VARIANT_THRESHOLD) { + variantMask = new VariantMaskBitmaskImpl(bitmask); + } else { + Set patientIndexes = new HashSet<>(); + for(int i = 0; i < bitmask.bitLength() - 4; i++) { + // i + 2 because the mask is padded with 2 bits on each end + if (bitmask.testBit(i + 2)) { + patientIndexes.add(i); + } + } + variantMask = new VariantMaskSparseImpl(patientIndexes); + } + return variantMask; + } + + public VariableVariantMasks() { + } + + @JsonProperty("ho") + @JsonInclude(JsonInclude.Include.NON_NULL) + public VariantMask homozygousMask; + + @JsonProperty("he") + @JsonInclude(JsonInclude.Include.NON_NULL) + public VariantMask heterozygousMask; + + @JsonProperty("hon") + @JsonInclude(JsonInclude.Include.NON_NULL) + public VariantMask homozygousNoCallMask; + + @JsonProperty("hen") + @JsonInclude(JsonInclude.Include.NON_NULL) + public VariantMask heterozygousNoCallMask; + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + VariableVariantMasks that = (VariableVariantMasks) o; + return Objects.equals(homozygousMask, that.homozygousMask) && Objects.equals(heterozygousMask, that.heterozygousMask) && Objects.equals(homozygousNoCallMask, that.homozygousNoCallMask) && Objects.equals(heterozygousNoCallMask, that.heterozygousNoCallMask); + } + + @Override + public int hashCode() { + return Objects.hash(homozygousMask, heterozygousMask, homozygousNoCallMask, heterozygousNoCallMask); + } + + public static BigInteger emptyBitmask(int length) { + BigInteger emptyBitmask = emptyBitmaskMap.get(length); + if (emptyBitmask == null) { + String emptyVariantMask = ""; + for (int i = 0; i < length; i++) { + emptyVariantMask = emptyVariantMask + "0"; + } + BigInteger newEmptyBitmask = new BigInteger("11" + emptyVariantMask + "11", 2); + emptyBitmaskMap.put(length, newEmptyBitmask); + return newEmptyBitmask; + } + return emptyBitmask; + } + + public static VariableVariantMasks append(VariableVariantMasks masks1, int length1, VariableVariantMasks masks2, int length2) { + VariableVariantMasks appendedMasks = new VariableVariantMasks(); + appendedMasks.homozygousMask = appendMask(masks1.homozygousMask, masks2.homozygousMask, length1, length2); + appendedMasks.heterozygousMask = appendMask(masks1.heterozygousMask, masks2.heterozygousMask, length1, length2); + appendedMasks.homozygousNoCallMask = appendMask(masks1.homozygousNoCallMask, masks2.homozygousNoCallMask, length1, length2); + appendedMasks.heterozygousNoCallMask = appendMask(masks1.heterozygousNoCallMask, masks2.heterozygousNoCallMask, length1, length2); + return appendedMasks; + } + + public static VariantMask appendMask(VariantMask variantMask1, VariantMask variantMask2, int length1, int length2) { + if (variantMask1 == null) { + variantMask1 = VariantMask.emptyInstance(); + } + if (variantMask2 == null) { + variantMask2 = VariantMask.emptyInstance(); + } + if (variantMask1.equals(VariantMask.emptyInstance()) && variantMask2.equals(VariantMask.emptyInstance())) { + return null; + } + + if (variantMask1 instanceof VariantMaskSparseImpl) { + if (variantMask2 instanceof VariantMaskSparseImpl) { + return append((VariantMaskSparseImpl) variantMask1, (VariantMaskSparseImpl) variantMask2, length1, length2); + } else if (variantMask2 instanceof VariantMaskBitmaskImpl) { + return append((VariantMaskSparseImpl) variantMask1, (VariantMaskBitmaskImpl) variantMask2, length1, length2); + } else { + throw new RuntimeException("Unknown VariantMask implementation"); + } + } + else if (variantMask1 instanceof VariantMaskBitmaskImpl) { + if (variantMask2 instanceof VariantMaskSparseImpl) { + return append((VariantMaskBitmaskImpl) variantMask1, (VariantMaskSparseImpl) variantMask2, length1, length2); + } else if (variantMask2 instanceof VariantMaskBitmaskImpl) { + return append((VariantMaskBitmaskImpl) variantMask1, (VariantMaskBitmaskImpl) variantMask2, length1, length2); + } else { + throw new RuntimeException("Unknown VariantMask implementation"); + } + } + else { + throw new RuntimeException("Unknown VariantMask implementation"); + } + } + + private static VariantMask append(VariantMaskSparseImpl variantMask1, VariantMaskBitmaskImpl variantMask2, int length1, int length2) { + BigInteger mask1 = emptyBitmask(length1); + for (Integer patientIndex : variantMask1.patientIndexes) { + mask1 = mask1.setBit(patientIndex + 2); + } + String binaryMask1 = mask1.toString(2); + + String binaryMask2 = variantMask2.bitmask.toString(2); + if (binaryMask2.length() - 4 != length2) { + throw new IllegalArgumentException("Bitmask does not match length (" + length2 + "): " + variantMask2.bitmask); + } + + String appendedString = binaryMask2.substring(0, binaryMask2.length() - 2) + + binaryMask1.substring(2); + return new VariantMaskBitmaskImpl(new BigInteger(appendedString, 2)); + } + + private static VariantMask append(VariantMaskBitmaskImpl variantMask1, VariantMaskSparseImpl variantMask2, int length1, int length2) { + String binaryMask1 = variantMask1.bitmask.toString(2); + if (binaryMask1.length() - 4 != length1) { + throw new IllegalArgumentException("Bitmask does not match length (" + length1 + "): " + variantMask1.bitmask); + } + + BigInteger mask2 = emptyBitmask(length2); + for (Integer patientId : variantMask2.patientIndexes) { + mask2 = mask2.setBit(patientId + 2); + } + String binaryMask2 = mask2.toString(2); + + String appendedString = binaryMask2.substring(0, binaryMask2.length() - 2) + + binaryMask1.substring(2); + return new VariantMaskBitmaskImpl(new BigInteger(appendedString, 2)); + } + + private static VariantMask append(VariantMaskBitmaskImpl variantMask1, VariantMaskBitmaskImpl variantMask2, int length1, int length2) { + String binaryMask1 = variantMask1.bitmask.toString(2); + String binaryMask2 = variantMask2.bitmask.toString(2); + + if (binaryMask1.length() - 4 != length1) { + throw new IllegalArgumentException("Bitmask does not match length (" + length1 + "): " + variantMask1.bitmask); + } + if (binaryMask2.length() - 4 != length2) { + throw new IllegalArgumentException("Bitmask does not match length (" + length2 + "): " + variantMask2.bitmask); + } + + String appendedString = binaryMask2.substring(0, binaryMask2.length() - 2) + + binaryMask1.substring(2); + BigInteger bitmask = new BigInteger(appendedString, 2); + return new VariantMaskBitmaskImpl(bitmask); + } + + private static VariantMask append(VariantMaskSparseImpl variantMask1, VariantMaskSparseImpl variantMask2, int length1, int length2) { + if (variantMask1.patientIndexes.size() + variantMask2.patientIndexes.size() > SPARSE_VARIANT_THRESHOLD) { + BigInteger mask = emptyBitmask(length1 + length2); + for (Integer patientId : variantMask1.patientIndexes) { + mask = mask.setBit(patientId + 2); + } + // We start writing mask 2 where mask 1 ends. So the 0th index of mask 2 is now following the last bit of mask 1 + for (Integer patientId : variantMask2.patientIndexes) { + mask = mask.setBit(patientId + length1 + 2); + } + return new VariantMaskBitmaskImpl(mask); + } + else { + Set patientIndexSet = new HashSet<>(); + patientIndexSet.addAll(variantMask1.patientIndexes); + // The indexes for mask 2 are shifted by the length of mask 1, corresponding to the corresponding patient id array + // for mask 2 being appended to those of mask 1 + patientIndexSet.addAll(variantMask2.patientIndexes.stream().map(i -> i + length1).collect(Collectors.toSet())); + return new VariantMaskSparseImpl(patientIndexSet); + } + } + +} diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java new file mode 100644 index 00000000..86d06472 --- /dev/null +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java @@ -0,0 +1,36 @@ +package edu.harvard.hms.dbmi.avillach.hpds.data.genotype; + +import com.fasterxml.jackson.annotation.JsonSubTypes; +import com.fasterxml.jackson.annotation.JsonTypeInfo; + +import java.math.BigInteger; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +@JsonTypeInfo( + use = JsonTypeInfo.Id.NAME, + include = JsonTypeInfo.As.PROPERTY, + property = "type", + defaultImpl = VariantMaskBitmaskImpl.class +) +@JsonSubTypes({ + @JsonSubTypes.Type(value = VariantMaskBitmaskImpl.class, name = "b"), + @JsonSubTypes.Type(value = VariantMaskSparseImpl.class, name = "s") +}) +public interface VariantMask { + + VariantMask intersection(VariantMask variantMask); + + VariantMask union(VariantMask variantMask); + + boolean testBit(int bit); + + int bitCount(); + + static VariantMask emptyInstance() { + return new VariantMaskSparseImpl(Set.of()); + } + + Set patientMaskToPatientIdSet(List patientIds); +} diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java new file mode 100644 index 00000000..6195abde --- /dev/null +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java @@ -0,0 +1,106 @@ +package edu.harvard.hms.dbmi.avillach.hpds.data.genotype; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.databind.annotation.JsonSerialize; +import com.fasterxml.jackson.databind.ser.std.ToStringSerializer; + +import java.math.BigInteger; +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.Set; + +public class VariantMaskBitmaskImpl implements VariantMask { + + + @JsonProperty("mask") + @JsonSerialize(using = ToStringSerializer.class) + protected final BigInteger bitmask; + + public BigInteger getBitmask() { + return bitmask; + } + + @JsonCreator + public VariantMaskBitmaskImpl(@JsonProperty("mask") String stringMask) { + this.bitmask = new BigInteger(stringMask); + } + + public VariantMaskBitmaskImpl(BigInteger bitmask) { + this.bitmask = bitmask; + } + + @Override + public VariantMask intersection(VariantMask variantMask) { + if (variantMask instanceof VariantMaskBitmaskImpl) { + return intersection((VariantMaskBitmaskImpl) variantMask); + } else if (variantMask instanceof VariantMaskSparseImpl) { + return variantMask.intersection(this); + } else { + throw new RuntimeException("Unknown VariantMask implementation"); + } + } + + @Override + public VariantMask union(VariantMask variantMask) { + if (variantMask instanceof VariantMaskBitmaskImpl) { + return union((VariantMaskBitmaskImpl) variantMask); + } else if (variantMask instanceof VariantMaskSparseImpl) { + return variantMask.union(this); + } else { + throw new RuntimeException("Unknown VariantMask implementation"); + } + } + + @Override + public boolean testBit(int bit) { + return bitmask.testBit(bit + 2); + } + + @Override + public int bitCount() { + return bitmask.bitCount(); + } + + @Override + public Set patientMaskToPatientIdSet(List patientIds) { + Set ids = new HashSet<>(); + for(int x = 0;x < bitmask.bitLength()-4;x++) { + if(testBit(x)) { + String patientId = patientIds.get(x).trim(); + ids.add(Integer.parseInt(patientId)); + } + } + return ids; + } + + private VariantMask union(VariantMaskBitmaskImpl variantMaskBitmask) { + return new VariantMaskBitmaskImpl(variantMaskBitmask.bitmask.or(this.bitmask)); + } + private VariantMask intersection(VariantMaskBitmaskImpl variantMaskBitmask) { + // we could consider using a sparse variant index here if we are ever going to be storing the + // result of this anywhere + return new VariantMaskBitmaskImpl(variantMaskBitmask.bitmask.and(this.bitmask)); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + VariantMaskBitmaskImpl that = (VariantMaskBitmaskImpl) o; + return Objects.equals(bitmask, that.bitmask); + } + + @Override + public int hashCode() { + return Objects.hash(bitmask); + } + + @Override + public String toString() { + return "VariantMaskBitmaskImpl{" + + "bitmask=" + bitmask.toString() + + '}'; + } +} diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java new file mode 100644 index 00000000..fb5e2a6a --- /dev/null +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java @@ -0,0 +1,88 @@ +package edu.harvard.hms.dbmi.avillach.hpds.data.genotype; + +import com.fasterxml.jackson.annotation.JsonProperty; + +import java.math.BigInteger; +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; + +public class VariantMaskSparseImpl implements VariantMask { + + @JsonProperty("p") + protected Set patientIndexes; + + public VariantMaskSparseImpl(@JsonProperty("p") Set patientIndexes) { + this.patientIndexes = patientIndexes; + } + + public Set getPatientIndexes() { + return patientIndexes; + } + + @Override + public VariantMask intersection(VariantMask variantMask) { + return new VariantMaskSparseImpl(this.patientIndexes.stream() + .filter(variantMask::testBit) + .collect(Collectors.toSet())); + } + + @Override + public VariantMask union(VariantMask variantMask) { + if (variantMask instanceof VariantMaskBitmaskImpl) { + return union((VariantMaskBitmaskImpl) variantMask); + } else if (variantMask instanceof VariantMaskSparseImpl) { + return union((VariantMaskSparseImpl) variantMask); + } else { + throw new RuntimeException("Unknown VariantMask implementation"); + } + } + + @Override + public boolean testBit(int bit) { + return patientIndexes.contains(bit); + } + + @Override + public int bitCount() { + return patientIndexes.size(); + } + + @Override + public Set patientMaskToPatientIdSet(List patientIds) { + return patientIndexes.stream() + .map(patientIds::get) + .map(String::trim) + .map(Integer::parseInt) + .collect(Collectors.toSet()); + } + + private VariantMask union(VariantMaskSparseImpl variantMask) { + HashSet union = new HashSet<>(variantMask.patientIndexes); + union.addAll(this.patientIndexes); + return new VariantMaskSparseImpl(union); + } + + private VariantMask union(VariantMaskBitmaskImpl variantMaskBitmask) { + BigInteger union = variantMaskBitmask.bitmask; + for (Integer patientId : this.patientIndexes) { + union = union.setBit(patientId + 2); + } + return new VariantMaskBitmaskImpl(union); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + VariantMaskSparseImpl that = (VariantMaskSparseImpl) o; + return Objects.equals(patientIndexes, that.patientIndexes); + } + + @Override + public int hashCode() { + return Objects.hash(patientIndexes); + } +} diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantSpec.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantSpec.java index b244e407..83359789 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantSpec.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantSpec.java @@ -1,6 +1,7 @@ package edu.harvard.hms.dbmi.avillach.hpds.data.genotype; import java.io.Serializable; +import java.util.Objects; import org.apache.commons.csv.CSVRecord; @@ -13,7 +14,22 @@ public class VariantCoords implements Serializable { public String ref; public String alt; public int qual; - public String format; + public String format; + public String gene; + public String consequence; + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + VariantCoords that = (VariantCoords) o; + return qual == that.qual && Objects.equals(chromosome, that.chromosome) && Objects.equals(offset, that.offset) && Objects.equals(name, that.name) && Objects.equals(ref, that.ref) && Objects.equals(alt, that.alt) && Objects.equals(format, that.format) && Objects.equals(gene, that.gene) && Objects.equals(consequence, that.consequence); + } + + @Override + public int hashCode() { + return Objects.hash(chromosome, offset, name, ref, alt, qual, format, gene, consequence); + } } public static int CHR = 0, OFF = 1, NAME = 2, REF = 3, ALT = 4, QUAL = 5, FILTER = 6, INFO = 7, FORMAT = 8, DATA = 9; public long heteroOffset; @@ -32,6 +48,20 @@ public VariantSpec(CSVRecord r) { }catch(NumberFormatException e) { this.metadata.qual = -1; } + + String[] variantInfo = r.get(INFO).split("[=;]"); + String gene = "NULL"; + String consequence = "NULL"; + for (int i = 0; i < variantInfo.length; i = i + 2) { + if ("Gene_with_variant".equals(variantInfo[i])) { + gene = variantInfo[i + 1]; + } + if ("Variant_consequence_calculated".equals(variantInfo[i])) { + consequence = variantInfo[i + 1]; + } + } + this.metadata.gene = gene; + this.metadata.consequence = consequence; } public VariantSpec(String variant) { @@ -43,28 +73,48 @@ public VariantSpec(String variant) { this.metadata.ref = segments[2]; this.metadata.alt = segments[3]; this.metadata.qual = -1; + this.metadata.gene = segments[4]; + this.metadata.consequence = segments[5]; } public String specNotation() { return this.metadata.chromosome + "," + this.metadata.offset + "," + - this.metadata.ref + "," + this.metadata.alt; + this.metadata.ref + "," + this.metadata.alt + "," + this.metadata.gene + "," + this.metadata.consequence; } - @Override public int compareTo(VariantSpec o) { int ret = 0; ret = this.metadata.chromosome.compareTo(o.metadata.chromosome); - if(ret == 0) { + if (ret == 0) { ret = this.metadata.offset.compareTo(o.metadata.offset); } - if(ret == 0) { + if (ret == 0) { ret = this.metadata.ref.compareTo(o.metadata.ref); } - if(ret == 0) { + if (ret == 0) { ret = this.metadata.alt.compareTo(o.metadata.alt); } + if (ret == 0) { + ret = this.metadata.gene.compareTo(o.metadata.gene); + } + if (ret == 0) { + ret = this.metadata.consequence.compareTo(o.metadata.consequence); + } + return ret; } + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + VariantSpec that = (VariantSpec) o; + return heteroOffset == that.heteroOffset && homoOffset == that.homoOffset && Objects.equals(metadata, that.metadata); + } + + @Override + public int hashCode() { + return Objects.hash(heteroOffset, homoOffset, metadata); + } } \ No newline at end of file diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java index 3efdc3df..c91ebf4d 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java @@ -30,13 +30,13 @@ public class VariantStore implements Serializable { private String[] vcfHeaders = new String[24]; - private Map>> variantMaskStorage = new TreeMap<>(); + private Map>> variantMaskStorage = new TreeMap<>(); - public Map>> getVariantMaskStorage() { + public Map>> getVariantMaskStorage() { return variantMaskStorage; } - public void setVariantMaskStorage(Map>> variantMaskStorage) { + public void setVariantMaskStorage(Map>> variantMaskStorage) { this.variantMaskStorage = variantMaskStorage; } @@ -81,24 +81,24 @@ public Map countVariants() { TreeMap counts = new TreeMap<>(); for (String contig : variantMaskStorage.keySet()) { counts.put(contig, new int[5]); - FileBackedJsonIndexStorage> storage = variantMaskStorage + FileBackedJsonIndexStorage> storage = variantMaskStorage .get(contig); storage.keys().stream().forEach((Integer key) -> { int[] contigCounts = counts.get(contig); - Collection values = storage.get(key).values(); - contigCounts[0] += values.stream().collect(Collectors.summingInt((VariantMasks masks) -> { + Collection values = storage.get(key).values(); + contigCounts[0] += values.stream().collect(Collectors.summingInt((VariableVariantMasks masks) -> { return masks.heterozygousMask != null ? 1 : 0; })); - contigCounts[1] += values.stream().collect(Collectors.summingInt((VariantMasks masks) -> { + contigCounts[1] += values.stream().collect(Collectors.summingInt((VariableVariantMasks masks) -> { return masks.homozygousMask != null ? 1 : 0; })); - contigCounts[2] += values.stream().collect(Collectors.summingInt((VariantMasks masks) -> { + contigCounts[2] += values.stream().collect(Collectors.summingInt((VariableVariantMasks masks) -> { return masks.heterozygousNoCallMask != null ? 1 : 0; })); - contigCounts[3] += values.stream().collect(Collectors.summingInt((VariantMasks masks) -> { + contigCounts[3] += values.stream().collect(Collectors.summingInt((VariableVariantMasks masks) -> { return masks.homozygousNoCallMask != null ? 1 : 0; })); - contigCounts[4] += values.stream().collect(Collectors.summingInt((VariantMasks masks) -> { + contigCounts[4] += values.stream().collect(Collectors.summingInt((VariableVariantMasks masks) -> { return masks.heterozygousMask != null || masks.homozygousMask != null || masks.heterozygousNoCallMask != null || masks.homozygousNoCallMask != null ? 1 : 0; })); @@ -115,7 +115,7 @@ public String[] getPatientIds() { return patientIds; } - public Optional getMasks(String variant, VariantBucketHolder bucketCache) throws IOException { + public Optional getMasks(String variant, VariantBucketHolder bucketCache) throws IOException { String[] segments = variant.split(","); if (segments.length < 2) { log.error("Less than 2 segments found in this variant : " + variant); @@ -134,7 +134,7 @@ public Optional getMasks(String variant, VariantBucketHolder> indexedStorage = variantMaskStorage.get(contig); + FileBackedJsonIndexStorage> indexedStorage = variantMaskStorage.get(contig); if (indexedStorage == null) { return Optional.empty(); } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/storage/FileBackedStorageVariantMasksImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/storage/FileBackedStorageVariantMasksImpl.java index 9d326a8c..bdd94a51 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/storage/FileBackedStorageVariantMasksImpl.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/storage/FileBackedStorageVariantMasksImpl.java @@ -1,6 +1,7 @@ package edu.harvard.hms.dbmi.avillach.hpds.data.storage; import com.fasterxml.jackson.core.type.TypeReference; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariableVariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJsonIndexStorage; @@ -9,17 +10,17 @@ import java.io.Serializable; import java.util.concurrent.ConcurrentHashMap; -public class FileBackedStorageVariantMasksImpl extends FileBackedJsonIndexStorage> implements Serializable { +public class FileBackedStorageVariantMasksImpl extends FileBackedJsonIndexStorage> implements Serializable { private static final long serialVersionUID = -1086729119489479152L; public FileBackedStorageVariantMasksImpl(File storageFile) throws FileNotFoundException { super(storageFile); } - private static final TypeReference> typeRef - = new TypeReference>() {}; + private static final TypeReference> typeRef + = new TypeReference>() {}; @Override - public TypeReference> getTypeReference() { + public TypeReference> getTypeReference() { return typeRef; } } diff --git a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasksTest.java b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasksTest.java new file mode 100644 index 00000000..6d00f538 --- /dev/null +++ b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasksTest.java @@ -0,0 +1,95 @@ +package edu.harvard.hms.dbmi.avillach.hpds.data.genotype; + +import org.junit.jupiter.api.Test; + +import java.math.BigInteger; +import java.util.Set; + +import static org.junit.jupiter.api.Assertions.*; + +class VariableVariantMasksTest { + + @Test + void appendMask_bitmaskImpl() { + BigInteger bitmask1 = new BigInteger("141128266537977664750676276621047930360647583370951385898970922432171641921591683601425661017874251620531446414001258481539117879583763813143907296662202568713257464719895142475222938754588184582833582945368733841208297077207159012919514593576142503835920156756519924916168242089239884330492690734761017788145472426679068150988914755013607752960816306592042734726724967292489382922774340753414934719473579642086577795740766564256929075326123036793589965882761953873677505969126336030099232419417531157749845223962002657158936982730111297974575206081858525359109608528262092083167977823315667519770531233235502931655993522963196732342798609807510543882273123542784003837220002062547832711681922161218010724226003150184777238854967623299970205873235909960398972219305810619458430315886318451687126866584390511424196256376003247717440484524668930504599413091194844245946142655802347109092922312106655798606759916931458730259588797621736772069847398957694392864416289605234581063232645315000563794292589544477569850783743632035729362609737922579552116678396740867191143283379987783383546964540333832022555110531730782339840166704631278663393147496643156685747417468473443887224102514355150110201463928311558935840684668441340082266117306282514075566574248767903320202249460451453420775463375086967869642395460246215345716051454824012618762098616749770441010729935100690497726813080659572063353154599964806728241328892738034442472863741591198849392657539785115946383059858121434147389809099727185407701653069953116828987459279642700621389676861517273949473071786975738343769782464350475716221573322857987917669867491560145456280976454077958400126376423873081869441464609193216436634914151839730185428390533788824490041131307997850622739795832813220163889874229711053167483400593799762457890098932032534121358716659875839129363445294836150352418351095797845291006220927734160887236196809213101473795"); + VariantMask mask1 = new VariantMaskBitmaskImpl(bitmask1); + BigInteger bitmask2 = new BigInteger("30986998537042903080350519548408047200882037374471443267447678987916097706689940814664693588471382927433911322922595655683"); + VariantMask mask2 = new VariantMaskBitmaskImpl(bitmask2); + BigInteger expected = new BigInteger("364428449062309565827240887371711117987574656019257508470944944605332301145049360947562613021825062352285550919581217473685332023988542714833984833584283993391279871691386067405572061472644828810015780133411925154335939452908415258964249206427971383684062861395477403040282307619178797392994855011010859741065713642937722555386553472167057412349499978682577859360114784650082282703011601887907648540462571623667509259030122346885116531561265495702316349933315144109402391832141090339015610768872781857606433548246400953322494469603219947770538558126916689855755477351331412849701993441724311536424509298383050084506206211767917732037267909332303742403672014600658318302593423194411973298402659688783532220840192809967726287753517722498547930875524876017589557370530361981799944335286739781131150443454009586308444310074980028948209472950776329093810066130759876455221806685349198404676070598652587630369814761497643023695920944554335463560556407863991874650393356359742532476840044738380631950001000889884841365536456755318891308758448828432322546827241786950989285280451703830359048521243704376576346418597615317052525572950500220422542100551941365526524572447973251506942320379483402120949518538372004537133062107782710822000749441206410146350956094013529342546401955361956752745578930129151472060056501664176391306323733630924234020572479814310832686356211870629168524382442619274880885230138170761194502147197237097179031772802871639774718532255582374871483153552524216278948609829889298505566944209429410943844015411379772232511986448333683276938151761604643176820833900087800015896793760190738573447504231241253241247411492029282990960833205840126804844971397564707229362865231720213379844987578324282466174232636950775929742330163059024377711272017786328444659381735736216404131832568759328992585474109193095809959345744181054394252160618715444166335074662636279289524790651946664188322635565248044921668500523867959008028963420119948417112017996816651287475428019005246560397815063629879127215220996440067"); + // length should equal the length of the binary representation - 4. Bitmasks are padded with 11 on each end + VariantMask merged = VariableVariantMasks.appendMask(mask1, mask2, 6282, 400); + + assertEquals(expected, ((VariantMaskBitmaskImpl)merged).getBitmask()); + } + @Test + void appendMask_bitmaskImpl_invalidLength() { + BigInteger bitmask1 = new BigInteger("141128266537977664750676276621047930360647583370951385898970922432171641921591683601425661017874251620531446414001258481539117879583763813143907296662202568713257464719895142475222938754588184582833582945368733841208297077207159012919514593576142503835920156756519924916168242089239884330492690734761017788145472426679068150988914755013607752960816306592042734726724967292489382922774340753414934719473579642086577795740766564256929075326123036793589965882761953873677505969126336030099232419417531157749845223962002657158936982730111297974575206081858525359109608528262092083167977823315667519770531233235502931655993522963196732342798609807510543882273123542784003837220002062547832711681922161218010724226003150184777238854967623299970205873235909960398972219305810619458430315886318451687126866584390511424196256376003247717440484524668930504599413091194844245946142655802347109092922312106655798606759916931458730259588797621736772069847398957694392864416289605234581063232645315000563794292589544477569850783743632035729362609737922579552116678396740867191143283379987783383546964540333832022555110531730782339840166704631278663393147496643156685747417468473443887224102514355150110201463928311558935840684668441340082266117306282514075566574248767903320202249460451453420775463375086967869642395460246215345716051454824012618762098616749770441010729935100690497726813080659572063353154599964806728241328892738034442472863741591198849392657539785115946383059858121434147389809099727185407701653069953116828987459279642700621389676861517273949473071786975738343769782464350475716221573322857987917669867491560145456280976454077958400126376423873081869441464609193216436634914151839730185428390533788824490041131307997850622739795832813220163889874229711053167483400593799762457890098932032534121358716659875839129363445294836150352418351095797845291006220927734160887236196809213101473795"); + VariantMask mask1 = new VariantMaskBitmaskImpl(bitmask1); + BigInteger bitmask2 = new BigInteger("30986998537042903080350519548408047200882037374471443267447678987916097706689940814664693588471382927433911322922595655683"); + VariantMask mask2 = new VariantMaskBitmaskImpl(bitmask2); + BigInteger expected = new BigInteger("364428449062309565827240887371711117987574656019257508470944944605332301145049360947562613021825062352285550919581217473685332023988542714833984833584283993391279871691386067405572061472644828810015780133411925154335939452908415258964249206427971383684062861395477403040282307619178797392994855011010859741065713642937722555386553472167057412349499978682577859360114784650082282703011601887907648540462571623667509259030122346885116531561265495702316349933315144109402391832141090339015610768872781857606433548246400953322494469603219947770538558126916689855755477351331412849701993441724311536424509298383050084506206211767917732037267909332303742403672014600658318302593423194411973298402659688783532220840192809967726287753517722498547930875524876017589557370530361981799944335286739781131150443454009586308444310074980028948209472950776329093810066130759876455221806685349198404676070598652587630369814761497643023695920944554335463560556407863991874650393356359742532476840044738380631950001000889884841365536456755318891308758448828432322546827241786950989285280451703830359048521243704376576346418597615317052525572950500220422542100551941365526524572447973251506942320379483402120949518538372004537133062107782710822000749441206410146350956094013529342546401955361956752745578930129151472060056501664176391306323733630924234020572479814310832686356211870629168524382442619274880885230138170761194502147197237097179031772802871639774718532255582374871483153552524216278948609829889298505566944209429410943844015411379772232511986448333683276938151761604643176820833900087800015896793760190738573447504231241253241247411492029282990960833205840126804844971397564707229362865231720213379844987578324282466174232636950775929742330163059024377711272017786328444659381735736216404131832568759328992585474109193095809959345744181054394252160618715444166335074662636279289524790651946664188322635565248044921668500523867959008028963420119948417112017996816651287475428019005246560397815063629879127215220996440067"); + // length should equal the length of the binary representation - 4. Bitmasks are padded with 11 on each end + VariantMask merged = VariableVariantMasks.appendMask(mask1, mask2, 6282, 400); + + assertThrows( + IllegalArgumentException.class, + () -> VariableVariantMasks.appendMask(mask1, mask2, 6283, 400) + ); + assertThrows( + IllegalArgumentException.class, + () -> VariableVariantMasks.appendMask(mask1, mask2, 6282, 399) + ); + } + + @Test + void appendMask_sparseImpl() { + VariantMask mask1 = new VariantMaskSparseImpl(Set.of(1, 10, 99)); + VariantMask mask2 = new VariantMaskSparseImpl(Set.of(1, 99)); + VariantMask merged = VariableVariantMasks.appendMask(mask1, mask2, 100, 100); + + VariantMask expected = new VariantMaskSparseImpl(Set.of(1, 10, 99, 101, 199)); + assertEquals(expected, merged); + } + @Test + void appendMask_sparseImplOneEmpty() { + VariantMask mask1 = new VariantMaskSparseImpl(Set.of()); + VariantMask mask2 = new VariantMaskSparseImpl(Set.of(1, 99)); + VariantMask merged = VariableVariantMasks.appendMask(mask1, mask2, 100, 100); + + VariantMask expected = new VariantMaskSparseImpl(Set.of(101, 199)); + assertEquals(expected, merged); + + mask1 = new VariantMaskSparseImpl(Set.of(1, 10, 99)); + mask2 = new VariantMaskSparseImpl(Set.of()); + merged = VariableVariantMasks.appendMask(mask1, mask2, 100, 100); + + expected = new VariantMaskSparseImpl(Set.of(1, 10, 99)); + assertEquals(expected, merged); + } + + @Test + void appendMask_sparseBitmask() { + VariantMask mask1 = new VariantMaskSparseImpl(Set.of(1, 9)); + BigInteger bitmask1 = new BigInteger("11100000010011", 2); + VariantMask mask2 = new VariantMaskBitmaskImpl(bitmask1); + VariantMask merged = VariableVariantMasks.appendMask(mask1, mask2, 10, 10); + + BigInteger bitmask = new BigInteger("111000000100" + "100000001011", 2); + VariantMask expected = new VariantMaskBitmaskImpl(bitmask); + System.out.println(((VariantMaskBitmaskImpl) merged).getBitmask().toString(2)); + assertEquals(expected, merged); + } + + @Test + void appendMask_bitmaskSparse() { + BigInteger bitmask1 = new BigInteger("11100000010011", 2); + VariantMask mask1 = new VariantMaskBitmaskImpl(bitmask1); + VariantMask mask2 = new VariantMaskSparseImpl(Set.of(1, 9)); + VariantMask merged = VariableVariantMasks.appendMask(mask1, mask2, 10, 10); + + BigInteger bitmask = new BigInteger("111000000010" + "100000010011", 2); + VariantMask expected = new VariantMaskBitmaskImpl(bitmask); + System.out.println(((VariantMaskBitmaskImpl) merged).getBitmask().toString(2)); + assertEquals(expected, merged); + } +} \ No newline at end of file diff --git a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskPerformanceTest.java b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskPerformanceTest.java new file mode 100644 index 00000000..aa470219 --- /dev/null +++ b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskPerformanceTest.java @@ -0,0 +1,65 @@ +package edu.harvard.hms.dbmi.avillach.hpds.data.genotype; + +import java.math.BigInteger; +import java.util.Random; +import java.util.Set; + +public class VariantMaskPerformanceTest { + + /** + * This test shows the ideal maximum size for a sparse variant is between 5-10 items. The slight decrease in performance + * is generally worth the drastic reduction in disks space + */ + //@Test + public void test() { + VariantMaskBitmaskImpl mask100k = new VariantMaskBitmaskImpl(generateRandomBitmask(100_000)); + VariantMaskBitmaskImpl mask100k2 = new VariantMaskBitmaskImpl(generateRandomBitmask(100_000)); + VariantMaskBitmaskImpl mask1m = new VariantMaskBitmaskImpl(generateRandomBitmask(1_000_000)); + VariantMaskBitmaskImpl mask1m2 = new VariantMaskBitmaskImpl(generateRandomBitmask(1_000_000)); + VariantMaskSparseImpl sparseMask100k = new VariantMaskSparseImpl(Set.of(100, 200, 400, 50_000, 90_000)); + VariantMaskSparseImpl sparseMask100k2 = new VariantMaskSparseImpl(Set.of(100, 101, 200, 300, 400, 1000, 20_000, 30_000, 50_000, 90_000)); + VariantMaskSparseImpl sparseMask1m = new VariantMaskSparseImpl(Set.of(100, 200, 400, 50_000, 90_000, 300_000, 420_000, 555_555, 867_530, 999_999)); + + long time = System.currentTimeMillis(); + for (int k = 0; k < 1000; k++) { + VariantMask and = mask100k.union(mask100k2); + } + System.out.println(mask100k.getBitmask().bitLength() + " bitmask union completed in " + (System.currentTimeMillis() - time) + " ms"); + + time = System.currentTimeMillis(); + for (int k = 0; k < 1000; k++) { + VariantMask and = sparseMask100k.union(mask100k); + } + System.out.println(mask100k.getBitmask().bitLength() + " bitmask and " + sparseMask100k.patientIndexes.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); + + time = System.currentTimeMillis(); + for (int k = 0; k < 1000; k++) { + VariantMask and = sparseMask100k2.union(mask100k); + } + System.out.println(mask100k.getBitmask().bitLength() + " bitmask and " + sparseMask100k2.patientIndexes.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); + + time = System.currentTimeMillis(); + for (int k = 0; k < 1000; k++) { + VariantMask and = mask1m.union(mask1m2); + } + System.out.println(mask1m.getBitmask().bitLength() + " bitmask union completed in " + (System.currentTimeMillis() - time) + " ms"); + + time = System.currentTimeMillis(); + for (int k = 0; k < 1000; k++) { + VariantMask and = sparseMask100k.union(mask1m); + } + System.out.println(mask1m.getBitmask().bitLength() + " bitmask and " + sparseMask100k.patientIndexes.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); + + time = System.currentTimeMillis(); + for (int k = 0; k < 1000; k++) { + VariantMask and = sparseMask1m.union(mask1m); + } + System.out.println(mask1m.getBitmask().bitLength() + " bitmask and " + sparseMask1m.patientIndexes.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); + + } + + private BigInteger generateRandomBitmask(int patients) { + return new BigInteger(patients + 4, new Random()); + } + +} diff --git a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskTest.java b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskTest.java new file mode 100644 index 00000000..d6ad8b4a --- /dev/null +++ b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskTest.java @@ -0,0 +1,83 @@ +package edu.harvard.hms.dbmi.avillach.hpds.data.genotype; + +import org.junit.jupiter.api.Test; + +import java.math.BigInteger; +import java.util.Set; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class VariantMaskTest { + + @Test + public void intersection_bitmaskVsBitmask() { + VariantMask mask1 = new VariantMaskBitmaskImpl(new BigInteger("111001100011", 2)); + VariantMask mask2 = new VariantMaskBitmaskImpl(new BigInteger("111010010011", 2)); + VariantMask expected = new VariantMaskBitmaskImpl(new BigInteger("111000000011", 2)); + + assertEquals(expected, mask1.intersection(mask2)); + } + @Test + public void intersection_bitmaskVsSparse() { + // this is essentially a mask for patients 0, 2, 3, 7 (there is 11 padding on both ends) + VariantMask mask1 = new VariantMaskBitmaskImpl(new BigInteger("111000110111", 2)); + VariantMask mask2 = new VariantMaskSparseImpl(Set.of(0, 3, 6)); + VariantMask expected = new VariantMaskSparseImpl(Set.of(0, 3)); + + assertEquals(expected, mask1.intersection(mask2)); + } + @Test + public void intersection_sparseVsBitmask() { + VariantMask mask1 = new VariantMaskSparseImpl(Set.of(4, 7)); + VariantMask mask2 = new VariantMaskBitmaskImpl(new BigInteger("110111110111", 2)); + VariantMask expected = new VariantMaskSparseImpl(Set.of(4)); + + assertEquals(expected, mask1.intersection(mask2)); + } + + @Test + public void intersection_sparseVsSparse() { + VariantMask mask1 = new VariantMaskSparseImpl(Set.of(0, 2, 4, 6)); + VariantMask mask2 = new VariantMaskSparseImpl(Set.of(0, 1, 3, 5, 7)); + VariantMask expected = new VariantMaskSparseImpl(Set.of(0)); + + assertEquals(expected, mask1.intersection(mask2)); + } + + @Test + public void union_bitmaskVsBitmask() { + VariantMask mask1 = new VariantMaskBitmaskImpl(new BigInteger("111001100011", 2)); + VariantMask mask2 = new VariantMaskBitmaskImpl(new BigInteger("111010010011", 2)); + VariantMask expected = new VariantMaskBitmaskImpl(new BigInteger("111011110011", 2)); + + assertEquals(expected, mask1.union(mask2)); + } + + @Test + public void union_bitmaskVsSparse() { + // this is essentially a mask for patients 0, 2, 3, 7 (there is 11 padding on both ends) + VariantMask mask1 = new VariantMaskBitmaskImpl(new BigInteger("111000110111", 2)); + VariantMask mask2 = new VariantMaskSparseImpl(Set.of(0, 3, 6)); + VariantMask expected = new VariantMaskBitmaskImpl(new BigInteger("111100110111", 2)); + + assertEquals(expected, mask1.union(mask2)); + } + + @Test + public void union_sparseVsBitmask() { + VariantMask mask1 = new VariantMaskSparseImpl(Set.of(4, 7)); + VariantMask mask2 = new VariantMaskBitmaskImpl(new BigInteger("110111110111", 2)); + VariantMask expected = new VariantMaskBitmaskImpl(new BigInteger("111111110111", 2)); + + assertEquals(expected, mask1.union(mask2)); + } + + @Test + public void union_sparseVsSparse() { + VariantMask mask1 = new VariantMaskSparseImpl(Set.of(0, 2, 4, 6)); + VariantMask mask2 = new VariantMaskSparseImpl(Set.of(1, 5, 7)); + VariantMask expected = new VariantMaskSparseImpl(Set.of(0, 1, 2, 4, 5, 6, 7)); + + assertEquals(expected, mask1.union(mask2)); + } +} diff --git a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMasksSerializationTest.java b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMasksSerializationTest.java index d3b12fc2..6bdbb3fc 100644 --- a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMasksSerializationTest.java +++ b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMasksSerializationTest.java @@ -5,6 +5,7 @@ import org.junit.jupiter.api.Test; import java.math.BigInteger; +import java.util.Set; import java.util.concurrent.ThreadLocalRandom; import static org.junit.jupiter.api.Assertions.*; @@ -16,25 +17,25 @@ public void testFieldMaxLength() throws JsonProcessingException { VariantMasks variantMasks = new VariantMasks(); StringBuilder homozygousStr = new StringBuilder(); - for (int i = 0 ; i < 5000; i++) { + for (int i = 0 ; i < 50000; i++) { homozygousStr.append(ThreadLocalRandom.current().nextInt(0, 10)); } variantMasks.homozygousMask = new BigInteger(homozygousStr.toString()); StringBuilder heterozygousStr = new StringBuilder(); - for (int i = 0 ; i < 5000; i++) { + for (int i = 0 ; i < 50000; i++) { heterozygousStr.append(ThreadLocalRandom.current().nextInt(0, 10)); } variantMasks.heterozygousMask = new BigInteger(heterozygousStr.toString()); StringBuilder homozygousNoCallStr = new StringBuilder(); - for (int i = 0 ; i < 5000; i++) { + for (int i = 0 ; i < 50000; i++) { homozygousNoCallStr.append(ThreadLocalRandom.current().nextInt(0, 10)); } variantMasks.homozygousNoCallMask = new BigInteger(homozygousNoCallStr.toString()); StringBuilder heterozygousNoCallStr = new StringBuilder(); - for (int i = 0 ; i < 5000; i++) { + for (int i = 0 ; i < 50000; i++) { heterozygousNoCallStr.append(ThreadLocalRandom.current().nextInt(0, 10)); } variantMasks.heterozygousNoCallMask = new BigInteger(heterozygousNoCallStr.toString()); @@ -45,4 +46,22 @@ public void testFieldMaxLength() throws JsonProcessingException { assertEquals(variantMasks, deserialized); } + + + + @Test + public void testVariableVariantMasks() throws JsonProcessingException { + VariableVariantMasks variableVariantMasks = new VariableVariantMasks(); + variableVariantMasks.heterozygousMask = new VariantMaskSparseImpl(Set.of(1, 2, 3)); + variableVariantMasks.homozygousMask = new VariantMaskBitmaskImpl(new BigInteger("1101010101010101011")); + variableVariantMasks.heterozygousNoCallMask = new VariantMaskSparseImpl(Set.of()); + variableVariantMasks.homozygousNoCallMask = null; + + ObjectMapper objectMapper = new ObjectMapper(); + String serialized = objectMapper.writeValueAsString(variableVariantMasks); + System.out.println(serialized); + VariableVariantMasks deserialized = objectMapper.readValue(serialized, VariableVariantMasks.class); + + assertEquals(variableVariantMasks, deserialized); + } } diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index 88b4fd14..ae3a06e6 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -1,5 +1,6 @@ package edu.harvard.hms.dbmi.avillach.hpds.etl.genotype; +import com.google.common.base.Joiner; import com.google.common.collect.Sets; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.*; import edu.harvard.hms.dbmi.avillach.hpds.data.storage.FileBackedStorageVariantMasksImpl; @@ -9,14 +10,11 @@ import org.slf4j.LoggerFactory; import java.io.*; -import java.math.BigInteger; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentSkipListSet; import java.util.stream.Collectors; import java.util.stream.Stream; -import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; public class GenomicDatasetMerger { @@ -25,6 +23,9 @@ public class GenomicDatasetMerger { private final VariantStore variantStore1; private final VariantStore variantStore2; + private final int variantStore1PatientCount; + private final int variantStore2PatientCount; + private final Map infoStores1; private final Map infoStores2; @@ -34,7 +35,9 @@ public class GenomicDatasetMerger { public GenomicDatasetMerger(VariantStore variantStore1, VariantStore variantStore2, Map infoStores1, Map infoStores2, String outputDirectory) { this.variantStore1 = variantStore1; + this.variantStore1PatientCount = variantStore1.getPatientIds().length; this.variantStore2 = variantStore2; + this.variantStore2PatientCount = variantStore2.getPatientIds().length; this.mergedVariantStore = new VariantStore(); this.infoStores1 = infoStores1; this.infoStores2 = infoStores2; @@ -55,11 +58,13 @@ private void validate() { } } - public VariantStore mergeVariantStore(Map>> mergedChromosomeMasks) { + public VariantStore mergeVariantStore(Map>> mergedChromosomeMasks) { mergedVariantStore.setVariantMaskStorage(mergedChromosomeMasks); - mergedVariantStore.setPatientIds(mergePatientIds()); return mergedVariantStore; } + public void mergePatients() { + mergedVariantStore.setPatientIds(mergePatientIds()); + } /** * Since both sets of variant indexes reference a different variant spec list, we need to re-index @@ -88,7 +93,7 @@ public Map mergeVariantIndexes() throws String[] variantIndex2 = variantStore2.getVariantSpecIndex(); Map variantSpecToIndexMap = new HashMap<>(); - LinkedList variantSpecList = new LinkedList<>(Arrays.asList(variantIndex1)); + LinkedList variantSpecList = new LinkedList<>(List.of(variantIndex1)); for (int i = 0; i < variantIndex1.length; i++) { variantSpecToIndexMap.put(variantIndex1[i], i); } @@ -119,7 +124,7 @@ public Map mergeVariantIndexes() throws throw new IllegalStateException("Info stores do not match"); } //for (Map.Entry infoStores1Entry : infoStores1.entrySet()) { - infoStores1.entrySet().parallelStream().forEach(infoStores1Entry -> { + infoStores1.entrySet().stream().forEach(infoStores1Entry -> { FileBackedByteIndexedInfoStore infoStore2 = infoStores2.get(infoStores1Entry.getKey()); FileBackedByteIndexedStorage allValuesStore1 = infoStores1Entry.getValue().getAllValues(); @@ -165,8 +170,8 @@ private String[] mergePatientIds() { * For each chromosome, call mergeChromosomeMask to merge the masks * @return */ - public Map>> mergeChromosomeMasks() { - Map>> mergedMaskStorage = new ConcurrentHashMap<>(); + public Map>> mergeChromosomeMasks() { + Map>> mergedMaskStorage = new ConcurrentHashMap<>(); variantStore1.getVariantMaskStorage().keySet().forEach(chromosome -> { try { mergedMaskStorage.put(chromosome, mergeChromosomeMask(chromosome)); @@ -219,77 +224,74 @@ public Map> mergeChromosomeMask(String chromosome) throws FileNotFoundException { - FileBackedJsonIndexStorage> variantMaskStorage1 = variantStore1.getVariantMaskStorage().get(chromosome); - FileBackedJsonIndexStorage> variantMaskStorage2 = variantStore2.getVariantMaskStorage().get(chromosome); + public FileBackedJsonIndexStorage> mergeChromosomeMask(String chromosome) throws FileNotFoundException { + FileBackedJsonIndexStorage> variantMaskStorage1 = variantStore1.getVariantMaskStorage().get(chromosome); + FileBackedJsonIndexStorage> variantMaskStorage2 = variantStore2.getVariantMaskStorage().get(chromosome); - FileBackedJsonIndexStorage> merged = new FileBackedStorageVariantMasksImpl(new File(outputDirectory + chromosome + "masks.bin")); - variantMaskStorage1.keys().parallelStream().forEach(key -> { - Map masks1 = variantMaskStorage1.get(key); - Map masks2 = variantMaskStorage2.get(key); + FileBackedJsonIndexStorage> merged = new FileBackedStorageVariantMasksImpl(new File(outputDirectory + chromosome + "masks.bin")); + variantMaskStorage1.keys().stream().forEach(key -> { + Map masks1 = variantMaskStorage1.get(key); + Map masks2 = variantMaskStorage2.get(key); if (masks2 == null) { masks2 = Map.of(); } - ConcurrentHashMap mergedMasks = new ConcurrentHashMap<>(); - for (Map.Entry entry : masks1.entrySet()) { - VariantMasks variantMasks2 = masks2.get(entry.getKey()); + ConcurrentHashMap mergedMasks = new ConcurrentHashMap<>(); + for (Map.Entry entry : masks1.entrySet()) { + VariableVariantMasks variantMasks2 = masks2.get(entry.getKey()); if (variantMasks2 == null) { // this will have all null masks, which will result in null when // appended to a null, or be replaced with an empty bitmask otherwise - variantMasks2 = new VariantMasks(); + variantMasks2 = new VariableVariantMasks(); } - mergedMasks.put(entry.getKey(), append(entry.getValue(), variantMasks2)); + + VariableVariantMasks mergeResult = VariableVariantMasks.append(entry.getValue(), variantStore1PatientCount, variantMasks2, variantStore2PatientCount); + mergedMasks.put(entry.getKey(), mergeResult); } // Any entry in the second set that is not in the merged set can be merged with an empty variant mask, // if there were a corresponding entry in set 1, it would have been merged in the previous loop - for (Map.Entry entry : masks2.entrySet()) { + for (Map.Entry entry : masks2.entrySet()) { if (!mergedMasks.containsKey(entry.getKey())) { - mergedMasks.put(entry.getKey(), append(new VariantMasks(), entry.getValue())); + VariableVariantMasks appendedMasks = VariableVariantMasks.append(new VariableVariantMasks(), variantStore1PatientCount, entry.getValue(), variantStore2PatientCount); + mergedMasks.put(entry.getKey(), appendedMasks); } } - merged.put(key, mergedMasks); + if (merged.keys().contains(key)) { + log.warn("Merged already contains key: " + key); + } else { + merged.put(key, mergedMasks); + } }); - ConcurrentHashMap mergedMasks = new ConcurrentHashMap<>(); variantMaskStorage2.keys().forEach(key -> { - Map masks2 = variantMaskStorage2.get(key); - for (Map.Entry entry : masks2.entrySet()) { - if (!mergedMasks.containsKey(entry.getKey())) { - mergedMasks.put(entry.getKey(), append(new VariantMasks(), entry.getValue())); + if (variantMaskStorage1.get(key) == null) { + ConcurrentHashMap mergedMasks = new ConcurrentHashMap<>(); + Map masks2 = variantMaskStorage2.get(key); + for (Map.Entry entry : masks2.entrySet()) { + if (!mergedMasks.containsKey(entry.getKey())) { + VariableVariantMasks appendedMasks = VariableVariantMasks.append(new VariableVariantMasks(), variantStore1PatientCount, entry.getValue(), variantStore2PatientCount); + mergedMasks.put(entry.getKey(), appendedMasks); + } + } + if (merged.keys().contains(key)) { + log.warn("Second loop: merged already contains key: " + key); + } else { + merged.put(key, mergedMasks); } } }); - return merged; - } - public VariantMasks append(VariantMasks variantMasks1, VariantMasks variantMasks2) { - VariantMasks appendedMasks = new VariantMasks(); - appendedMasks.homozygousMask = appendMask(variantMasks1.homozygousMask, variantMasks2.homozygousMask); - appendedMasks.heterozygousMask = appendMask(variantMasks1.heterozygousMask, variantMasks2.heterozygousMask); - appendedMasks.homozygousNoCallMask = appendMask(variantMasks1.homozygousNoCallMask, variantMasks2.homozygousNoCallMask); - appendedMasks.heterozygousNoCallMask = appendMask(variantMasks1.heterozygousNoCallMask, variantMasks2.heterozygousNoCallMask); - return appendedMasks; - } - - /** - * Appends one mask to another. This assumes the masks are both padded with '11' on each end - * to prevent overflow issues. - */ - public BigInteger appendMask(BigInteger mask1, BigInteger mask2) { - if (mask1 == null && mask2 == null) { - return null; - } - if (mask1 == null) { - mask1 = variantStore1.emptyBitmask(); + if (log.isTraceEnabled()) { + merged.keys().stream().sorted().limit(3).forEach(key -> { + ConcurrentHashMap maskMap = merged.get(key); + maskMap.keySet().stream().sorted().limit(5).forEach(variantSpec -> { + VariableVariantMasks variableVariantMasks = maskMap.get(variantSpec); + VariantMask heteroMask = Optional.ofNullable(variableVariantMasks.heterozygousMask).orElse(VariantMask.emptyInstance()); + Set patientsWithVariant = heteroMask.patientMaskToPatientIdSet(List.of(mergedVariantStore.getPatientIds())); + log.trace("Patients with variant [" + variantSpec + "]: " + Joiner.on(",").join(patientsWithVariant)); + }); + }); } - if (mask2 == null) { - mask2 = variantStore2.emptyBitmask(); - } - String binaryMask1 = mask1.toString(2); - String binaryMask2 = mask2.toString(2); - String appendedString = binaryMask1.substring(0, binaryMask1.length() - 2) + - binaryMask2.substring(2); - return new BigInteger(appendedString, 2); + return merged; } } diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java index 70565730..ba3b2936 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java @@ -2,6 +2,7 @@ import com.google.common.base.Preconditions; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.FileBackedByteIndexedInfoStore; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariableVariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantStore; import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJsonIndexStorage; @@ -45,8 +46,9 @@ public static void main(String[] args) throws IOException, ClassNotFoundExceptio Map infoStores2 = loadInfoStores(genomicDirectory2); GenomicDatasetMerger genomicDatasetMerger = new GenomicDatasetMerger(VariantStore.readInstance(genomicDirectory1),VariantStore.readInstance(genomicDirectory2), infoStores1, infoStores2, outputDirectory); + genomicDatasetMerger.mergePatients(); - Map>> mergedChromosomeMasks = genomicDatasetMerger.mergeChromosomeMasks(); + Map>> mergedChromosomeMasks = genomicDatasetMerger.mergeChromosomeMasks(); VariantStore mergedVariantStore = genomicDatasetMerger.mergeVariantStore(mergedChromosomeMasks); Map variantIndexes = genomicDatasetMerger.mergeVariantIndexes(); diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java index 7727aba2..1aed473e 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java @@ -8,6 +8,7 @@ import java.util.stream.Collectors; import java.util.zip.GZIPOutputStream; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.*; import edu.harvard.hms.dbmi.avillach.hpds.data.storage.FileBackedStorageVariantMasksImpl; import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJsonIndexStorage; import org.apache.commons.csv.CSVFormat; @@ -16,9 +17,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.InfoStore; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantStore; import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedByteIndexedStorage; import htsjdk.samtools.util.BlockCompressedInputStream; @@ -60,7 +58,7 @@ public static void main(String[] args) throws FileNotFoundException, IOException private static HashMap zygosityMaskStrings; - private static TreeMap>> variantMaskStorage = new TreeMap<>(); + private static TreeMap>> variantMaskStorage = new TreeMap<>(); private static long startTime; @@ -180,7 +178,7 @@ private static void loadVCFs(File indexFile) throws IOException { int[] count = { 0 }; for (String contig : store.getVariantMaskStorage().keySet()) { ArrayList chunkIds = new ArrayList<>(); - FileBackedByteIndexedStorage> chromosomeStorage = store.getVariantMaskStorage() + FileBackedByteIndexedStorage> chromosomeStorage = store.getVariantMaskStorage() .get(contig); if (chromosomeStorage != null) { // print out the top and bottom 50 variants in the store (that have masks) @@ -188,11 +186,11 @@ private static void loadVCFs(File indexFile) throws IOException { for (Integer chunkId : chunkIds) { for (String variantSpec : chromosomeStorage.get(chunkId).keySet()) { count[0]++; - VariantMasks variantMasks = chromosomeStorage.get(chunkId).get(variantSpec); + VariableVariantMasks variantMasks = chromosomeStorage.get(chunkId).get(variantSpec); if (variantMasks != null) { - BigInteger heterozygousMask = variantMasks.heterozygousMask; + VariantMask heterozygousMask = variantMasks.heterozygousMask; String heteroIdList = sampleIdsForMask(allSampleIds, heterozygousMask); - BigInteger homozygousMask = variantMasks.homozygousMask; + VariantMask homozygousMask = variantMasks.homozygousMask; String homoIdList = sampleIdsForMask(allSampleIds, homozygousMask); if (!heteroIdList.isEmpty() && heteroIdList.length() < 1000) @@ -210,11 +208,11 @@ private static void loadVCFs(File indexFile) throws IOException { int chunkId = chunkIds.get(x); chromosomeStorage.get(chunkId).keySet().forEach((variantSpec) -> { count[0]++; - VariantMasks variantMasks = chromosomeStorage.get(chunkId).get(variantSpec); + VariableVariantMasks variantMasks = chromosomeStorage.get(chunkId).get(variantSpec); if (variantMasks != null) { - BigInteger heterozygousMask = variantMasks.heterozygousMask; + VariantMask heterozygousMask = variantMasks.heterozygousMask; String heteroIdList = sampleIdsForMask(allSampleIds, heterozygousMask); - BigInteger homozygousMask = variantMasks.homozygousMask; + VariantMask homozygousMask = variantMasks.homozygousMask; String homoIdList = sampleIdsForMask(allSampleIds, homozygousMask); if (!heteroIdList.isEmpty() && heteroIdList.length() < 1000) @@ -234,16 +232,23 @@ private static void loadVCFs(File indexFile) throws IOException { saveVariantStore(store, variantMaskStorage); } - private static String sampleIdsForMask(String[] sampleIds, BigInteger heterozygousMask) { - String idList = ""; - if (heterozygousMask != null) { - for (int x = 2; x < heterozygousMask.bitLength() - 2; x++) { - if (heterozygousMask.testBit(heterozygousMask.bitLength() - 1 - x)) { - idList += sampleIds[x - 2] + ","; + private static String sampleIdsForMask(String[] sampleIds, VariantMask variantMask) { + StringBuilder idList = new StringBuilder(); + if (variantMask != null) { + if (variantMask instanceof VariantMaskBitmaskImpl) { + BigInteger mask = ((VariantMaskBitmaskImpl) variantMask).getBitmask(); + for (int x = 0; x < mask.bitLength() - 4; x++) { + if (variantMask.testBit(x)) { + idList.append(sampleIds[x]).append(","); + } + } + } else if (variantMask instanceof VariantMaskSparseImpl) { + for (Integer patientIndex : ((VariantMaskSparseImpl) variantMask).getPatientIndexes()) { + idList.append(sampleIds[patientIndex]).append(","); } } } - return idList; + return idList.toString(); } private static void flipChunk(String lastContigProcessed, int lastChunkProcessed, int currentChunk, @@ -276,7 +281,7 @@ private static void flipChunk(String lastContigProcessed, int lastChunkProcessed } if (!currentContig.contentEquals(lastContigProcessed) || currentChunk > lastChunkProcessed || isLastChunk) { // flip chunk - TreeMap>> variantMaskStorage_f = variantMaskStorage; + TreeMap>> variantMaskStorage_f = variantMaskStorage; HashMap zygosityMaskStrings_f = zygosityMaskStrings; String lastContigProcessed_f = lastContigProcessed; int lastChunkProcessed_f = lastChunkProcessed; @@ -306,10 +311,10 @@ private static void flipChunk(String lastContigProcessed, int lastChunkProcessed } private static void saveVariantStore(VariantStore store, - TreeMap>> variantMaskStorage) + TreeMap>> variantMaskStorage) throws IOException, FileNotFoundException { store.setVariantMaskStorage(variantMaskStorage); - for (FileBackedByteIndexedStorage> storage : variantMaskStorage + for (FileBackedByteIndexedStorage> storage : variantMaskStorage .values()) { if (storage != null) storage.complete(); @@ -361,11 +366,11 @@ private static void shutdownChunkWriteExecutor() { } } - private static ConcurrentHashMap convertLoadingMapToMaskMap( + private static ConcurrentHashMap convertLoadingMapToMaskMap( HashMap zygosityMaskStrings_f) { - ConcurrentHashMap maskMap = new ConcurrentHashMap<>(zygosityMaskStrings_f.size()); + ConcurrentHashMap maskMap = new ConcurrentHashMap<>(zygosityMaskStrings_f.size()); zygosityMaskStrings_f.entrySet().parallelStream().forEach((entry) -> { - maskMap.put(entry.getKey(), new VariantMasks(entry.getValue())); + maskMap.put(entry.getKey(), new VariableVariantMasks(entry.getValue())); }); return maskMap; } @@ -465,8 +470,19 @@ private void setMasksForSample(char[][] zygosityMaskStrings, int index, int star } private String currentSpecNotation() { + String[] variantInfo = currentLineSplit[7].split("[=;]"); + String gene = "NULL"; + String consequence = "NULL"; + for (int i = 0; i < variantInfo.length; i = i + 2) { + if ("Gene_with_variant".equals(variantInfo[i])) { + gene = variantInfo[i + 1]; + } + if ("Variant_consequence_calculated".equals(variantInfo[i])) { + consequence = variantInfo[i + 1]; + } + } return currentLineSplit[0] + "," + currentLineSplit[1] + "," + currentLineSplit[3] + "," - + currentLineSplit[4]; + + currentLineSplit[4] + "," + gene + "," + consequence; } public void readHeaders(ConcurrentHashMap infoStores) throws IOException { diff --git a/etl/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndexTest.java b/etl/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndexTest.java index df340f84..58e1d48a 100644 --- a/etl/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndexTest.java +++ b/etl/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndexTest.java @@ -9,6 +9,7 @@ import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; import edu.harvard.hms.dbmi.avillach.hpds.etl.genotype.VariantMetadataLoader; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.springframework.test.context.event.annotation.BeforeTestClass; @@ -16,6 +17,8 @@ import static org.springframework.test.util.AssertionErrors.assertEquals; import static org.springframework.test.util.AssertionErrors.assertNotNull; +// todo: enable when variant explorer is implemented +@Disabled public class VariantMetadataIndexTest { //From file 1 14 19038291 rs550062154 C A diff --git a/genomic-processor/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/genomic/GenomicProcessorController.java b/genomic-processor/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/genomic/GenomicProcessorController.java index ca08b584..f6033fc7 100644 --- a/genomic-processor/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/genomic/GenomicProcessorController.java +++ b/genomic-processor/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/genomic/GenomicProcessorController.java @@ -1,6 +1,7 @@ package edu.harvard.hms.dbmi.avillach.hpds.genomic; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.InfoColumnMeta; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMask; import edu.harvard.hms.dbmi.avillach.hpds.processing.DistributableQuery; import edu.harvard.hms.dbmi.avillach.hpds.processing.GenomicProcessor; import org.springframework.beans.factory.annotation.Autowired; @@ -24,7 +25,7 @@ public GenomicProcessorController(GenomicProcessor genomicProcessor) { } @PostMapping("/patients") - public Mono queryForPatientMask(@RequestBody DistributableQuery distributableQuery) { + public Mono queryForPatientMask(@RequestBody DistributableQuery distributableQuery) { return genomicProcessor.getPatientMask(distributableQuery); } diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessor.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessor.java index 659aea5a..21bb70ea 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessor.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessor.java @@ -133,7 +133,7 @@ protected Set idSetsForEachFilter(Query query) { // NULL (representing no phenotypic filters, i.e. all patients) or not empty patient ID sets require a genomic query. // Otherwise, short circuit and return no patients if ((distributableQuery.getPatientIds() == null || !distributableQuery.getPatientIds().isEmpty()) && distributableQuery.hasFilters()) { - Mono patientMaskForVariantInfoFilters = genomicProcessor.getPatientMask(distributableQuery); + Mono patientMaskForVariantInfoFilters = genomicProcessor.getPatientMask(distributableQuery); return patientMaskForVariantInfoFilters.map(genomicProcessor::patientMaskToPatientIdSet).block(); } return distributableQuery.getPatientIds(); @@ -360,12 +360,12 @@ public List getPatientIds() { return genomicProcessor.getPatientIds(); } - public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { + public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { return genomicProcessor.getMasks(path, variantMasksVariantBucketHolder); } // todo: handle this locally, we do not want this in the genomic processor - protected BigInteger createMaskForPatientSet(Set patientSubset) { + protected VariantMask createMaskForPatientSet(Set patientSubset) { return genomicProcessor.createMaskForPatientSet(patientSubset); } } diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessor.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessor.java index 754f5ab0..8759338b 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessor.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessor.java @@ -1,6 +1,8 @@ package edu.harvard.hms.dbmi.avillach.hpds.processing; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.InfoColumnMeta; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariableVariantMasks; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMask; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; import reactor.core.publisher.Mono; @@ -12,17 +14,17 @@ import java.util.Set; public interface GenomicProcessor { - Mono getPatientMask(DistributableQuery distributableQuery); + Mono getPatientMask(DistributableQuery distributableQuery); - Set patientMaskToPatientIdSet(BigInteger patientMask); + Set patientMaskToPatientIdSet(VariantMask patientMask); - BigInteger createMaskForPatientSet(Set patientSubset); + VariantMask createMaskForPatientSet(Set patientSubset); Mono> getVariantList(DistributableQuery distributableQuery); List getPatientIds(); - Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder); + Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder); Set getInfoStoreColumns(); diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNoOp.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNoOp.java index 4dce5b37..11013ffd 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNoOp.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNoOp.java @@ -1,6 +1,8 @@ package edu.harvard.hms.dbmi.avillach.hpds.processing; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.InfoColumnMeta; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariableVariantMasks; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMask; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; import reactor.core.publisher.Mono; @@ -13,17 +15,17 @@ public class GenomicProcessorNoOp implements GenomicProcessor { @Override - public Mono getPatientMask(DistributableQuery distributableQuery) { + public Mono getPatientMask(DistributableQuery distributableQuery) { return null; } @Override - public Set patientMaskToPatientIdSet(BigInteger patientMask) { + public Set patientMaskToPatientIdSet(VariantMask patientMask) { return null; } @Override - public BigInteger createMaskForPatientSet(Set patientSubset) { + public VariantMask createMaskForPatientSet(Set patientSubset) { return null; } @@ -38,7 +40,7 @@ public List getPatientIds() { } @Override - public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { + public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { return Optional.empty(); } diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNodeImpl.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNodeImpl.java index 9e1c6d7f..a030fde0 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNodeImpl.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNodeImpl.java @@ -3,9 +3,7 @@ import com.google.common.base.Joiner; import com.google.common.collect.Range; import com.google.common.collect.Sets; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.FileBackedByteIndexedInfoStore; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.InfoColumnMeta; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.*; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; import edu.harvard.hms.dbmi.avillach.hpds.data.query.Filter; import edu.harvard.hms.dbmi.avillach.hpds.data.query.Query; @@ -77,10 +75,10 @@ public GenomicProcessorNodeImpl(String genomicDataDirectory) { } @Override - public Mono getPatientMask(DistributableQuery distributableQuery) { + public Mono getPatientMask(DistributableQuery distributableQuery) { return Mono.fromCallable(() -> runGetPatientMask(distributableQuery)).subscribeOn(Schedulers.boundedElastic()); } - public BigInteger runGetPatientMask(DistributableQuery distributableQuery) { + public VariantMask runGetPatientMask(DistributableQuery distributableQuery) { // log.debug("filterdIDSets START size: " + filteredIdSets.size()); /* VARIANT INFO FILTER HANDLING IS MESSY */ if(distributableQuery.hasFilters()) { @@ -103,7 +101,7 @@ public BigInteger runGetPatientMask(DistributableQuery distributableQuery) { // todo: handle empty getVariantInfoFilters() // add filteredIdSet for patients who have matching variants, heterozygous or homozygous for now. - BigInteger patientMask = null; + VariantMask patientMask = null; if (intersectionOfInfoFilters != null ){ patientMask = patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(distributableQuery.getPatientIds(), intersectionOfInfoFilters); } else { @@ -111,24 +109,24 @@ public BigInteger runGetPatientMask(DistributableQuery distributableQuery) { } - VariantBucketHolder variantMasksVariantBucketHolder = new VariantBucketHolder<>(); + VariantBucketHolder variantMasksVariantBucketHolder = new VariantBucketHolder<>(); if (!distributableQuery.getRequiredFields().isEmpty() ) { for (String variantSpec : distributableQuery.getRequiredFields()) { - BigInteger patientsForVariantSpec = getIdSetForVariantSpecCategoryFilter(new String[]{"0/1", "1/1"}, variantSpec, variantMasksVariantBucketHolder); + VariantMask patientsForVariantSpec = getIdSetForVariantSpecCategoryFilter(new String[]{"0/1", "1/1"}, variantSpec, variantMasksVariantBucketHolder); if (patientMask == null) { patientMask = patientsForVariantSpec; } else { - patientMask = patientMask.and(patientsForVariantSpec); + patientMask = patientMask.intersection(patientsForVariantSpec); } } } if (!distributableQuery.getCategoryFilters().isEmpty()) { for (Map.Entry categoryFilterEntry : distributableQuery.getCategoryFilters().entrySet()) { - BigInteger patientsForVariantSpec = getIdSetForVariantSpecCategoryFilter(categoryFilterEntry.getValue(), categoryFilterEntry.getKey(), null); + VariantMask patientsForVariantSpec = getIdSetForVariantSpecCategoryFilter(categoryFilterEntry.getValue(), categoryFilterEntry.getKey(), null); if (patientMask == null) { patientMask = patientsForVariantSpec; } else { - patientMask = patientMask.and(patientsForVariantSpec); + patientMask = patientMask.intersection(patientsForVariantSpec); } } } @@ -140,16 +138,8 @@ public BigInteger runGetPatientMask(DistributableQuery distributableQuery) { } @Override - public Set patientMaskToPatientIdSet(BigInteger patientMask) { - Set ids = new HashSet<>(); - String bitmaskString = patientMask.toString(2); - for(int x = 2;x < bitmaskString.length()-2;x++) { - if('1'==bitmaskString.charAt(x)) { - String patientId = variantService.getPatientIds()[x-2].trim(); - ids.add(Integer.parseInt(patientId)); - } - } - return ids; + public Set patientMaskToPatientIdSet(VariantMask patientMask) { + return patientMask.patientMaskToPatientIdSet(getPatientIds()); } private List getVariantsMatchingFilters(Query.VariantInfoFilter filter) { @@ -212,8 +202,8 @@ private List filterInfoCategoryKeys(String[] values, FileBackedByteIndex } @Override - public BigInteger createMaskForPatientSet(Set patientSubset) { - return patientVariantJoinHandler.createMaskForPatientSet(patientSubset); + public VariantMask createMaskForPatientSet(Set patientSubset) { + return new VariantMaskBitmaskImpl(patientVariantJoinHandler.createMaskForPatientSet(patientSubset)); } private FileBackedByteIndexedInfoStore getInfoStore(String column) { @@ -265,24 +255,25 @@ public Collection runGetVariantList(DistributableQuery query) { return unionOfInfoFilters.mapToVariantSpec(variantService.getVariantIndex()); } - BigInteger patientMasks = createMaskForPatientSet(patientSubset); + VariantMask patientMasks = createMaskForPatientSet(patientSubset); Set unionOfInfoFiltersVariantSpecs = unionOfInfoFilters.mapToVariantSpec(variantService.getVariantIndex()); Collection variantsInScope = variantService.filterVariantSetForPatientSet(unionOfInfoFiltersVariantSpecs, new ArrayList<>(patientSubset)); //NC - this is the original variant filtering, which checks the patient mask from each variant against the patient mask from the query if(variantsInScope.size()<100000) { - ConcurrentSkipListSet variantsWithPatients = new ConcurrentSkipListSet(); + ConcurrentSkipListSet variantsWithPatients = new ConcurrentSkipListSet<>(); variantsInScope.parallelStream().forEach(variantKey -> { variantService.getMasks(variantKey, new VariantBucketHolder<>()).ifPresent(masks -> { - if ( masks.heterozygousMask != null && masks.heterozygousMask.and(patientMasks).bitCount()>4) { + // todo: implement for variant explorer + /*if ( masks.heterozygousMask != null && masks.heterozygousMask.intersection(patientMasks).bitCount()>4) { variantsWithPatients.add(variantKey); - } else if ( masks.homozygousMask != null && masks.homozygousMask.and(patientMasks).bitCount()>4) { + } else if ( masks.homozygousMask != null && masks.homozygousMask.intersection(patientMasks).bitCount()>4) { variantsWithPatients.add(variantKey); - } else if ( masks.heterozygousNoCallMask != null && masks.heterozygousNoCallMask.and(patientMasks).bitCount()>4) { + } else if ( masks.heterozygousNoCallMask != null && masks.heterozygousNoCallMask.intersection(patientMasks).bitCount()>4) { //so heterozygous no calls we want, homozygous no calls we don't variantsWithPatients.add(variantKey); - } + }*/ }); }); return variantsWithPatients; @@ -293,34 +284,35 @@ public Collection runGetVariantList(DistributableQuery query) { return new ArrayList<>(); } - private BigInteger getIdSetForVariantSpecCategoryFilter(String[] zygosities, String key, VariantBucketHolder bucketCache) { - List variantBitmasks = getBitmasksForVariantSpecCategoryFilter(zygosities, key, bucketCache); + private VariantMask getIdSetForVariantSpecCategoryFilter(String[] zygosities, String key, VariantBucketHolder bucketCache) { + List variantBitmasks = getBitmasksForVariantSpecCategoryFilter(zygosities, key, bucketCache); Set patientIds = new HashSet<>(); if(!variantBitmasks.isEmpty()) { - BigInteger bitmask = variantBitmasks.get(0); + VariantMask variantMask = variantBitmasks.get(0); if(variantBitmasks.size()>1) { for(int x = 1;x()); + return VariantMask.emptyInstance(); } - private ArrayList getBitmasksForVariantSpecCategoryFilter(String[] zygosities, String variantName, VariantBucketHolder bucketCache) { - ArrayList variantBitmasks = new ArrayList<>(); + private ArrayList getBitmasksForVariantSpecCategoryFilter(String[] zygosities, String variantName, VariantBucketHolder bucketCache) { + ArrayList variantBitmasks = new ArrayList<>(); variantName = variantName.replaceAll(",\\d/\\d$", ""); log.debug("looking up mask for : " + variantName); - Optional optionalMasks = variantService.getMasks(variantName, bucketCache); + Optional optionalMasks = variantService.getMasks(variantName, bucketCache); Arrays.stream(zygosities).forEach((zygosity) -> { optionalMasks.ifPresent(masks -> { if(zygosity.equals(HOMOZYGOUS_REFERENCE)) { - BigInteger homozygousReferenceBitmask = calculateIndiscriminateBitmask(masks); + // todo: implement for VariantMask. I don't think this logic was sound previously + /*VariantMask homozygousReferenceBitmask = calculateIndiscriminateBitmask(masks); for(int x = 2;x getBitmasksForVariantSpecCategoryFilter(String[] z } }); if (optionalMasks.isEmpty()) { - variantBitmasks.add(variantService.emptyBitmask()); + variantBitmasks.add(new VariantMaskBitmaskImpl(variantService.emptyBitmask())); } }); @@ -342,16 +334,16 @@ private ArrayList getBitmasksForVariantSpecCategoryFilter(String[] z * @param masks * @return */ - private BigInteger calculateIndiscriminateBitmask(VariantMasks masks) { - BigInteger indiscriminateVariantBitmask = null; + private VariantMask calculateIndiscriminateBitmask(VariableVariantMasks masks) { + VariantMask indiscriminateVariantBitmask = null; if(masks.heterozygousMask == null && masks.homozygousMask != null) { indiscriminateVariantBitmask = masks.homozygousMask; }else if(masks.homozygousMask == null && masks.heterozygousMask != null) { indiscriminateVariantBitmask = masks.heterozygousMask; }else if(masks.homozygousMask != null && masks.heterozygousMask != null) { - indiscriminateVariantBitmask = masks.heterozygousMask.or(masks.homozygousMask); + indiscriminateVariantBitmask = masks.heterozygousMask.intersection(masks.homozygousMask); }else { - indiscriminateVariantBitmask = variantService.emptyBitmask(); + indiscriminateVariantBitmask = new VariantMaskBitmaskImpl(variantService.emptyBitmask()); } return indiscriminateVariantBitmask; } @@ -362,7 +354,7 @@ public List getPatientIds() { } @Override - public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { + public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { return variantService.getMasks(path, variantMasksVariantBucketHolder); } diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImpl.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImpl.java index 0806dff8..ea9ea853 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImpl.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImpl.java @@ -3,8 +3,7 @@ import com.google.common.cache.CacheBuilder; import com.google.common.cache.CacheLoader; import com.google.common.cache.LoadingCache; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.InfoColumnMeta; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.*; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -48,28 +47,20 @@ public GenomicProcessorParentImpl(List nodes) { } @Override - public Mono getPatientMask(DistributableQuery distributableQuery) { - Mono result = Flux.just(nodes.toArray(GenomicProcessor[]::new)) + public Mono getPatientMask(DistributableQuery distributableQuery) { + Mono result = Flux.just(nodes.toArray(GenomicProcessor[]::new)) .flatMap(node -> node.getPatientMask(distributableQuery)) - .reduce(BigInteger::or); + .reduce(VariantMask::union); return result; } @Override - public Set patientMaskToPatientIdSet(BigInteger patientMask) { - Set ids = new HashSet<>(); - String bitmaskString = patientMask.toString(2); - for(int x = 2;x < bitmaskString.length()-2;x++) { - if('1'==bitmaskString.charAt(x)) { - String patientId = getPatientIds().get(x-2).trim(); - ids.add(Integer.parseInt(patientId)); - } - } - return ids; + public Set patientMaskToPatientIdSet(VariantMask patientMask) { + return patientMask.patientMaskToPatientIdSet(getPatientIds()); } @Override - public BigInteger createMaskForPatientSet(Set patientSubset) { + public VariantMask createMaskForPatientSet(Set patientSubset) { throw new RuntimeException("Not implemented"); } @@ -111,10 +102,10 @@ private List initializePatientIds() { } @Override - public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { + public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { // TODO: test. only used in variant explorer for (GenomicProcessor node : nodes) { - Optional masks = node.getMasks(path, variantMasksVariantBucketHolder); + Optional masks = node.getMasks(path, variantMasksVariantBucketHolder); if (masks.isPresent()) { return masks; } diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImpl.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImpl.java index 35e80ceb..253ee874 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImpl.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImpl.java @@ -4,8 +4,7 @@ import com.google.common.cache.CacheLoader; import com.google.common.cache.LoadingCache; import com.google.common.collect.ImmutableList; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.InfoColumnMeta; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.*; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,38 +45,50 @@ public GenomicProcessorPatientMergingParentImpl(List nodes) { infoColumnsMeta = initInfoColumnsMeta(); } + private class SizedVariantMask { + private final VariantMask variantMask; + private final int size; + + public VariantMask getVariantMask() { + return variantMask; + } + + public int getSize() { + return size; + } + + public SizedVariantMask(VariantMask variantMask, int size) { + this.variantMask = variantMask; + this.size = size; + } + } + @Override - public Mono getPatientMask(DistributableQuery distributableQuery) { - Mono result = Flux.just(nodes.toArray(GenomicProcessor[]::new)) - .flatMapSequential(node -> node.getPatientMask(distributableQuery)) - .reduce(this::appendMask); + public Mono getPatientMask(DistributableQuery distributableQuery) { + Mono result = Flux.just(nodes.toArray(GenomicProcessor[]::new)) + .flatMapSequential(node -> { + return node.getPatientMask(distributableQuery).map(mask -> { + return new SizedVariantMask(mask, node.getPatientIds().size()); + }); + }) + .reduce(this::appendMask) + .map(SizedVariantMask::getVariantMask); return result; } - public BigInteger appendMask(BigInteger mask1, BigInteger mask2) { - String binaryMask1 = mask1.toString(2); - String binaryMask2 = mask2.toString(2); - String appendedString = binaryMask1.substring(0, binaryMask1.length() - 2) + - binaryMask2.substring(2); - return new BigInteger(appendedString, 2); + /** A little bit of a hack for now since the masks don't have sizes at this point and they are needed to merge + */ + public SizedVariantMask appendMask(SizedVariantMask mask1, SizedVariantMask mask2) { + return new SizedVariantMask(VariableVariantMasks.appendMask(mask1.variantMask, mask2.variantMask, mask1.size, mask2.size), mask1.size + mask2.size); } @Override - public Set patientMaskToPatientIdSet(BigInteger patientMask) { - Set ids = new HashSet<>(); - String bitmaskString = patientMask.toString(2); - List patientIds = getPatientIds(); - for(int x = 2;x < bitmaskString.length()-2;x++) { - if('1'==bitmaskString.charAt(x)) { - String patientId = patientIds.get(x-2).trim(); - ids.add(Integer.parseInt(patientId)); - } - } - return ids; + public Set patientMaskToPatientIdSet(VariantMask patientMask) { + return patientMask.patientMaskToPatientIdSet(getPatientIds()); } @Override - public BigInteger createMaskForPatientSet(Set patientSubset) { + public VariantMask createMaskForPatientSet(Set patientSubset) { throw new RuntimeException("Not implemented"); } @@ -116,7 +127,7 @@ private List initializePatientIds() { } @Override - public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { + public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { // TODO: implement this. only used in variant explorer throw new RuntimeException("Method not implemented"); } diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java index 58ebc3c5..e5c824fb 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandler.java @@ -3,8 +3,7 @@ import com.google.common.base.Joiner; import com.google.common.collect.Lists; import com.google.common.collect.Sets; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantSpec; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.*; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -25,7 +24,7 @@ public PatientVariantJoinHandler(VariantService variantService) { this.variantService = variantService; } - public BigInteger getPatientIdsForIntersectionOfVariantSets(Set patientSubset, + public VariantMask getPatientIdsForIntersectionOfVariantSets(Set patientSubset, VariantIndex intersectionOfInfoFilters) { if(!intersectionOfInfoFilters.isEmpty()) { @@ -45,10 +44,10 @@ public BigInteger getPatientIdsForIntersectionOfVariantSets(Set patient // If genomic data is sharded by studies, it may be possible that the node this is running in does not have genomic // data for any of the patients in the phenotypic query. In which case, we don't have to look for matching patients if (patientsInScope.isEmpty()) { - return variantService.emptyBitmask(); + return new VariantMaskSparseImpl(Set.of()); } - BigInteger[] matchingPatients = new BigInteger[] {variantService.emptyBitmask()}; + VariantMask[] matchingPatients = new VariantMask[] {new VariantMaskSparseImpl(Set.of())}; Set variantsInScope = intersectionOfInfoFilters.mapToVariantSpec(variantService.getVariantIndex()); @@ -71,23 +70,29 @@ public BigInteger getPatientIdsForIntersectionOfVariantSets(Set patient log.info("and partitioned those into " + variantBucketPartitions.size() + " groups"); int patientsInScopeSize = patientsInScope.size(); - BigInteger patientsInScopeMask = createMaskForPatientSet(patientsInScope); + VariantMask patientsInScopeMask = new VariantMaskBitmaskImpl(createMaskForPatientSet(patientsInScope)); for(int x = 0; - x < variantBucketPartitions.size() && matchingPatients[0].bitCount() < patientsInScopeSize + 4; + x < variantBucketPartitions.size() /*&& matchingPatients[0].bitCount() < patientsInScopeSize + 4*/; x++) { List> variantBuckets = variantBucketPartitions.get(x); variantBuckets.parallelStream().forEach(variantBucket -> { - VariantBucketHolder bucketCache = new VariantBucketHolder<>(); + VariantBucketHolder bucketCache = new VariantBucketHolder<>(); List missingVariants = new ArrayList<>(); variantBucket.forEach(variantSpec -> { - Optional variantMask = variantService.getMasks(variantSpec, bucketCache); + Optional variantMask = variantService.getMasks(variantSpec, bucketCache); variantMask.ifPresentOrElse(masks -> { - BigInteger heteroMask = masks.heterozygousMask == null ? variantService.emptyBitmask() : masks.heterozygousMask; - BigInteger homoMask = masks.homozygousMask == null ? variantService.emptyBitmask() : masks.homozygousMask; - BigInteger orMasks = heteroMask.or(homoMask); - BigInteger andMasks = orMasks.and(patientsInScopeMask); - synchronized(matchingPatients) { - matchingPatients[0] = matchingPatients[0].or(andMasks); + VariantMask heterozygousMask = masks.heterozygousMask; + VariantMask homozygousMask = masks.homozygousMask; + //log.info("Patients with variant " + variantSpec + ": " + (orMasks.bitCount() - 4)); + if (heterozygousMask != null) { + synchronized(matchingPatients) { + matchingPatients[0] = matchingPatients[0].union(heterozygousMask); + } + } + if (homozygousMask != null) { + synchronized(matchingPatients) { + matchingPatients[0] = matchingPatients[0].union(homozygousMask); + } } }, () -> missingVariants.add(variantSpec)); }); @@ -97,17 +102,19 @@ public BigInteger getPatientIdsForIntersectionOfVariantSets(Set patient } }); } - return matchingPatients[0]; + return matchingPatients[0].intersection(patientsInScopeMask); }else { log.error("No matches found for info filters."); - return createMaskForPatientSet(new HashSet<>()); + return new VariantMaskSparseImpl(Set.of()); } } + // todo: return VariantMask public BigInteger createMaskForPatientSet(Set patientSubset) { StringBuilder builder = new StringBuilder("11"); //variant bitmasks are bookended with '11' - for(String patientId : variantService.getPatientIds()) { - Integer idInt = Integer.parseInt(patientId); + + for (int i = variantService.getPatientIds().length - 1; i >= 0; i--) { + Integer idInt = Integer.parseInt(variantService.getPatientIds()[i]); if(patientSubset.contains(idInt)){ builder.append("1"); } else { @@ -116,8 +123,6 @@ public BigInteger createMaskForPatientSet(Set patientSubset) { } builder.append("11"); // masks are bookended with '11' set this so we don't count those -// log.debug("PATIENT MASK: " + builder.toString()); - BigInteger patientMasks = new BigInteger(builder.toString(), 2); return patientMasks; } diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/QueryProcessor.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/QueryProcessor.java index 18a30531..08d83bc6 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/QueryProcessor.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/QueryProcessor.java @@ -4,6 +4,7 @@ import java.util.*; import java.util.stream.Collectors; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariableVariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.ColumnMeta; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -106,7 +107,7 @@ private void processColumn(List paths, TreeSet ids, ResultStore String path = paths.get(x-1); if(VariantUtils.pathIsVariantSpec(path)) { // todo: confirm this entire if block is even used. I don't think it is - Optional masks = abstractProcessor.getMasks(path, new VariantBucketHolder<>()); + Optional masks = abstractProcessor.getMasks(path, new VariantBucketHolder<>()); List patientIds = abstractProcessor.getPatientIds(); int idPointer = 0; @@ -162,7 +163,7 @@ private void writeVariantNullResultField(ResultStore results, Integer x, ByteBuf results.writeField(x,idInSubsetPointer, valueBuffer); } - private int writeVariantResultField(ResultStore results, Integer x, Optional variantMasks, int idPointer, + private int writeVariantResultField(ResultStore results, Integer x, Optional variantMasks, int idPointer, int idInSubsetPointer) { byte[] valueBuffer = variantMasks.map(masks -> { if(masks.heterozygousMask != null && masks.heterozygousMask.testBit(idPointer)) { diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantListProcessor.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantListProcessor.java index 925a2394..7689d750 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantListProcessor.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantListProcessor.java @@ -6,12 +6,10 @@ import java.util.*; import java.util.stream.Collectors; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMetadataIndex; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantSpec; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.PhenoCube; import edu.harvard.hms.dbmi.avillach.hpds.data.query.Query; @@ -185,7 +183,7 @@ public String runVcfExcerptQuery(Query query, boolean includePatientData) throws Set patientSubset = abstractProcessor.getPatientSubsetForQuery(query); log.debug("identified " + patientSubset.size() + " patients from query"); Map patientIndexMap = new LinkedHashMap(); //keep a map for quick index lookups - BigInteger patientMasks = abstractProcessor.createMaskForPatientSet(patientSubset); + VariantMask patientMasks = abstractProcessor.createMaskForPatientSet(patientSubset); int index = 2; //variant bitmasks are bookended with '11' @@ -215,7 +213,7 @@ public String runVcfExcerptQuery(Query query, boolean includePatientData) throws } //End of headers builder.append("\n"); - VariantBucketHolder variantMaskBucketHolder = new VariantBucketHolder(); + VariantBucketHolder variantMaskBucketHolder = new VariantBucketHolder(); //loop over the variants identified, and build an output row metadata.forEach((String variantSpec, String[] variantMetadata)->{ @@ -262,33 +260,22 @@ public String runVcfExcerptQuery(Query query, boolean includePatientData) throws } // todo: deal with empty return - VariantMasks masks = abstractProcessor.getMasks(variantSpec, variantMaskBucketHolder).get(); + VariableVariantMasks masks = abstractProcessor.getMasks(variantSpec, variantMaskBucketHolder).get(); //make strings of 000100 so we can just check 'char at' //so heterozygous no calls we want, homozygous no calls we don't - BigInteger heteroMask = masks.heterozygousMask != null? masks.heterozygousMask : masks.heterozygousNoCallMask != null ? masks.heterozygousNoCallMask : null; - BigInteger homoMask = masks.homozygousMask != null? masks.homozygousMask : null; - - - String heteroMaskString = heteroMask != null ? heteroMask.toString(2) : null; - String homoMaskString = homoMask != null ? homoMask.toString(2) : null; + VariantMask heteroMask = masks.heterozygousMask != null? masks.heterozygousMask : masks.heterozygousNoCallMask != null ? masks.heterozygousNoCallMask : null; + VariantMask homoMask = masks.homozygousMask != null? masks.homozygousMask : null; // Patient count = (hetero mask | homo mask) & patient mask - BigInteger heteroOrHomoMask = orNullableMasks(heteroMask, homoMask); - int patientCount = heteroOrHomoMask == null ? 0 : (heteroOrHomoMask.and(patientMasks).bitCount() - 4); + VariantMask heteroOrHomoMask = orNullableMasks(heteroMask, homoMask); + int patientCount = heteroOrHomoMask == null ? 0 : (heteroOrHomoMask.intersection(patientMasks).bitCount() - 4); int bitCount = masks.heterozygousMask == null? 0 : (masks.heterozygousMask.bitCount() - 4); bitCount += masks.homozygousMask == null? 0 : (masks.homozygousMask.bitCount() - 4); //count how many patients have genomic data available - Integer patientsWithVariantsCount = null; - if(heteroMaskString != null) { - patientsWithVariantsCount = heteroMaskString.length() - 4; - } else if (homoMaskString != null ) { - patientsWithVariantsCount = homoMaskString.length() - 4; - } else { - patientsWithVariantsCount = -1; - } + Integer patientsWithVariantsCount = patientMasks.bitCount() - 4; // (patients with/total) in subset \t (patients with/total) out of subset. @@ -299,15 +286,15 @@ public String runVcfExcerptQuery(Query query, boolean includePatientData) throws StringBuilder patientListBuilder = new StringBuilder(); for(Integer patientIndex : patientIndexMap.values()) { - if(heteroMaskString != null && '1' == heteroMaskString.charAt(patientIndex)) { + if(heteroMask != null && heteroMask.testBit(patientIndex)) { patientListBuilder.append("\t0/1"); - }else if(homoMaskString != null && '1' == homoMaskString.charAt(patientIndex)) { + }else if(homoMask != null && homoMask.testBit(patientIndex)) { patientListBuilder.append("\t1/1"); }else { patientListBuilder.append("\t0/0"); } } - builder.append(patientListBuilder.toString()); + builder.append(patientListBuilder); } builder.append("\n"); @@ -317,10 +304,10 @@ public String runVcfExcerptQuery(Query query, boolean includePatientData) throws return builder.toString(); } - private BigInteger orNullableMasks(BigInteger heteroMask, BigInteger homoMask) { + private VariantMask orNullableMasks(VariantMask heteroMask, VariantMask homoMask) { if (heteroMask != null) { if (homoMask != null) { - return heteroMask.or(homoMask); + return heteroMask.union(homoMask); } return heteroMask; } else { diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java index 46093b8b..eab9dc44 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java @@ -1,6 +1,7 @@ package edu.harvard.hms.dbmi.avillach.hpds.processing; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.BucketIndexBySample; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariableVariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantStore; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; @@ -189,7 +190,7 @@ public String[] getPatientIds() { return variantStore.getPatientIds(); } - public Optional getMasks(String variantName, VariantBucketHolder bucketCache) { + public Optional getMasks(String variantName, VariantBucketHolder bucketCache) { try { return variantStore.getMasks(variantName, bucketCache); } catch (IOException e) { diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/genomic/GenomicProcessorRestClient.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/genomic/GenomicProcessorRestClient.java index 33c6a54b..de12dea0 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/genomic/GenomicProcessorRestClient.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/genomic/GenomicProcessorRestClient.java @@ -1,6 +1,8 @@ package edu.harvard.hms.dbmi.avillach.hpds.processing.genomic; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.InfoColumnMeta; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariableVariantMasks; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMask; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; import edu.harvard.hms.dbmi.avillach.hpds.processing.DistributableQuery; @@ -34,23 +36,23 @@ public GenomicProcessorRestClient(String serviceUrl) { } @Override - public Mono getPatientMask(DistributableQuery distributableQuery) { - Mono result = webClient.post() + public Mono getPatientMask(DistributableQuery distributableQuery) { + Mono result = webClient.post() .uri("/patients") .contentType(MediaType.APPLICATION_JSON) .body(Mono.just(distributableQuery), DistributableQuery.class) .retrieve() - .bodyToMono(BigInteger.class); + .bodyToMono(VariantMask.class); return result; } @Override - public Set patientMaskToPatientIdSet(BigInteger patientMask) { + public Set patientMaskToPatientIdSet(VariantMask patientMask) { throw new RuntimeException("Not Implemented"); } @Override - public BigInteger createMaskForPatientSet(Set patientSubset) { + public VariantMask createMaskForPatientSet(Set patientSubset) { throw new RuntimeException("Not Implemented"); } @@ -77,7 +79,7 @@ public List getPatientIds() { } @Override - public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { + public Optional getMasks(String path, VariantBucketHolder variantMasksVariantBucketHolder) { throw new RuntimeException("Not Implemented"); } diff --git a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessorTest.java b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessorTest.java index 39cb32e9..229a9957 100644 --- a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessorTest.java +++ b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessorTest.java @@ -2,6 +2,7 @@ import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.FileBackedByteIndexedInfoStore; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMaskBitmaskImpl; import edu.harvard.hms.dbmi.avillach.hpds.data.query.Query; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -58,8 +59,8 @@ public void getPatientSubsetForQuery_oneVariantCategoryFilter_indexFound() { Query.VariantInfoFilter variantInfoFilter = new Query.VariantInfoFilter(); variantInfoFilter.categoryVariantInfoFilters = categoryVariantInfoFilters; - when(genomicProcessor.getPatientMask(isA(DistributableQuery.class))).thenReturn(Mono.just(new BigInteger("1100110011"))); - when(genomicProcessor.patientMaskToPatientIdSet(eq(new BigInteger("1100110011")))).thenReturn(Set.of(42, 99)); + when(genomicProcessor.getPatientMask(isA(DistributableQuery.class))).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("1100110011")))); + when(genomicProcessor.patientMaskToPatientIdSet(eq(new VariantMaskBitmaskImpl(new BigInteger("1100110011"))))).thenReturn(Set.of(42, 99)); List variantInfoFilters = List.of(variantInfoFilter); diff --git a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImplTest.java b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImplTest.java index 0ab4a743..77ecff44 100644 --- a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImplTest.java +++ b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImplTest.java @@ -1,5 +1,7 @@ package edu.harvard.hms.dbmi.avillach.hpds.processing; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMask; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMaskBitmaskImpl; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; @@ -50,15 +52,15 @@ public void patientIdInit_patientsDiffer_exception() { @Test public void getPatientMask_validResponses_returnMerged() { DistributableQuery distributableQuery = new DistributableQuery(); - when(mockProcessor1.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("110110000011", 2))); - when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("110001100011", 2))); - when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("110000000111", 2))); + when(mockProcessor1.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("110110000011", 2)))); + when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("110001100011", 2)))); + when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("110000000111", 2)))); parentProcessor = new GenomicProcessorParentImpl(List.of( mockProcessor1, mockProcessor2, mockProcessor3 )); - BigInteger patientMask = parentProcessor.getPatientMask(distributableQuery).block(); - BigInteger expectedPatientMask = new BigInteger("110111100111", 2); + VariantMask patientMask = parentProcessor.getPatientMask(distributableQuery).block(); + VariantMask expectedPatientMask = new VariantMaskBitmaskImpl(new BigInteger("110111100111", 2)); assertEquals(expectedPatientMask, patientMask); } } \ No newline at end of file diff --git a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImplTest.java b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImplTest.java index 06fe6f81..193696fe 100644 --- a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImplTest.java +++ b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImplTest.java @@ -1,5 +1,7 @@ package edu.harvard.hms.dbmi.avillach.hpds.processing; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMask; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMaskBitmaskImpl; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; @@ -37,33 +39,42 @@ public void setup() { @Test public void getPatientMask_validResponses_returnMerged() { DistributableQuery distributableQuery = new DistributableQuery(); - when(mockProcessor1.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("11011011", 2))); - when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("110001100011", 2))); - when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("11000111", 2))); - BigInteger patientMask = patientMergingParent.getPatientMask(distributableQuery).block(); - BigInteger expectedPatientMask = new BigInteger("11011000011000000111", 2); + when(mockProcessor1.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("11011011", 2)))); + when(mockProcessor1.getPatientIds()).thenReturn(List.of("1", "2", "3", "4")); + when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("110001100011", 2)))); + when(mockProcessor2.getPatientIds()).thenReturn(List.of("5", "6", "7", "8", "9", "10", "11", "12")); + when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("11000111", 2)))); + when(mockProcessor3.getPatientIds()).thenReturn(List.of("15", "16", "17", "18")); + VariantMask patientMask = patientMergingParent.getPatientMask(distributableQuery).block(); + VariantMask expectedPatientMask = new VariantMaskBitmaskImpl(new BigInteger("11000100011000011011", 2)); assertEquals(expectedPatientMask, patientMask); } @Test public void getPatientMask_noPatientResponses_returnMerged() { DistributableQuery distributableQuery = new DistributableQuery(); - when(mockProcessor1.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("11011011", 2))); - when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("110000000011", 2))); - when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("11000011", 2))); - BigInteger patientMask = patientMergingParent.getPatientMask(distributableQuery).block(); - BigInteger expectedPatientMask = new BigInteger("11011000000000000011", 2); + when(mockProcessor1.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("11011011", 2)))); + when(mockProcessor1.getPatientIds()).thenReturn(List.of("1", "2", "3", "4")); + when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("110000000011", 2)))); + when(mockProcessor2.getPatientIds()).thenReturn(List.of("5", "6", "7", "8", "9", "10", "11", "12")); + when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("11000011", 2)))); + when(mockProcessor3.getPatientIds()).thenReturn(List.of("15", "16", "17", "18")); + VariantMask patientMask = patientMergingParent.getPatientMask(distributableQuery).block(); + VariantMask expectedPatientMask = new VariantMaskBitmaskImpl(new BigInteger("11000000000000011011", 2)); assertEquals(expectedPatientMask, patientMask); } @Test public void getPatientMask_emptyResponses_returnMerged() { DistributableQuery distributableQuery = new DistributableQuery(); - when(mockProcessor1.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("11011011", 2))); - when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("1111", 2))); - when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new BigInteger("11000111", 2))); - BigInteger patientMask = patientMergingParent.getPatientMask(distributableQuery).block(); - BigInteger expectedPatientMask = new BigInteger("110110000111", 2); + when(mockProcessor1.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("11011011", 2)))); + when(mockProcessor1.getPatientIds()).thenReturn(List.of("1", "2", "3", "4")); + when(mockProcessor2.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("1111", 2)))); + when(mockProcessor2.getPatientIds()).thenReturn(List.of()); + when(mockProcessor3.getPatientMask(distributableQuery)).thenReturn(Mono.just(new VariantMaskBitmaskImpl(new BigInteger("11000111", 2)))); + when(mockProcessor3.getPatientIds()).thenReturn(List.of("5", "6", "7", "8")); + VariantMask patientMask = patientMergingParent.getPatientMask(distributableQuery).block(); + VariantMask expectedPatientMask = new VariantMaskBitmaskImpl(new BigInteger("110001011011", 2)); assertEquals(expectedPatientMask, patientMask); } diff --git a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandlerTest.java b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandlerTest.java index c2d5f252..1ab10a7b 100644 --- a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandlerTest.java +++ b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/PatientVariantJoinHandlerTest.java @@ -1,5 +1,8 @@ package edu.harvard.hms.dbmi.avillach.hpds.processing; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariableVariantMasks; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMask; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMaskBitmaskImpl; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; @@ -8,6 +11,7 @@ import org.springframework.test.context.event.annotation.BeforeTestClass; import java.math.BigInteger; +import java.util.List; import java.util.Optional; import java.util.Set; import java.util.TreeSet; @@ -25,7 +29,7 @@ public class PatientVariantJoinHandlerTest { public static final String[] PATIENT_IDS = {"101", "102", "103", "104", "105", "106", "107", "108"}; public static final Set PATIENT_IDS_INTEGERS = Set.of(PATIENT_IDS).stream().map(Integer::parseInt).collect(Collectors.toSet()); - public static final String[] VARIANT_INDEX = {"16,61642243,A,T", "16,61642252,A,G", "16,61642256,C,T", "16,61642257,G,A", "16,61642258,G,A", "16,61642259,G,A", "16,61642260,G,A", "16,61642261,G,A"}; + public static final String[] VARIANT_INDEX = {"16,61642243,A,T,ABC,consequence1", "16,61642252,A,G,ABC,consequence1", "16,61642256,C,T,ABC,consequence2", "16,61642257,G,A,ABC,consequence3", "16,61642258,G,A,ABC,consequence3", "16,61642259,G,A,ABC,consequence1", "16,61642260,G,A,ABC,consequence1", "16,61642261,G,A,ABC,consequence1"}; public PatientVariantJoinHandlerTest(@Mock VariantService variantService) { this.variantService = variantService; @@ -39,18 +43,18 @@ public void getPatientIdsForIntersectionOfVariantSets_allPatientsMatchOneVariant when(variantService.getPatientIds()).thenReturn(PATIENT_IDS); when(variantService.emptyBitmask()).thenReturn(emptyBitmask(PATIENT_IDS)); - BigInteger maskForAllPatients = patientVariantJoinHandler.createMaskForPatientSet(PATIENT_IDS_INTEGERS); - BigInteger maskForNoPatients = patientVariantJoinHandler.createMaskForPatientSet(Set.of()); + VariantMask maskForAllPatients = new VariantMaskBitmaskImpl(patientVariantJoinHandler.createMaskForPatientSet(PATIENT_IDS_INTEGERS)); + VariantMask maskForNoPatients = VariantMask.emptyInstance(); - VariantMasks variantMasks = new VariantMasks(new String[0]); + VariableVariantMasks variantMasks = new VariableVariantMasks(); variantMasks.heterozygousMask = maskForAllPatients; - VariantMasks emptyVariantMasks = new VariantMasks(new String[0]); + VariableVariantMasks emptyVariantMasks = new VariableVariantMasks(); emptyVariantMasks.heterozygousMask = maskForNoPatients; when(variantService.getMasks(eq(VARIANT_INDEX[0]), any())).thenReturn(Optional.of(variantMasks)); when(variantService.getMasks(eq(VARIANT_INDEX[2]), any())).thenReturn(Optional.of(emptyVariantMasks)); when(variantService.getMasks(eq(VARIANT_INDEX[4]), any())).thenReturn(Optional.of(emptyVariantMasks)); - Set patientIdsForIntersectionOfVariantSets = patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters)); + Set patientIdsForIntersectionOfVariantSets = patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters).patientMaskToPatientIdSet(List.of(PATIENT_IDS)); // this should be all patients, as all patients match one of the variants assertEquals(PATIENT_IDS_INTEGERS, patientIdsForIntersectionOfVariantSets); } @@ -61,18 +65,18 @@ public void getPatientIdsForIntersectionOfVariantSets_allPatientsMatchOneVariant when(variantService.getPatientIds()).thenReturn(PATIENT_IDS); when(variantService.emptyBitmask()).thenReturn(emptyBitmask(PATIENT_IDS)); - BigInteger maskForAllPatients = patientVariantJoinHandler.createMaskForPatientSet(PATIENT_IDS_INTEGERS); - BigInteger maskForNoPatients = patientVariantJoinHandler.createMaskForPatientSet(Set.of()); + VariantMask maskForAllPatients = new VariantMaskBitmaskImpl(patientVariantJoinHandler.createMaskForPatientSet(PATIENT_IDS_INTEGERS)); + VariantMask maskForNoPatients = VariantMask.emptyInstance(); - VariantMasks variantMasks = new VariantMasks(new String[0]); + VariableVariantMasks variantMasks = new VariableVariantMasks(); variantMasks.heterozygousMask = maskForAllPatients; - VariantMasks emptyVariantMasks = new VariantMasks(new String[0]); + VariableVariantMasks emptyVariantMasks = new VariableVariantMasks(); emptyVariantMasks.heterozygousMask = maskForNoPatients; when(variantService.getMasks(eq(VARIANT_INDEX[0]), any())).thenReturn(Optional.of(variantMasks)); when(variantService.getMasks(eq(VARIANT_INDEX[2]), any())).thenReturn(Optional.empty()); when(variantService.getMasks(eq(VARIANT_INDEX[4]), any())).thenReturn(Optional.empty()); - Set patientIdsForIntersectionOfVariantSets = patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters)); + Set patientIdsForIntersectionOfVariantSets = patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters).patientMaskToPatientIdSet(List.of(PATIENT_IDS)); // this should be all patients, as all patients match one of the variants assertEquals(PATIENT_IDS_INTEGERS, patientIdsForIntersectionOfVariantSets); } @@ -83,14 +87,14 @@ public void getPatientIdsForIntersectionOfVariantSets_noPatientsMatchVariants() when(variantService.getPatientIds()).thenReturn(PATIENT_IDS); when(variantService.emptyBitmask()).thenReturn(emptyBitmask(PATIENT_IDS)); - BigInteger maskForNoPatients = patientVariantJoinHandler.createMaskForPatientSet(Set.of()); - VariantMasks emptyVariantMasks = new VariantMasks(new String[0]); + VariantMask maskForNoPatients = VariantMask.emptyInstance(); + VariableVariantMasks emptyVariantMasks = new VariableVariantMasks(); emptyVariantMasks.heterozygousMask = maskForNoPatients; when(variantService.getMasks(eq(VARIANT_INDEX[0]), any())).thenReturn(Optional.of(emptyVariantMasks)); when(variantService.getMasks(eq(VARIANT_INDEX[2]), any())).thenReturn(Optional.of(emptyVariantMasks)); when(variantService.getMasks(eq(VARIANT_INDEX[4]), any())).thenReturn(Optional.of(emptyVariantMasks)); - Set patientIdsForIntersectionOfVariantSets = patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(Set.of(), intersectionOfInfoFilters)); + Set patientIdsForIntersectionOfVariantSets = patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(Set.of(), intersectionOfInfoFilters).patientMaskToPatientIdSet(List.of(PATIENT_IDS)); // this should be empty because all variants masks have no matching patients assertEquals(Set.of(), patientIdsForIntersectionOfVariantSets); } @@ -101,17 +105,16 @@ public void getPatientIdsForIntersectionOfVariantSets_somePatientsMatchVariants( when(variantService.getPatientIds()).thenReturn(PATIENT_IDS); when(variantService.emptyBitmask()).thenReturn(emptyBitmask(PATIENT_IDS)); - - BigInteger maskForPatients1 = patientVariantJoinHandler.createMaskForPatientSet(Set.of(101, 103)); - BigInteger maskForPatients2 = patientVariantJoinHandler.createMaskForPatientSet(Set.of(103, 105)); - VariantMasks variantMasks = new VariantMasks(new String[0]); + VariantMask maskForPatients1 = new VariantMaskBitmaskImpl(patientVariantJoinHandler.createMaskForPatientSet(Set.of(101, 103))); + VariantMask maskForPatients2 = new VariantMaskBitmaskImpl(patientVariantJoinHandler.createMaskForPatientSet(Set.of(103, 105))); + VariableVariantMasks variantMasks = new VariableVariantMasks(); variantMasks.heterozygousMask = maskForPatients1; - VariantMasks variantMasks2 = new VariantMasks(new String[0]); + VariableVariantMasks variantMasks2 = new VariableVariantMasks(); variantMasks2.heterozygousMask = maskForPatients2; when(variantService.getMasks(eq(VARIANT_INDEX[0]), any())).thenReturn(Optional.of(variantMasks)); when(variantService.getMasks(eq(VARIANT_INDEX[2]), any())).thenReturn(Optional.of(variantMasks2)); - Set patientIdsForIntersectionOfVariantSets = patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters)); + Set patientIdsForIntersectionOfVariantSets = patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters).patientMaskToPatientIdSet(List.of(PATIENT_IDS)); // this should be all patients who match at least one variant assertEquals(Set.of(101, 103, 105), patientIdsForIntersectionOfVariantSets); } @@ -121,7 +124,7 @@ public void getPatientIdsForIntersectionOfVariantSets_noVariants() { VariantIndex intersectionOfInfoFilters = VariantIndex.empty(); when(variantService.getPatientIds()).thenReturn(PATIENT_IDS); - Set patientIdsForIntersectionOfVariantSets = patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters)); + Set patientIdsForIntersectionOfVariantSets = patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(null, intersectionOfInfoFilters).patientMaskToPatientIdSet(List.of(PATIENT_IDS)); // this should be empty, as there are no variants assertEquals(Set.of(), patientIdsForIntersectionOfVariantSets); } @@ -132,16 +135,16 @@ public void getPatientIdsForIntersectionOfVariantSets_patientSubsetPassed() { when(variantService.getPatientIds()).thenReturn(PATIENT_IDS); when(variantService.emptyBitmask()).thenReturn(emptyBitmask(PATIENT_IDS)); - BigInteger maskForPatients1 = patientVariantJoinHandler.createMaskForPatientSet(Set.of(101, 103, 105)); - BigInteger maskForPatients2 = patientVariantJoinHandler.createMaskForPatientSet(Set.of(103, 105, 107)); - VariantMasks variantMasks = new VariantMasks(new String[0]); + VariantMask maskForPatients1 = new VariantMaskBitmaskImpl(patientVariantJoinHandler.createMaskForPatientSet(Set.of(101, 103, 105))); + VariantMask maskForPatients2 = new VariantMaskBitmaskImpl(patientVariantJoinHandler.createMaskForPatientSet(Set.of(103, 105, 107))); + VariableVariantMasks variantMasks = new VariableVariantMasks(); variantMasks.heterozygousMask = maskForPatients1; - VariantMasks variantMasks2 = new VariantMasks(new String[0]); + VariableVariantMasks variantMasks2 = new VariableVariantMasks(); variantMasks2.heterozygousMask = maskForPatients2; when(variantService.getMasks(eq(VARIANT_INDEX[0]), any())).thenReturn(Optional.of(variantMasks)); when(variantService.getMasks(eq(VARIANT_INDEX[2]), any())).thenReturn(Optional.of(variantMasks2)); - Set patientIdsForIntersectionOfVariantSets = patientMaskToPatientIdSet(patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(Set.of(102, 103, 104, 105, 106), intersectionOfInfoFilters)); + Set patientIdsForIntersectionOfVariantSets = patientVariantJoinHandler.getPatientIdsForIntersectionOfVariantSets(Set.of(102, 103, 104, 105, 106), intersectionOfInfoFilters).patientMaskToPatientIdSet(List.of(PATIENT_IDS)); // this should be the union of patients matching variants (101, 103, 105, 107), intersected with the patient subset parameter (103, 104, 105) which is (103, 105) assertEquals(Set.of(103, 105), patientIdsForIntersectionOfVariantSets); } @@ -153,16 +156,4 @@ public BigInteger emptyBitmask(String[] patientIds) { } return new BigInteger("11" + emptyVariantMask + "11", 2); } - - public Set patientMaskToPatientIdSet(BigInteger patientMask) { - Set ids = new TreeSet(); - String bitmaskString = patientMask.toString(2); - for(int x = 2;x < bitmaskString.length()-2;x++) { - if('1'==bitmaskString.charAt(x)) { - String patientId = variantService.getPatientIds()[x-2].trim(); - ids.add(Integer.parseInt(patientId)); - } - } - return ids; - } } \ No newline at end of file diff --git a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/genomic/GenomicProcessorRestClientTest.java b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/genomic/GenomicProcessorRestClientTest.java index 9381ac76..5ded227e 100644 --- a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/genomic/GenomicProcessorRestClientTest.java +++ b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/genomic/GenomicProcessorRestClientTest.java @@ -1,6 +1,7 @@ package edu.harvard.hms.dbmi.avillach.hpds.processing.genomic; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.InfoColumnMeta; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMask; import edu.harvard.hms.dbmi.avillach.hpds.data.query.Query; import edu.harvard.hms.dbmi.avillach.hpds.processing.DistributableQuery; import org.junit.jupiter.api.Disabled; @@ -36,7 +37,7 @@ public void simpleTest() { Set patientIds = IntStream.range(53000, 53635).boxed().collect(Collectors.toSet()); distributableQuery.setPatientIds(patientIds); - BigInteger patientMaskForVariantInfoFilters = genomicProcessorRestClient.getPatientMask(distributableQuery).block(); + VariantMask patientMaskForVariantInfoFilters = genomicProcessorRestClient.getPatientMask(distributableQuery).block(); System.out.println(patientMaskForVariantInfoFilters); } diff --git a/service/src/main/resources/application-development.properties b/service/src/main/resources/application-development.properties index 860f5d64..16c335d2 100644 --- a/service/src/main/resources/application-development.properties +++ b/service/src/main/resources/application-development.properties @@ -2,5 +2,5 @@ SMALL_JOB_LIMIT = 100 SMALL_TASK_THREADS = 1 LARGE_TASK_THREADS = 1 -hpds.genomicProcessor.impl=localPatientDistributed +hpds.genomicProcessor.impl=localDistributed HPDS_GENOMIC_DATA_DIRECTORY=/opt/local/hpds/all/ \ No newline at end of file