ALS-6330: Fix variant explorer functionality (#115)
* ALS-4643: Add integration tests using 1k genome dataset
* ALS-6564: Allow searching by old variant coords
ramari16 authored Oct 10, 2024
1 parent 3f604b4 commit 89357dc
Showing 65 changed files with 122,982 additions and 3,163 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/github-actions-test.yml
@@ -16,4 +16,4 @@ jobs:
- name: Test with Maven
run: mvn --update-snapshots test
env:
-        GITHUB_TOKEN: ${{ github.token }}
+        GITHUB_TOKEN: ${{ github.token }}
2 changes: 0 additions & 2 deletions client-api/pom.xml
@@ -17,8 +17,6 @@

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-    <maven.compiler.source>1.7</maven.compiler.source>
-    <maven.compiler.target>1.7</maven.compiler.target>
</properties>

<dependencies>
2 changes: 2 additions & 0 deletions data/pom.xml
@@ -11,6 +11,8 @@
<artifactId>data</artifactId>

<name>data</name>
+  <properties>
+  </properties>

<dependencies>
<dependency>
BucketIndexBySample.java
@@ -97,8 +97,6 @@ public BucketIndexBySample(VariantStore variantStore, String storageDir) throws
});

        // For each patient set the patientBucketCharMask entry to 0 or 1 if they have a variant in the bucket.
-
-        // todo: implement for variant explorer
int indexOfBucket = Collections.binarySearch(bucketList, bucketKey) + 2; //patientBucketCharMasks has bookend bits
for(int x = 0; x < patientIds.size(); x++) {
if(patientMaskForBucket[0].testBit(x)) {
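The + 2 in the binarySearch line accounts for the bookend bits mentioned in the comment: bucket i in the sorted bucketList corresponds to character position i + 2 of the patient mask. A tiny illustration (the bucket-key format here is hypothetical, not taken from this diff):

    import java.util.Collections;
    import java.util.List;

    public class BucketOffsetDemo {
        public static void main(String[] args) {
            // sorted bucket keys; "contig_offset" format is an assumption for this sketch
            List<String> bucketList = List.of("1_0", "1_1000", "2_0");
            // bucket "1_1000" sits at list index 1, so its mask position is 1 + 2
            int indexOfBucket = Collections.binarySearch(bucketList, "1_1000") + 2;
            System.out.println(indexOfBucket); // 3: two bookend positions precede the buckets
        }
    }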
InfoColumnMeta.java
@@ -1,16 +1,5 @@
package edu.harvard.hms.dbmi.avillach.hpds.data.genotype;

-import lombok.Builder;
-import lombok.Value;
-import lombok.extern.jackson.Jacksonized;
-
-@Jacksonized
-@Value
-@Builder
-public class InfoColumnMeta {
-
-    String key, description;
-    boolean continuous;
-    Float min, max;
+public record InfoColumnMeta(String key, String description, boolean continuous, Float min, Float max) {

}
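The Lombok builder class collapses into a Java record with the same field order. jackson-databind 2.12 or newer binds JSON properties to a record's canonical constructor, so the @Jacksonized/@Value/@Builder behavior should carry over. A minimal round-trip sketch (example values assumed, not from the repository):

    import com.fasterxml.jackson.databind.ObjectMapper;

    public class InfoColumnMetaRoundTrip {
        public record InfoColumnMeta(String key, String description, boolean continuous, Float min, Float max) {}

        public static void main(String[] args) throws Exception {
            ObjectMapper mapper = new ObjectMapper();
            InfoColumnMeta meta = new InfoColumnMeta("AF", "allele frequency", true, 0.0f, 1.0f);
            // serializes to roughly {"key":"AF","description":"allele frequency","continuous":true,"min":0.0,"max":1.0}
            String json = mapper.writeValueAsString(meta);
            // deserialization goes through the canonical constructor; no builder needed
            InfoColumnMeta back = mapper.readValue(json, InfoColumnMeta.class);
            System.out.println(meta.equals(back)); // true: records compare by component values
        }
    }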
VariantMask.java
@@ -33,4 +33,6 @@ static VariantMask emptyInstance() {
}

    Set<Integer> patientMaskToPatientIdSet(List<String> patientIds);
+
+    boolean isEmpty();
}
VariantMaskBitmaskImpl.java
@@ -1,6 +1,7 @@
package edu.harvard.hms.dbmi.avillach.hpds.data.genotype;

import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
import com.fasterxml.jackson.databind.ser.std.ToStringSerializer;
@@ -75,6 +76,13 @@ public Set<Integer> patientMaskToPatientIdSet(List<String> patientIds) {
return ids;
}

+    @Override
+    @JsonIgnore
+    public boolean isEmpty() {
+        // because the bitmasks are padded with 11 on each end
+        return bitmask.bitCount() <= 4;
+    }
+
private VariantMask union(VariantMaskBitmaskImpl variantMaskBitmask) {
return new VariantMaskBitmaskImpl(variantMaskBitmask.bitmask.or(this.bitmask));
}
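The bitCount() <= 4 test works because HPDS patient bitmasks are framed by binary 11 at each end, so a mask with no patients set still contains exactly four one-bits. A small sketch of that convention (the exact patient-bit offset is an assumption, not taken from this diff):

    import java.math.BigInteger;

    public class BookendMaskDemo {
        public static void main(String[] args) {
            int patients = 8;
            // "11" + patients zeros + "11": the empty mask still carries 4 set bits
            BigInteger empty = new BigInteger("11" + "0".repeat(patients) + "11", 2);
            System.out.println(empty.bitCount() <= 4); // true -> isEmpty()

            // setting any patient bit between the bookends pushes bitCount past 4
            BigInteger oneMatch = empty.setBit(2 + 3); // assumed: patient 3, offset past the low bookend
            System.out.println(oneMatch.bitCount() <= 4); // false -> not empty
        }
    }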
VariantMaskSparseImpl.java
@@ -1,5 +1,6 @@
package edu.harvard.hms.dbmi.avillach.hpds.data.genotype;

+import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;

import java.math.BigInteger;
@@ -59,6 +60,12 @@ public Set<Integer> patientMaskToPatientIdSet(List<String> patientIds) {
.collect(Collectors.toSet());
}

+    @Override
+    @JsonIgnore
+    public boolean isEmpty() {
+        return this.patientIndexes.isEmpty();
+    }
+
private VariantMask union(VariantMaskSparseImpl variantMask) {
HashSet<Integer> union = new HashSet<>(variantMask.patientIndexes);
union.addAll(this.patientIndexes);
VariantMetadataIndex.java
@@ -1,11 +1,14 @@
package edu.harvard.hms.dbmi.avillach.hpds.data.genotype;

import java.io.*;
+import java.nio.file.Files;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;

+import com.google.common.base.Joiner;
import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJavaIndexedStorage;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -21,23 +24,23 @@
* a fast, disk-based backing store.
*/
public class VariantMetadataIndex implements Serializable {
-    // todo: make this variable
-    public static String VARIANT_METADATA_BIN_FILE = "/opt/local/hpds/all/VariantMetadata.javabin";
+    public static final String VARIANT_METADATA_FILENAME = "VariantMetadata.javabin";
+    public static String VARIANT_METADATA_BIN_FILE = "/opt/local/hpds/all/" + VARIANT_METADATA_FILENAME;

private static final long serialVersionUID = 5917054606643971537L;
private static Logger log = LoggerFactory.getLogger(VariantMetadataIndex.class);

// (String) contig --> (Integer) Bucket --> (String) variant spec --> INFO column data[].
-    private Map<String, FileBackedByteIndexedStorage<Integer, ConcurrentHashMap<String, String[]>>> indexMap = new HashMap<String, FileBackedByteIndexedStorage<Integer, ConcurrentHashMap<String, String[]>>>();
+    private final Map<String, FileBackedByteIndexedStorage<Integer, ConcurrentHashMap<String, String[]>>> indexMap = new HashMap<>();

-    // todo: make this variable
-    private static String fileStoragePrefix = "/opt/local/hpds/all/VariantMetadataStorage";
+    public static final String VARIANT_METADATA_STORAGE_FILE_PREFIX = "VariantMetadataStorage";
+    private static String fileStoragePrefix = "/opt/local/hpds/all/" + VARIANT_METADATA_STORAGE_FILE_PREFIX;

/**
* This map allows us to load millions of variants without re-writing the fbbis each time (which would blow up the disk space).
* We need to remember to flush() between each contig so this gets saved to the fbbis array.
*/
-    private transient Map<String, ConcurrentHashMap<Integer, ConcurrentHashMap<String, String[]>>> loadingMap = new HashMap<String, ConcurrentHashMap<Integer, ConcurrentHashMap<String, String[]>>>();
+    private transient Map<String, ConcurrentHashMap<Integer, ConcurrentHashMap<String, String[]>>> loadingMap = new HashMap<>();

/**
* This constructor should only be used for testing; we expect the files to be in the default locations in production
@@ -60,7 +63,7 @@ public VariantMetadataIndex() throws IOException {
* @param variantSpec
* @return
*/
-    public String[] findBySingleVariantSpec(String variantSpec, VariantBucketHolder<String[]> bucketCache) {
+    public Set<String> findBySingleVariantSpec(String variantSpec, VariantBucketHolder<String[]> bucketCache) {
try {
String[] segments = variantSpec.split(",");
if (segments.length < 2) {
@@ -75,7 +78,7 @@ public String[] findBySingleVariantSpec(String variantSpec, VariantBucketHolder<
|| chrOffset != bucketCache.lastChunkOffset) {
FileBackedByteIndexedStorage<Integer, ConcurrentHashMap<String, String[]>> ContigFbbis = indexMap.get(contig);
if(ContigFbbis == null) {
-                return new String[0];
+                return Set.of();
}
bucketCache.lastValue = ContigFbbis.get(chrOffset);
bucketCache.lastContig = contig;
@@ -85,20 +88,20 @@ public String[] findBySingleVariantSpec(String variantSpec, VariantBucketHolder<
if( bucketCache.lastValue != null) {
if(bucketCache.lastValue.get(variantSpec) == null) {
log.warn("No variant data found for spec " + variantSpec);
-                return new String[0];
+                return Set.of();
}
-            return bucketCache.lastValue.get(variantSpec);
+            return Set.of(bucketCache.lastValue.get(variantSpec));
}
log.warn("No bucket found for spec " + variantSpec + " in bucket " + chrOffset);
-        return new String[0];
+        return Set.of();

} catch (UncheckedIOException e) {
log.warn("IOException caught looking up variantSpec : " + variantSpec, e);
-            return new String[0];
+            return Set.of();
}
}

-    public Map<String, String[]> findByMultipleVariantSpec(Collection<String> varientSpecList) {
+    public Map<String, Set<String>> findByMultipleVariantSpec(Collection<String> varientSpecList) {
// log.debug("SPEC list " + varientSpecList.size() + " :: " + Arrays.deepToString(varientSpecList.toArray()));

VariantBucketHolder<String[]> bucketCache = new VariantBucketHolder<String[]>();
@@ -161,7 +164,7 @@ public synchronized void flush() throws IOException {
if(contigFbbis == null) {
log.info("creating new file for " + contig);
String filePath = fileStoragePrefix + "_" + contig + ".bin";
-            contigFbbis = new FileBackedJavaIndexedStorage(Integer.class, (Class<ConcurrentHashMap<String, String[]>>)(Class<?>) ConcurrentHashMap.class, new File(filePath));
+            contigFbbis = new FileBackedJavaIndexedStorage(Integer.class, ConcurrentHashMap.class, new File(filePath));
indexMap.put(contig, contigFbbis);
}

@@ -196,13 +199,57 @@ public void complete() throws IOException {

public static VariantMetadataIndex createInstance(String metadataIndexPath) {
try(ObjectInputStream in = new ObjectInputStream(new GZIPInputStream(
-                new FileInputStream(metadataIndexPath)))){
-            return (VariantMetadataIndex) in.readObject();
+                new FileInputStream(metadataIndexPath + VARIANT_METADATA_FILENAME)))){
+            VariantMetadataIndex variantMetadataIndex = (VariantMetadataIndex) in.readObject();
+            variantMetadataIndex.updateStorageDirectory(new File(metadataIndexPath));
+            return variantMetadataIndex;
} catch(Exception e) {
-            // todo: handle exceptions better
            log.info("No Metadata Index found at " + metadataIndexPath);
+            log.debug("Error loading metadata index:", e);
return null;
}
}

+    public static void merge(VariantMetadataIndex variantMetadataIndex1, VariantMetadataIndex variantMetadataIndex2, String outputDirectory) throws IOException {
+        VariantMetadataIndex merged = new VariantMetadataIndex(outputDirectory + VARIANT_METADATA_STORAGE_FILE_PREFIX);
+        if (!variantMetadataIndex1.indexMap.keySet().equals(variantMetadataIndex2.indexMap.keySet())) {
+            log.warn("Merging incompatible variant indexes. Index1 keys: " + Joiner.on(",").join(variantMetadataIndex1.indexMap.keySet()) + ". Index 2 keys: " + Joiner.on(",").join(variantMetadataIndex2.indexMap.keySet()));
+            throw new IllegalStateException("Cannot merge variant metadata index with different contig keys");
+        }
+        for (String contig : variantMetadataIndex1.indexMap.keySet()) {
+            String filePath = outputDirectory + VARIANT_METADATA_STORAGE_FILE_PREFIX + "_" + contig + ".bin";
+            FileBackedByteIndexedStorage<Integer, ConcurrentHashMap<String, String[]>> mergedFbbis = new FileBackedJavaIndexedStorage(Integer.class, ConcurrentHashMap.class, new File(filePath));
+
+            // Store the merged result here because FileBackedByteIndexedStorage must be written all at once
+            Map<Integer, ConcurrentHashMap<String, String[]>> mergedStagedFbbis = new HashMap<>();
+
+            FileBackedByteIndexedStorage<Integer, ConcurrentHashMap<String, String[]>> fbbis1 = variantMetadataIndex1.indexMap.get(contig);
+            FileBackedByteIndexedStorage<Integer, ConcurrentHashMap<String, String[]>> fbbis2 = variantMetadataIndex2.indexMap.get(contig);
+
+            fbbis1.keys().forEach(key -> {
+                mergedStagedFbbis.put(key, fbbis1.get(key));
+            });
+            fbbis2.keys().forEach(key -> {
+                ConcurrentHashMap<String, String[]> metadataMap = mergedStagedFbbis.get(key);
+                if (metadataMap == null) {
+                    mergedStagedFbbis.put(key, fbbis2.get(key));
+                } else {
+                    metadataMap.putAll(fbbis2.get(key));
+                }
+            });
+
+            mergedStagedFbbis.forEach(mergedFbbis::put);
+            mergedFbbis.complete();
+            merged.indexMap.put(contig, mergedFbbis);
+        }
+
+        try(ObjectOutputStream out = new ObjectOutputStream(new GZIPOutputStream(Files.newOutputStream(new File(outputDirectory + VARIANT_METADATA_FILENAME).toPath())))){
+            out.writeObject(merged);
+            out.flush();
+        }
+    }
+
+    public void updateStorageDirectory(File genomicDataDirectory) {
+        indexMap.values().forEach(value -> value.updateStorageDirectory(genomicDataDirectory));
+    }
}
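Taken together, createInstance(), merge(), and updateStorageDirectory() let two per-shard indexes be combined into one directory: merge() requires identical contig sets and stages each contig fully in memory because a FileBackedByteIndexedStorage must be written in one pass. A hedged usage sketch (directory paths assumed; createInstance() appends VariantMetadata.javabin to the path it receives, so the paths end with a slash):

    import java.io.IOException;

    public class MergeIndexes {
        public static void main(String[] args) throws IOException {
            // load each shard's javabin and repoint its per-contig storage files
            VariantMetadataIndex shard1 = VariantMetadataIndex.createInstance("/opt/local/hpds/shard1/");
            VariantMetadataIndex shard2 = VariantMetadataIndex.createInstance("/opt/local/hpds/shard2/");
            // writes VariantMetadata.javabin plus VariantMetadataStorage_<contig>.bin
            // files into the output directory
            VariantMetadataIndex.merge(shard1, shard2, "/opt/local/hpds/merged/");
        }
    }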