From 6287251e3f530c380fab04659137274379f436f6 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Thu, 25 May 2023 11:35:44 -0400 Subject: [PATCH 01/39] ALS-4461: Deserialize variant index from disk --- .../hpds/processing/VariantIndexCache.java | 12 ++-- .../hpds/processing/VariantService.java | 59 ++++--------------- 2 files changed, 18 insertions(+), 53 deletions(-) diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantIndexCache.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantIndexCache.java index 2ca6cdfe..09b9dce5 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantIndexCache.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantIndexCache.java @@ -67,20 +67,20 @@ public VariantIndex load(String infoColumn_valueKey) throws IOException { log.debug("Calculating value for cache for key " + infoColumn_valueKey); long time = System.currentTimeMillis(); String[] column_and_value = infoColumn_valueKey.split(COLUMN_AND_KEY_DELIMITER); - String[] variantArray = infoStores.get(column_and_value[0]).getAllValues().get(column_and_value[1]); + String[] variantIndexStringArray = infoStores.get(column_and_value[0]).getAllValues().get(column_and_value[1]); - if ((double)variantArray.length / (double)variantIndex.length < MAX_SPARSE_INDEX_RATIO ) { + if ((double)variantIndexStringArray.length / (double)variantIndex.length < MAX_SPARSE_INDEX_RATIO ) { Set variantIds = new HashSet<>(); - for(String variantSpec : variantArray) { - int variantIndexArrayIndex = Arrays.binarySearch(variantIndex, variantSpec); + for(String variantIndex : variantIndexStringArray) { + int variantIndexArrayIndex = Integer.parseInt(variantIndex); variantIds.add(variantIndexArrayIndex); } return new SparseVariantIndex(variantIds); } else { boolean[] variantIndexArray = new boolean[variantIndex.length]; int x = 0; - for(String variantSpec : variantArray) { - int variantIndexArrayIndex = Arrays.binarySearch(variantIndex, variantSpec); + for(String variantIndex : variantIndexStringArray) { + int variantIndexArrayIndex = Integer.parseInt(variantIndex); // todo: shouldn't this be greater than or equal to 0? 0 is a valid index if (variantIndexArrayIndex > 0) { variantIndexArray[variantIndexArrayIndex] = true; diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java index f8d01fdc..67171e4d 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java @@ -31,6 +31,7 @@ public class VariantService { private final String VARIANT_INDEX_FBBIS_STORAGE_FILE; private final String VARIANT_INDEX_FBBIS_FILE; private final String BUCKET_INDEX_BY_SAMPLE_FILE; + private final String VARIANT_SPEC_INDEX_FILE; private final VariantStore variantStore; @@ -59,6 +60,7 @@ public VariantService() throws IOException, ClassNotFoundException, InterruptedE VARIANT_INDEX_FBBIS_STORAGE_FILE = genomicDataDirectory + "variantIndex_fbbis_storage.javabin"; VARIANT_INDEX_FBBIS_FILE = genomicDataDirectory + "variantIndex_fbbis.javabin"; BUCKET_INDEX_BY_SAMPLE_FILE = genomicDataDirectory + "BucketIndexBySample.javabin"; + VARIANT_SPEC_INDEX_FILE = genomicDataDirectory + "variantSpecIndex.javabin"; variantStore = VariantStore.deserializeInstance(genomicDataDirectory); try { @@ -75,58 +77,21 @@ public void populateVariantIndex() throws InterruptedException { log.warn("No Genomic Data found. Skipping variant Indexing"); return; } - int[] numVariants = {0}; - HashMap contigMap = new HashMap<>(); - - ExecutorService ex = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors()); - variantStore.getVariantMaskStorage().entrySet().forEach(entry->{ - ex.submit(()->{ - int numVariantsInContig = 0; - FileBackedByteIndexedStorage> storage = entry.getValue(); - HashMap bucketMap = new HashMap<>(); - log.info("Creating bucketMap for contig " + entry.getKey()); - for(Integer bucket: storage.keys()){ - try { - ConcurrentHashMap bucketStorage = storage.get(bucket); - numVariantsInContig += bucketStorage.size(); - bucketMap.put(bucket, bucketStorage.keySet().toArray(new String[0])); - } catch (IOException e) { - log.error("an error occurred", e); - } - }; - log.info("Completed bucketMap for contig " + entry.getKey()); - String[] variantsInContig = new String[numVariantsInContig]; - int current = 0; - for(String[] bucketList : bucketMap.values()) { - System.arraycopy(bucketList, 0, variantsInContig, current, bucketList.length); - current = current + bucketList.length; - } - bucketMap.clear(); - synchronized(numVariants) { - log.info("Found " + variantsInContig.length + " variants in contig " + entry.getKey() + "."); - contigMap.put(entry.getKey(), variantsInContig); - numVariants[0] += numVariantsInContig; - } - }); - }); - ex.shutdown(); - while(!ex.awaitTermination(10, TimeUnit.SECONDS)) { - Thread.sleep(20000); - log.info("Awaiting completion of variant index"); - } - log.info("Found " + numVariants[0] + " total variants."); - variantIndex = new String[numVariants[0]]; + try (ObjectInputStream objectInputStream = new ObjectInputStream(new GZIPInputStream(new FileInputStream(VARIANT_SPEC_INDEX_FILE)));){ + + List variants = (List) objectInputStream.readObject(); + variantIndex = variants.toArray(new String[0]); - int current = 0; - for(String[] contigList : contigMap.values()) { - System.arraycopy(contigList, 0, variantIndex, current, contigList.length); - current = current + contigList.length; + } catch (FileNotFoundException e) { + throw new RuntimeException(e); + } catch (IOException e) { + throw new RuntimeException(e); + } catch (ClassNotFoundException e) { + throw new RuntimeException(e); } - contigMap.clear(); - Arrays.sort(variantIndex); log.info("Index created with " + variantIndex.length + " total variants."); } From 3537737a41db497d9002aa4f7b99b56d295fdd3f Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Thu, 25 May 2023 15:21:57 -0400 Subject: [PATCH 02/39] ALS-4461: Add variant index builder for VCF loading --- .../hpds/etl/genotype/NewVCFLoader.java | 15 +++++++++- .../etl/genotype/VariantIndexBuilder.java | 30 +++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VariantIndexBuilder.java diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java index 3e5d4ec7..f8b9aac0 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java @@ -26,6 +26,8 @@ public class NewVCFLoader { private static File storageDir = null; private static String storageDirStr = "/opt/local/hpds/all"; private static String mergedDirStr = "/opt/local/hpds/merged"; + + private static VariantIndexBuilder variantIndexBuilder = new VariantIndexBuilder(); // DO NOT CHANGE THIS unless you want to reload all the data everywhere. private static int CHUNK_SIZE = 1000; @@ -235,6 +237,8 @@ private static void loadVCFs(File indexFile) throws IOException { } } } + + saveVariantIndex(); } private static String sampleIdsForMask(String[] sampleIds, BigInteger heterozygousMask) { @@ -368,6 +372,14 @@ private static void shutdownChunkWriteExecutor() { } } + private static void saveVariantIndex() throws IOException { + try (FileOutputStream fos = new FileOutputStream(new File(storageDir, "variantSpecIndex.javabin")); + GZIPOutputStream gzos = new GZIPOutputStream(fos); + ObjectOutputStream oos = new ObjectOutputStream(gzos);) { + oos.writeObject(variantIndexBuilder.getVariantSpecIndex()); + } + } + private static ConcurrentHashMap convertLoadingMapToMaskMap( HashMap zygosityMaskStrings_f) { ConcurrentHashMap maskMap = new ConcurrentHashMap<>(); @@ -459,8 +471,9 @@ public void updateRecords(char[][] zygosityMaskStrings, ConcurrentHashMap { - infoStore.processRecord(currentSpecNotation, infoColumns); + infoStore.processRecord(Integer.toString(variantIndex), infoColumns); }); } diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VariantIndexBuilder.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VariantIndexBuilder.java new file mode 100644 index 00000000..71fec959 --- /dev/null +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VariantIndexBuilder.java @@ -0,0 +1,30 @@ +package edu.harvard.hms.dbmi.avillach.hpds.etl.genotype; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.LinkedList; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +public class VariantIndexBuilder { + + private static Logger logger = LoggerFactory.getLogger(VariantIndexBuilder.class); + + private final LinkedList variantSpecIndex = new LinkedList<>(); + private final Map variantSpecToIndexMap = new ConcurrentHashMap<>(); + + public synchronized Integer getIndex(String variantSpec) { + Integer variantIndex = variantSpecToIndexMap.get(variantSpec); + if (variantIndex == null) { + variantIndex = variantSpecIndex.size(); + variantSpecIndex.add(variantSpec); + variantSpecToIndexMap.put(variantSpec, variantIndex); + } + return variantIndex; + } + + public LinkedList getVariantSpecIndex() { + return variantSpecIndex; + } +} From 620a5af556c3b099ffab5bffdae70f2fb490e20e Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 31 May 2023 13:57:34 -0400 Subject: [PATCH 03/39] ALS-4461: Upgrade major version --- client-api/pom.xml | 4 ++-- common/pom.xml | 2 +- data/pom.xml | 2 +- docker/pom.xml | 2 +- etl/pom.xml | 2 +- pom.xml | 2 +- processing/pom.xml | 2 +- service/pom.xml | 2 +- war/pom.xml | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) diff --git a/client-api/pom.xml b/client-api/pom.xml index ef02dcbb..09f5c552 100644 --- a/client-api/pom.xml +++ b/client-api/pom.xml @@ -4,12 +4,12 @@ pic-sure-hpds edu.harvard.hms.dbmi.avillach.hpds - 1.0-SNAPSHOT + 3.0.0-SNAPSHOT edu.harvard.hms.dbmi.avillach.hpds client-api - 1.0-SNAPSHOT + 3.0.0-SNAPSHOT client-api diff --git a/common/pom.xml b/common/pom.xml index 206c8a84..d39fcb4d 100644 --- a/common/pom.xml +++ b/common/pom.xml @@ -6,7 +6,7 @@ pic-sure-hpds edu.harvard.hms.dbmi.avillach.hpds - 1.0-SNAPSHOT + 3.0.0-SNAPSHOT common diff --git a/data/pom.xml b/data/pom.xml index 298160fc..81366252 100644 --- a/data/pom.xml +++ b/data/pom.xml @@ -5,7 +5,7 @@ pic-sure-hpds edu.harvard.hms.dbmi.avillach.hpds - 1.0-SNAPSHOT + 3.0.0-SNAPSHOT data diff --git a/docker/pom.xml b/docker/pom.xml index d19a2729..7f27d595 100644 --- a/docker/pom.xml +++ b/docker/pom.xml @@ -5,7 +5,7 @@ pic-sure-hpds edu.harvard.hms.dbmi.avillach.hpds - 1.0-SNAPSHOT + 3.0.0-SNAPSHOT docker diff --git a/etl/pom.xml b/etl/pom.xml index ded9fdd5..77ccc3a7 100644 --- a/etl/pom.xml +++ b/etl/pom.xml @@ -6,7 +6,7 @@ pic-sure-hpds edu.harvard.hms.dbmi.avillach.hpds - 1.0-SNAPSHOT + 3.0.0-SNAPSHOT etl diff --git a/pom.xml b/pom.xml index ab958be4..8073b26d 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ 4.0.0 edu.harvard.hms.dbmi.avillach.hpds pic-sure-hpds - 1.0-SNAPSHOT + 3.0.0-SNAPSHOT pom pic-sure-hpds diff --git a/processing/pom.xml b/processing/pom.xml index e1da83c2..138633b2 100644 --- a/processing/pom.xml +++ b/processing/pom.xml @@ -5,7 +5,7 @@ pic-sure-hpds edu.harvard.hms.dbmi.avillach.hpds - 1.0-SNAPSHOT + 3.0.0-SNAPSHOT processing diff --git a/service/pom.xml b/service/pom.xml index 734e3a94..234c131d 100644 --- a/service/pom.xml +++ b/service/pom.xml @@ -5,7 +5,7 @@ pic-sure-hpds edu.harvard.hms.dbmi.avillach.hpds - 1.0-SNAPSHOT + 3.0.0-SNAPSHOT service diff --git a/war/pom.xml b/war/pom.xml index e71e6efa..cf244453 100644 --- a/war/pom.xml +++ b/war/pom.xml @@ -6,7 +6,7 @@ pic-sure-hpds edu.harvard.hms.dbmi.avillach.hpds - 1.0-SNAPSHOT + 3.0.0-SNAPSHOT hpds-war war From c0ad4a4d74dea6eeb3b93922d6550b904b4dbc94 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 31 May 2023 14:00:30 -0400 Subject: [PATCH 04/39] ALS-4461: Upgrade major version --- client-api/pom.xml | 4 ++-- common/pom.xml | 2 +- data/pom.xml | 2 +- docker/pom.xml | 2 +- etl/pom.xml | 2 +- pom.xml | 2 +- processing/pom.xml | 2 +- service/pom.xml | 2 +- war/pom.xml | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) diff --git a/client-api/pom.xml b/client-api/pom.xml index 09f5c552..3746ec8e 100644 --- a/client-api/pom.xml +++ b/client-api/pom.xml @@ -4,12 +4,12 @@ pic-sure-hpds edu.harvard.hms.dbmi.avillach.hpds - 3.0.0-SNAPSHOT + 2.0.0-SNAPSHOT edu.harvard.hms.dbmi.avillach.hpds client-api - 3.0.0-SNAPSHOT + 2.0.0-SNAPSHOT client-api diff --git a/common/pom.xml b/common/pom.xml index d39fcb4d..9304230e 100644 --- a/common/pom.xml +++ b/common/pom.xml @@ -6,7 +6,7 @@ pic-sure-hpds edu.harvard.hms.dbmi.avillach.hpds - 3.0.0-SNAPSHOT + 2.0.0-SNAPSHOT common diff --git a/data/pom.xml b/data/pom.xml index 81366252..c2b27598 100644 --- a/data/pom.xml +++ b/data/pom.xml @@ -5,7 +5,7 @@ pic-sure-hpds edu.harvard.hms.dbmi.avillach.hpds - 3.0.0-SNAPSHOT + 2.0.0-SNAPSHOT data diff --git a/docker/pom.xml b/docker/pom.xml index 7f27d595..8322b603 100644 --- a/docker/pom.xml +++ b/docker/pom.xml @@ -5,7 +5,7 @@ pic-sure-hpds edu.harvard.hms.dbmi.avillach.hpds - 3.0.0-SNAPSHOT + 2.0.0-SNAPSHOT docker diff --git a/etl/pom.xml b/etl/pom.xml index 77ccc3a7..0b69e8fa 100644 --- a/etl/pom.xml +++ b/etl/pom.xml @@ -6,7 +6,7 @@ pic-sure-hpds edu.harvard.hms.dbmi.avillach.hpds - 3.0.0-SNAPSHOT + 2.0.0-SNAPSHOT etl diff --git a/pom.xml b/pom.xml index 8073b26d..df3ca3fb 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ 4.0.0 edu.harvard.hms.dbmi.avillach.hpds pic-sure-hpds - 3.0.0-SNAPSHOT + 2.0.0-SNAPSHOT pom pic-sure-hpds diff --git a/processing/pom.xml b/processing/pom.xml index 138633b2..f989a9d9 100644 --- a/processing/pom.xml +++ b/processing/pom.xml @@ -5,7 +5,7 @@ pic-sure-hpds edu.harvard.hms.dbmi.avillach.hpds - 3.0.0-SNAPSHOT + 2.0.0-SNAPSHOT processing diff --git a/service/pom.xml b/service/pom.xml index 234c131d..728b3af3 100644 --- a/service/pom.xml +++ b/service/pom.xml @@ -5,7 +5,7 @@ pic-sure-hpds edu.harvard.hms.dbmi.avillach.hpds - 3.0.0-SNAPSHOT + 2.0.0-SNAPSHOT service diff --git a/war/pom.xml b/war/pom.xml index cf244453..8decd8b5 100644 --- a/war/pom.xml +++ b/war/pom.xml @@ -6,7 +6,7 @@ pic-sure-hpds edu.harvard.hms.dbmi.avillach.hpds - 3.0.0-SNAPSHOT + 2.0.0-SNAPSHOT hpds-war war From 5afa1feeda968acabb7d965a46756aef65fe62b8 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Thu, 15 Jun 2023 13:30:47 -0400 Subject: [PATCH 05/39] ALS-4461: Store variants by index instead of full variant spec. Refactoring to support incremental vcf loading --- common/pom.xml | 8 ++ .../storage/FileBackedByteIndexedStorage.java | 80 +++-------------- .../storage/FileBackedJavaIndexedStorage.java | 74 +++++++++++++++ .../storage/FileBackedJsonIndexStorage.java | 89 +++++++++++++++++++ .../data/genotype/BucketIndexBySample.java | 3 +- .../FileBackedByteIndexedInfoStore.java | 3 +- .../hpds/data/genotype/VariantMasks.java | 15 ++++ .../data/genotype/VariantMetadataIndex.java | 3 +- .../hpds/data/genotype/VariantStore.java | 32 +++---- .../FileBackedStorageVariantMasksImpl.java | 25 ++++++ .../hpds/etl/genotype/NewVCFLoader.java | 14 +-- .../hpds/processing/VariantService.java | 23 ++--- 12 files changed, 265 insertions(+), 104 deletions(-) create mode 100644 common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJavaIndexedStorage.java create mode 100644 common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJsonIndexStorage.java create mode 100644 data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/storage/FileBackedStorageVariantMasksImpl.java diff --git a/common/pom.xml b/common/pom.xml index 9304230e..a92292d4 100644 --- a/common/pom.xml +++ b/common/pom.xml @@ -21,5 +21,13 @@ com.google.guava guava + + org.codehaus.jackson + jackson-core-asl + + + org.codehaus.jackson + jackson-mapper-asl + diff --git a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java index 80db84d8..0de983d4 100644 --- a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java +++ b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java @@ -5,24 +5,24 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; import java.io.RandomAccessFile; import java.io.Serializable; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.function.Function; import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; import org.apache.commons.io.output.ByteArrayOutputStream; +import org.codehaus.jackson.map.ObjectMapper; -public class FileBackedByteIndexedStorage implements Serializable { +public abstract class FileBackedByteIndexedStorage implements Serializable { private static final long serialVersionUID = -7297090745384302635L; - private transient RandomAccessFile storage; - private ConcurrentHashMap index; - private File storageFile; - private boolean completed = false; - private Long maxStorageSize; //leave this in to not break serialization + protected transient RandomAccessFile storage; + protected ConcurrentHashMap index; + protected File storageFile; + protected boolean completed = false; + protected Long maxStorageSize; //leave this in to not break serialization + public FileBackedByteIndexedStorage(Class keyClass, Class valueClass, File storageFile) throws FileNotFoundException { this.index = new ConcurrentHashMap(); @@ -34,13 +34,7 @@ public Set keys(){ return index.keySet(); } - public void put(K key, V value) throws IOException { - if(completed) { - throw new RuntimeException("A completed FileBackedByteIndexedStorage cannot be modified."); - } - Long[] recordIndex = store(value); - index.put(key, recordIndex); - } + public abstract void put(K key, V value) throws IOException; public void load(Iterable values, Function mapper) throws IOException { //make sure we start fresh @@ -66,57 +60,11 @@ public void complete() { public boolean isComplete() { return this.completed; } - - private Long[] store(V value) throws IOException { - - ByteArrayOutputStream out = new ByteArrayOutputStream(); - ObjectOutputStream oos = new ObjectOutputStream(new GZIPOutputStream(out)); - oos.writeObject(value); - oos.flush(); - oos.close(); - - Long[] recordIndex = new Long[2]; - synchronized(storage) { - storage.seek(storage.length()); - recordIndex[0] = storage.getFilePointer(); - storage.write(out.toByteArray()); - recordIndex[1] = storage.getFilePointer() - recordIndex[0]; -// maxStorageSize = storage.getFilePointer(); - } - return recordIndex; - } + public abstract V get(K key) throws IOException; - public V get(K key) throws IOException { - if(this.storage==null) { - synchronized(this) { - this.open(); - } - } - Long[] offsetsInStorage = index.get(key); - if(offsetsInStorage != null) { - Long offsetInStorage = index.get(key)[0]; - int offsetLength = index.get(key)[1].intValue(); - if(offsetInStorage != null && offsetLength>0) { - byte[] buffer = new byte[offsetLength]; - synchronized(storage) { - storage.seek(offsetInStorage); - storage.readFully(buffer); - } - ObjectInputStream in = new ObjectInputStream(new GZIPInputStream(new ByteArrayInputStream(buffer))); - - try { - V readObject = (V) in.readObject(); - return readObject; - } catch (ClassNotFoundException e) { - throw new RuntimeException("This should never happen."); - } finally { - in.close(); - } - }else { - return null; - } - } else { - return null; - } + public V getOrELse(K key, V defaultValue) throws IOException { + V result = get(key); + return result == null ? defaultValue : result; } + } diff --git a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJavaIndexedStorage.java b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJavaIndexedStorage.java new file mode 100644 index 00000000..afedbbfa --- /dev/null +++ b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJavaIndexedStorage.java @@ -0,0 +1,74 @@ +package edu.harvard.hms.dbmi.avillach.hpds.storage; + +import org.apache.commons.io.output.ByteArrayOutputStream; + +import java.io.*; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + +public class FileBackedJavaIndexedStorage extends FileBackedByteIndexedStorage { + public FileBackedJavaIndexedStorage(Class keyClass, Class valueClass, File storageFile) throws FileNotFoundException { + super(keyClass, valueClass, storageFile); + } + + public void put(K key, V value) throws IOException { + if(completed) { + throw new RuntimeException("A completed FileBackedByteIndexedStorage cannot be modified."); + } + Long[] recordIndex = store(value); + index.put(key, recordIndex); + } + + private Long[] store(V value) throws IOException { + + ByteArrayOutputStream out = new ByteArrayOutputStream(); + ObjectOutputStream oos = new ObjectOutputStream(new GZIPOutputStream(out)); + oos.writeObject(value); + oos.flush(); + oos.close(); + + Long[] recordIndex = new Long[2]; + synchronized(storage) { + storage.seek(storage.length()); + recordIndex[0] = storage.getFilePointer(); + storage.write(out.toByteArray()); + recordIndex[1] = storage.getFilePointer() - recordIndex[0]; +// maxStorageSize = storage.getFilePointer(); + } + return recordIndex; + } + + public V get(K key) throws IOException { + if(this.storage==null) { + synchronized(this) { + this.open(); + } + } + Long[] offsetsInStorage = index.get(key); + if(offsetsInStorage != null) { + Long offsetInStorage = index.get(key)[0]; + int offsetLength = index.get(key)[1].intValue(); + if(offsetInStorage != null && offsetLength>0) { + byte[] buffer = new byte[offsetLength]; + synchronized(storage) { + storage.seek(offsetInStorage); + storage.readFully(buffer); + } + ObjectInputStream in = new ObjectInputStream(new GZIPInputStream(new ByteArrayInputStream(buffer))); + + try { + V readObject = (V) in.readObject(); + return readObject; + } catch (ClassNotFoundException e) { + throw new RuntimeException("This should never happen."); + } finally { + in.close(); + } + }else { + return null; + } + } else { + return null; + } + } +} diff --git a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJsonIndexStorage.java b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJsonIndexStorage.java new file mode 100644 index 00000000..c4d751a1 --- /dev/null +++ b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJsonIndexStorage.java @@ -0,0 +1,89 @@ +package edu.harvard.hms.dbmi.avillach.hpds.storage; + +import org.apache.commons.io.output.ByteArrayOutputStream; +import org.codehaus.jackson.map.ObjectMapper; +import org.codehaus.jackson.type.TypeReference; + +import java.io.*; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + +public abstract class FileBackedJsonIndexStorage extends FileBackedByteIndexedStorage { + private static final long serialVersionUID = -1086729119489479152L; + + protected transient ObjectMapper objectMapper = new ObjectMapper(); + + public FileBackedJsonIndexStorage(File storageFile) throws FileNotFoundException { + super(null, null, storageFile); + } + + public void put(K key, V value) throws IOException { + if(completed) { + throw new RuntimeException("A completed FileBackedByteIndexedStorage cannot be modified."); + } + Long[] recordIndex = store(value); + index.put(key, recordIndex); + } + + private Long[] store(V value) throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + objectMapper.writeValue(new GZIPOutputStream(out), value); + + Long[] recordIndex = new Long[2]; + synchronized(storage) { + storage.seek(storage.length()); + recordIndex[0] = storage.getFilePointer(); + storage.write(out.toByteArray()); + recordIndex[1] = storage.getFilePointer() - recordIndex[0]; +// maxStorageSize = storage.getFilePointer(); + } + return recordIndex; + } + + public V get(K key) throws IOException { + if(this.storage==null) { + synchronized(this) { + this.open(); + } + } + Long[] offsetsInStorage = index.get(key); + if(offsetsInStorage != null) { + Long offsetInStorage = index.get(key)[0]; + int offsetLength = index.get(key)[1].intValue(); + if(offsetInStorage != null && offsetLength>0) { + byte[] buffer = new byte[offsetLength]; + synchronized(storage) { + storage.seek(offsetInStorage); + storage.readFully(buffer); + } + try { + V readObject = readObject(buffer); + return readObject; + } catch (Exception e) { + System.out.println("Unable to deserialize, " + e.getMessage()); + System.out.println(new String(buffer)); + throw new RuntimeException(e); + } + }else { + return null; + } + } else { + return null; + } + } + + protected V readObject(byte[] buffer) { + try { + return objectMapper.readValue(new GZIPInputStream(new ByteArrayInputStream(buffer)), getTypeReference()); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public abstract TypeReference getTypeReference(); + + private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException { + in.defaultReadObject(); + objectMapper = new ObjectMapper(); + } +} diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java index 55d2422f..5aa5a0c7 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java @@ -6,6 +6,7 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.stream.Collectors; +import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJavaIndexedStorage; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -121,7 +122,7 @@ public BucketIndexBySample(VariantStore variantStore, String storageDir) throws }); // populate patientBucketMasks with bucketMasks for each patient - patientBucketMasks = new FileBackedByteIndexedStorage(Integer.class, BigInteger.class, new File(storageFileStr)); + patientBucketMasks = new FileBackedJavaIndexedStorage(Integer.class, BigInteger.class, new File(storageFileStr)); //the process to write out the bucket masks takes a very long time. //Lets spin up another thread that occasionally logs progress diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/FileBackedByteIndexedInfoStore.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/FileBackedByteIndexedInfoStore.java index f282707b..a92f4aa2 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/FileBackedByteIndexedInfoStore.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/FileBackedByteIndexedInfoStore.java @@ -12,6 +12,7 @@ import java.util.stream.Collectors; import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedByteIndexedStorage; +import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJavaIndexedStorage; public class FileBackedByteIndexedInfoStore implements Serializable { @@ -51,7 +52,7 @@ public void complete() { } public FileBackedByteIndexedInfoStore(File storageFolder, InfoStore infoStore) throws IOException { - this.allValues = new FileBackedByteIndexedStorage(String.class, String[].class, + this.allValues = new FileBackedJavaIndexedStorage<>(String.class, String[].class, new File(storageFolder, infoStore.column_key + "_infoStoreStorage.javabin")); this.description = infoStore.description; this.column_key = infoStore.column_key; diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMasks.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMasks.java index 9d3e599a..a8e2b75d 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMasks.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMasks.java @@ -1,5 +1,9 @@ package edu.harvard.hms.dbmi.avillach.hpds.data.genotype; +import org.codehaus.jackson.annotate.JsonProperty; +import org.codehaus.jackson.map.annotate.JsonSerialize; +import org.codehaus.jackson.map.ser.ToStringSerializer; + import java.io.Serializable; import java.math.BigInteger; @@ -166,8 +170,19 @@ public VariantMasks(char[][] maskValues) { } + public VariantMasks() { + } + + @JsonProperty("ho") + @JsonSerialize(using = ToStringSerializer.class, include=JsonSerialize.Inclusion.NON_NULL) public BigInteger homozygousMask; + @JsonProperty("he") + @JsonSerialize(using = ToStringSerializer.class, include=JsonSerialize.Inclusion.NON_NULL) public BigInteger heterozygousMask; + @JsonProperty("hon") + @JsonSerialize(using = ToStringSerializer.class, include=JsonSerialize.Inclusion.NON_NULL) public BigInteger homozygousNoCallMask; + @JsonProperty("hen") + @JsonSerialize(using = ToStringSerializer.class, include=JsonSerialize.Inclusion.NON_NULL) public BigInteger heterozygousNoCallMask; } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndex.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndex.java index 3b706cbc..27a1f2f3 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndex.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndex.java @@ -6,6 +6,7 @@ import java.util.stream.Collectors; import java.util.zip.GZIPInputStream; +import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJavaIndexedStorage; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -161,7 +162,7 @@ public synchronized void flush() throws IOException { if(contigFbbis == null) { log.info("creating new file for " + contig); String filePath = fileStoragePrefix + "_" + contig + ".bin"; - contigFbbis = new FileBackedByteIndexedStorage>(Integer.class, (Class>)(Class) ConcurrentHashMap.class, new File(filePath)); + contigFbbis = new FileBackedJavaIndexedStorage(Integer.class, (Class>)(Class) ConcurrentHashMap.class, new File(filePath)); indexMap.put(contig, contigFbbis); } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java index d3758401..880eca9a 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java @@ -1,25 +1,17 @@ package edu.harvard.hms.dbmi.avillach.hpds.data.genotype; +import com.google.common.collect.RangeSet; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; +import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJsonIndexStorage; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.io.*; import java.math.BigInteger; import java.util.*; import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; - -import com.google.errorprone.annotations.Var; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.slf4j.event.Level; - -import com.google.common.collect.RangeSet; - -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; -import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedByteIndexedStorage; public class VariantStore implements Serializable { private static final long serialVersionUID = -6970128712587609414L; @@ -33,13 +25,13 @@ public class VariantStore implements Serializable { private String[] vcfHeaders = new String[24]; - private TreeMap>> variantMaskStorage = new TreeMap<>(); + private TreeMap>> variantMaskStorage = new TreeMap<>(); - public TreeMap>> getVariantMaskStorage() { + public TreeMap>> getVariantMaskStorage() { return variantMaskStorage; } - public void setVariantMaskStorage(TreeMap>> variantMaskStorage) { + public void setVariantMaskStorage(TreeMap>> variantMaskStorage) { this.variantMaskStorage = variantMaskStorage; } @@ -61,7 +53,7 @@ public static VariantStore deserializeInstance(String genomicDataDirectory) thro public ArrayList listVariants() { ArrayList allVariants = new ArrayList<>(); for (String key : variantMaskStorage.keySet()) { - FileBackedByteIndexedStorage> storage = variantMaskStorage + FileBackedJsonIndexStorage> storage = variantMaskStorage .get(key); storage.keys().stream().forEach((Integer bucket) -> { try { @@ -82,7 +74,7 @@ public Map countVariants() { TreeMap counts = new TreeMap<>(); for (String contig : variantMaskStorage.keySet()) { counts.put(contig, new int[5]); - FileBackedByteIndexedStorage> storage = variantMaskStorage + FileBackedJsonIndexStorage> storage = variantMaskStorage .get(contig); storage.keys().stream().forEach((Integer key) -> { int[] contigCounts = counts.get(contig); @@ -181,7 +173,7 @@ public void setVariantStorageSize(int variantStorageSize) { public List getMasksForRangesOfChromosome(String contigForGene, List offsetsForGene, RangeSet rangeSetsForGene) throws IOException { - FileBackedByteIndexedStorage masksForChromosome = variantMaskStorage.get(contigForGene); + FileBackedJsonIndexStorage masksForChromosome = variantMaskStorage.get(contigForGene); Set bucketsForGene = offsetsForGene.stream().map((offset) -> { return offset / BUCKET_SIZE; }).collect(Collectors.toSet()); diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/storage/FileBackedStorageVariantMasksImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/storage/FileBackedStorageVariantMasksImpl.java new file mode 100644 index 00000000..6d39d79a --- /dev/null +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/storage/FileBackedStorageVariantMasksImpl.java @@ -0,0 +1,25 @@ +package edu.harvard.hms.dbmi.avillach.hpds.data.storage; + +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; +import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJsonIndexStorage; +import org.codehaus.jackson.type.TypeReference; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.Serializable; +import java.util.concurrent.ConcurrentHashMap; + +public class FileBackedStorageVariantMasksImpl extends FileBackedJsonIndexStorage> implements Serializable { + private static final long serialVersionUID = -1086729119489479152L; + + public FileBackedStorageVariantMasksImpl(File storageFile) throws FileNotFoundException { + super(storageFile); + } + private static final TypeReference> typeRef + = new TypeReference>() {}; + + @Override + public TypeReference> getTypeReference() { + return typeRef; + } +} diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java index f8b9aac0..f85a4eb3 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java @@ -8,9 +8,13 @@ import java.util.stream.Collectors; import java.util.zip.GZIPOutputStream; +import edu.harvard.hms.dbmi.avillach.hpds.data.storage.FileBackedStorageVariantMasksImpl; +import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJavaIndexedStorage; +import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJsonIndexStorage; import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVParser; import org.apache.commons.csv.CSVRecord; +import org.codehaus.jackson.type.TypeReference; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -58,7 +62,7 @@ public static void main(String[] args) throws FileNotFoundException, IOException private static HashMap zygosityMaskStrings; - private static TreeMap>> variantMaskStorage = new TreeMap<>(); + private static TreeMap>> variantMaskStorage = new TreeMap<>(); private static long startTime; @@ -283,7 +287,7 @@ private static void flipChunk(String lastContigProcessed, int lastChunkProcessed } if (!currentContig.contentEquals(lastContigProcessed) || currentChunk > lastChunkProcessed || isLastChunk) { // flip chunk - TreeMap>> variantMaskStorage_f = variantMaskStorage; + TreeMap>> variantMaskStorage_f = variantMaskStorage; HashMap zygosityMaskStrings_f = zygosityMaskStrings; String lastContigProcessed_f = lastContigProcessed; int lastChunkProcessed_f = lastChunkProcessed; @@ -294,9 +298,9 @@ private static void flipChunk(String lastContigProcessed, int lastChunkProcessed if ("chr".startsWith(fileName)) { fileName = "chr" + fileName; } + variantMaskStorage_f.put(lastContigProcessed_f, - new FileBackedByteIndexedStorage(Integer.class, ConcurrentHashMap.class, - new File(storageDir, fileName))); + new FileBackedStorageVariantMasksImpl(new File(storageDir, fileName))); } variantMaskStorage_f.get(lastContigProcessed_f).put(lastChunkProcessed_f, convertLoadingMapToMaskMap(zygosityMaskStrings_f)); @@ -313,7 +317,7 @@ private static void flipChunk(String lastContigProcessed, int lastChunkProcessed } private static void saveVariantStore(VariantStore store, - TreeMap>> variantMaskStorage) + TreeMap>> variantMaskStorage) throws IOException, FileNotFoundException { store.setVariantMaskStorage(variantMaskStorage); for (FileBackedByteIndexedStorage> storage : variantMaskStorage diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java index 67171e4d..dbf9af30 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java @@ -5,6 +5,7 @@ import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantStore; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.caching.VariantBucketHolder; import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedByteIndexedStorage; +import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJavaIndexedStorage; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.stereotype.Service; @@ -12,7 +13,6 @@ import java.io.*; import java.math.BigInteger; import java.util.*; -import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; @@ -70,19 +70,24 @@ public VariantService() throws IOException, ClassNotFoundException, InterruptedE } } - public void populateVariantIndex() throws InterruptedException { + public String[] loadVariantIndex() { //skip if we have no variants if(variantStore.getPatientIds().length == 0) { - variantIndex = new String[0]; log.warn("No Genomic Data found. Skipping variant Indexing"); - return; + return new String[0]; } + String[] variantIndex = loadVariantIndexFromFile(VARIANT_SPEC_INDEX_FILE); - try (ObjectInputStream objectInputStream = new ObjectInputStream(new GZIPInputStream(new FileInputStream(VARIANT_SPEC_INDEX_FILE)));){ + log.info("Index created with " + variantIndex.length + " total variants."); + return variantIndex; + } + + public static String[] loadVariantIndexFromFile(String variantSpecIndexFile) { + try (ObjectInputStream objectInputStream = new ObjectInputStream(new GZIPInputStream(new FileInputStream(variantSpecIndexFile)));){ List variants = (List) objectInputStream.readObject(); - variantIndex = variants.toArray(new String[0]); + return variants.toArray(new String[0]); } catch (FileNotFoundException e) { throw new RuntimeException(e); @@ -91,8 +96,6 @@ public void populateVariantIndex() throws InterruptedException { } catch (ClassNotFoundException e) { throw new RuntimeException(e); } - - log.info("Index created with " + variantIndex.length + " total variants."); } /** @@ -106,9 +109,9 @@ private void loadGenomicCacheFiles() throws FileNotFoundException, IOException, if(variantIndex==null) { if(!new File(VARIANT_INDEX_FBBIS_FILE).exists()) { log.info("Creating new " + VARIANT_INDEX_FBBIS_FILE); - populateVariantIndex(); + this.variantIndex = loadVariantIndex(); FileBackedByteIndexedStorage fbbis = - new FileBackedByteIndexedStorage(Integer.class, String[].class, new File(VARIANT_INDEX_FBBIS_STORAGE_FILE)); + new FileBackedJavaIndexedStorage<>(Integer.class, String[].class, new File(VARIANT_INDEX_FBBIS_STORAGE_FILE)); try (ObjectOutputStream oos = new ObjectOutputStream(new GZIPOutputStream(new FileOutputStream(VARIANT_INDEX_FBBIS_FILE))); ){ From dbf3a2f4206acca53e21e01cd4c5fa669c5f851d Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Thu, 15 Jun 2023 15:52:34 -0400 Subject: [PATCH 06/39] ALS-4461: Initial commit for genomic dataset merger --- .../hpds/data/genotype/VariantStore.java | 6 +- .../etl/genotype/GenomicDatasetMerger.java | 291 ++++++++++++++++++ 2 files changed, 294 insertions(+), 3 deletions(-) create mode 100644 etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java index 880eca9a..0365c429 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java @@ -25,13 +25,13 @@ public class VariantStore implements Serializable { private String[] vcfHeaders = new String[24]; - private TreeMap>> variantMaskStorage = new TreeMap<>(); + private Map>> variantMaskStorage = new TreeMap<>(); - public TreeMap>> getVariantMaskStorage() { + public Map>> getVariantMaskStorage() { return variantMaskStorage; } - public void setVariantMaskStorage(TreeMap>> variantMaskStorage) { + public void setVariantMaskStorage(Map>> variantMaskStorage) { this.variantMaskStorage = variantMaskStorage; } diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java new file mode 100644 index 00000000..48f6a80d --- /dev/null +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -0,0 +1,291 @@ +package edu.harvard.hms.dbmi.avillach.hpds.etl.genotype; + +import com.google.common.collect.Sets; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.*; +import edu.harvard.hms.dbmi.avillach.hpds.data.storage.FileBackedStorageVariantMasksImpl; +import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedByteIndexedStorage; +import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJsonIndexStorage; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; +import java.math.BigInteger; +import java.util.*; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentSkipListSet; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + +public class GenomicDatasetMerger { + + private static Logger log = LoggerFactory.getLogger(GenomicDatasetMerger.class); + + private final VariantStore variantStore1; + private final VariantStore variantStore2; + + private final String genomicDirectory1; + private final String genomicDirectory2; + + private final String outputDirectory; + + public GenomicDatasetMerger(String genomicDirectory1, String genomicDirectory2, String outputDirectory) throws IOException, ClassNotFoundException, InterruptedException { + this.genomicDirectory1 = genomicDirectory1; + this.genomicDirectory2 = genomicDirectory2; + this.variantStore1 = VariantStore.deserializeInstance(genomicDirectory1); + this.variantStore2 = VariantStore.deserializeInstance(genomicDirectory2); + + validate(); + this.outputDirectory = outputDirectory; + } + + private void validate() { + if (!variantStore1.getVariantMaskStorage().keySet().equals(variantStore2.getVariantMaskStorage().keySet())) { + log.error("Variant store chromosomes do not match:"); + log.error(String.join(", ", variantStore1.getVariantMaskStorage().keySet())); + log.error(String.join(", ", variantStore2.getVariantMaskStorage().keySet())); + throw new IllegalStateException("Unable to merge variant stores with different numbers of chromosomes"); + } + } + + public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { + + } + + public void mergeVariantStore(Map>> mergedChromosomeMasks) { + VariantStore mergedVariantStore = new VariantStore(); + mergedVariantStore.setVariantMaskStorage(mergedChromosomeMasks); + mergedVariantStore.setPatientIds(mergePatientIds()); + // todo: duplicated from NewVCFLoader, refactor to common location + try (FileOutputStream fos = new FileOutputStream(new File(outputDirectory, "variantStore.javabin")); + GZIPOutputStream gzos = new GZIPOutputStream(fos); + ObjectOutputStream oos = new ObjectOutputStream(gzos);) { + oos.writeObject(mergedVariantStore); + } catch (RuntimeException e) { + throw e; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + public Map mergeVariantIndexes() throws IOException { + String[] variantIndex1 = loadVariantIndexFromFile(genomicDirectory1 + "variantSpecIndex.javabin"); + String[] variantIndex2 = loadVariantIndexFromFile(genomicDirectory2 + "variantSpecIndex.javabin"); + + Map variantSpecToIndexMap = new HashMap<>(); + LinkedList variantSpecList = new LinkedList<>(Arrays.asList(variantIndex1)); + for (int i = 0; i < variantIndex1.length; i++) { + variantSpecToIndexMap.put(variantIndex1[i], i); + } + + // Will contain any re-mapped indexes in the second variant index. For example, if a variant is contained in both + // data sets, the merged data set will use the index from dataset 1 to reference it, and any references in data + // set 2 for this variant needs to be re-mapped. Likewise, if a variant in set 2 is new, it will be appended to + // the list and also need to be re-mapped + Integer[] remappedIndexes = new Integer[variantIndex2.length]; + + for (int i = 0; i < variantIndex2.length; i++) { + String variantSpec = variantIndex2[i]; + Integer variantIndex = variantSpecToIndexMap.get(variantSpec); + if (variantIndex != null) { + remappedIndexes[i] = variantIndex; + } else { + variantSpecList.add(variantSpec); + // the new index is the now last item in the list + int newVariantSpecIndex = variantSpecList.size() - 1; + remappedIndexes[i] = newVariantSpecIndex; + variantSpecToIndexMap.put(variantSpec, newVariantSpecIndex); + } + } + + Map infoStores1 = loadInfoStores(genomicDirectory1); + Map infoStores2 = loadInfoStores(genomicDirectory2); + Map mergedInfoStores = new HashMap<>(); + + if (!infoStores1.keySet().equals(infoStores2.keySet())) { + throw new IllegalStateException("Info stores do not match"); + } + for (Map.Entry infoStores1Entry : infoStores1.entrySet()) { + FileBackedByteIndexedInfoStore infoStore2 = infoStores2.get(infoStores1Entry.getKey()); + + FileBackedByteIndexedStorage allValuesStore1 = infoStores1Entry.getValue().getAllValues(); + FileBackedByteIndexedStorage allValuesStore2 = infoStore2.getAllValues(); + //FileBackedByteIndexedStorage mergedIndexedStorage = new FileBackedJavaIndexedStorage<>(String.class, String[].class, new File(outputDirectory)); + ConcurrentHashMap> mergedInfoStoreValues = new ConcurrentHashMap<>(); + + Sets.SetView allKeys = Sets.intersection(allValuesStore1.keys(), allValuesStore2.keys()); + for (String key : allKeys) { + Set store1Values = new HashSet<>(Arrays.asList(allValuesStore1.getOrELse(key, new String[]{}))); + Set store2Values = new HashSet<>(Arrays.asList(allValuesStore2.getOrELse(key, new String[]{}))); + Set remappedValuesStore2 = store2Values.stream().map(Integer::parseInt).map(value -> remappedIndexes[value]).map(Object::toString).collect(Collectors.toSet()); + + Set mergedValues = Sets.union(store1Values, remappedValuesStore2); + mergedInfoStoreValues.put(key, new ConcurrentSkipListSet<>(mergedValues)); + } + + InfoStore infoStore = new InfoStore(infoStore2.description, null, infoStores1Entry.getKey()); + infoStore.allValues = mergedInfoStoreValues; + FileBackedByteIndexedInfoStore mergedStore = new FileBackedByteIndexedInfoStore(new File(outputDirectory), infoStore); + mergedInfoStores.put(infoStores1Entry.getKey(), mergedStore); + writeStore(new File(outputDirectory + infoStore.column_key + "_infoStore.javabin"), mergedStore); + } + + try (FileOutputStream fos = new FileOutputStream(new File(outputDirectory, "variantSpecIndex.javabin")); + GZIPOutputStream gzos = new GZIPOutputStream(fos); + ObjectOutputStream oos = new ObjectOutputStream(gzos);) { + oos.writeObject(variantSpecList); + } + + return mergedInfoStores; + } + + // todo: this is duplicated from VCFPerPatientInfoStoreToFBBIISConverter, refactor to common location + private void writeStore(File outputFile, FileBackedByteIndexedInfoStore fbbiis) + throws FileNotFoundException, IOException { + FileOutputStream fos = new FileOutputStream(outputFile); + GZIPOutputStream gzos = new GZIPOutputStream(fos); + ObjectOutputStream oos = new ObjectOutputStream(gzos); + oos.writeObject(fbbiis); + oos.flush(); + oos.close(); + gzos.flush(); + gzos.close(); + fos.flush(); + fos.close(); + } + + private Map loadInfoStores(String directory) { + Map infoStores = new HashMap<>(); + File genomicDataDirectory = new File(directory); + if(genomicDataDirectory.exists() && genomicDataDirectory.isDirectory()) { + Arrays.stream(genomicDataDirectory.list((file, filename)->{return filename.endsWith("infoStore.javabin");})) + .forEach((String filename)->{ + try ( + FileInputStream fis = new FileInputStream(directory + filename); + GZIPInputStream gis = new GZIPInputStream(fis); + ObjectInputStream ois = new ObjectInputStream(gis) + ){ + log.info("loading " + filename); + FileBackedByteIndexedInfoStore infoStore = (FileBackedByteIndexedInfoStore) ois.readObject(); + infoStores.put(filename.replace("_infoStore.javabin", ""), infoStore); + ois.close(); + } catch (IOException | ClassNotFoundException e) { + e.printStackTrace(); + } + }); + } + return infoStores; + } + + private String[] mergePatientIds() { + return Stream.concat(Arrays.stream(variantStore1.getPatientIds()), Arrays.stream(variantStore2.getPatientIds())) + .toArray(String[]::new); + } + + public Map>> mergeChromosomeMasks() throws FileNotFoundException { + Map>> mergedMaskStorage = new HashMap<>(); + for (String chromosome : variantStore1.getVariantMaskStorage().keySet()) { + mergedMaskStorage.put(chromosome, mergeChromosomeMask(chromosome)); + } + return mergedMaskStorage; + } + + public FileBackedJsonIndexStorage> mergeChromosomeMask(String chromosome) throws FileNotFoundException { + FileBackedJsonIndexStorage> variantMaskStorage1 = variantStore1.getVariantMaskStorage().get(chromosome); + FileBackedJsonIndexStorage> variantMaskStorage2 = variantStore2.getVariantMaskStorage().get(chromosome); + + FileBackedJsonIndexStorage> merged = new FileBackedStorageVariantMasksImpl(new File(outputDirectory + chromosome + "masks.bin")); + variantMaskStorage1.keys().forEach(key -> { + try { + Map masks1 = variantMaskStorage1.get(key); + Map masks2 = variantMaskStorage2.get(key); + if (masks2 == null) { + masks2 = Map.of(); + } + + ConcurrentHashMap mergedMasks = new ConcurrentHashMap<>(); + for (Map.Entry entry : masks1.entrySet()) { + VariantMasks variantMasks2 = masks2.get(entry.getKey()); + if (variantMasks2 == null) { + // this will have all null masks, which will result in null when + // appended to a null, or be replaced with an empty bitmask otherwise + variantMasks2 = new VariantMasks(); + } + mergedMasks.put(entry.getKey(), append(entry.getValue(), variantMasks2)); + } + // Any entry in the second set that is not in the merged set can be merged with an empty variant mask, + // if there were a corresponding entry in set 1, it would have been merged in the previous loop + for (Map.Entry entry : masks2.entrySet()) { + if (!mergedMasks.containsKey(entry.getKey())) { + mergedMasks.put(entry.getKey(), append(new VariantMasks(), entry.getValue())); + } + } + merged.put(key, mergedMasks); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + + ConcurrentHashMap mergedMasks = new ConcurrentHashMap<>(); + variantMaskStorage2.keys().forEach(key -> { + try { + Map masks2 = variantMaskStorage2.get(key); + for (Map.Entry entry : masks2.entrySet()) { + if (!mergedMasks.containsKey(entry.getKey())) { + mergedMasks.put(entry.getKey(), append(new VariantMasks(), entry.getValue())); + } + } + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + return merged; + } + + public VariantMasks append(VariantMasks variantMasks1, VariantMasks variantMasks2) { + VariantMasks appendedMasks = new VariantMasks(); + appendedMasks.homozygousMask = appendMask(variantMasks1.homozygousMask, variantMasks2.homozygousMask); + appendedMasks.heterozygousMask = appendMask(variantMasks1.heterozygousMask, variantMasks2.heterozygousMask); + appendedMasks.homozygousNoCallMask = appendMask(variantMasks1.homozygousNoCallMask, variantMasks2.homozygousNoCallMask); + appendedMasks.heterozygousNoCallMask = appendMask(variantMasks1.heterozygousNoCallMask, variantMasks2.heterozygousNoCallMask); + return appendedMasks; + } + + /** + * Appends one mask to another. This assumes the masks are both padded with '11' on each end + * to prevent overflow issues. + */ + public BigInteger appendMask(BigInteger mask1, BigInteger mask2) { + if (mask1 == null && mask2 == null) { + return null; + } + if (mask1 == null) { + mask1 = variantStore1.emptyBitmask(); + } + if (mask2 == null) { + mask2 = variantStore2.emptyBitmask(); + } + String binaryMask1 = mask1.toString(2); + String binaryMask2 = mask2.toString(2); + String appendedString = binaryMask1.substring(0, binaryMask1.length() - 2) + + binaryMask2.substring(2); + return new BigInteger(appendedString, 2); + } + + // todo: duplicaed from variant service, refactor + public String[] loadVariantIndexFromFile(String variantSpecIndexFile) { + try (ObjectInputStream objectInputStream = new ObjectInputStream(new GZIPInputStream(new FileInputStream(variantSpecIndexFile)));){ + + List variants = (List) objectInputStream.readObject(); + return variants.toArray(new String[0]); + + } catch (FileNotFoundException e) { + throw new RuntimeException(e); + } catch (IOException e) { + throw new RuntimeException(e); + } catch (ClassNotFoundException e) { + throw new RuntimeException(e); + } + } +} From 01efbf093906b4bd9d19f726c7bcb250686849fe Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Fri, 16 Jun 2023 13:39:32 -0400 Subject: [PATCH 07/39] ALS-4461: Add jar with dependencies build instructions --- etl/pom.xml | 22 ++++++++++++++++++- .../etl/genotype/GenomicDatasetMerger.java | 14 ++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/etl/pom.xml b/etl/pom.xml index 0b69e8fa..9853188e 100644 --- a/etl/pom.xml +++ b/etl/pom.xml @@ -325,7 +325,27 @@ single - + + + buildGenomicDatasetMerger + + + + edu.harvard.hms.dbmi.avillach.hpds.etl.genotype.GenomicDatasetMerger + + + ${project.basedir}/../docker/pic-sure-hpds-etl + + jar-with-dependencies + + GenomicDatasetMerger + GenomicDatasetMerger + + package + + single + + diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index 48f6a80d..b8c3af32 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -49,8 +49,22 @@ private void validate() { } } + /** + * args[0]: directory containing genomic dataset 1 + * args[1]: directory containing genomic dataset 2 + * args[2]: output directory + */ public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { + long time = System.currentTimeMillis(); + GenomicDatasetMerger genomicDatasetMerger = new GenomicDatasetMerger(args[0], args[1], args[2]); + genomicDatasetMerger.merge(); + log.info("Finished in " + (System.currentTimeMillis() - time) + " + ms"); + } + public void merge() throws IOException { + Map>> mergedChromosomeMasks = mergeChromosomeMasks(); + mergeVariantStore(mergedChromosomeMasks); + Map mergedVariantIndexes = mergeVariantIndexes(); } public void mergeVariantStore(Map>> mergedChromosomeMasks) { From 68e849eed5ff33b4bec5d1e736175c8ad658e36c Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Tue, 20 Jun 2023 09:38:28 -0400 Subject: [PATCH 08/39] ALS-4461: Fix issue with hardcoded directory --- .../storage/FileBackedByteIndexedStorage.java | 20 +++++++++---------- .../FileBackedByteIndexedInfoStore.java | 4 ++++ .../etl/genotype/GenomicDatasetMerger.java | 1 + 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java index 0de983d4..eed6286f 100644 --- a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java +++ b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java @@ -1,19 +1,9 @@ package edu.harvard.hms.dbmi.avillach.hpds.storage; -import java.io.ByteArrayInputStream; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.RandomAccessFile; -import java.io.Serializable; +import java.io.*; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.function.Function; -import java.util.zip.GZIPInputStream; - -import org.apache.commons.io.output.ByteArrayOutputStream; -import org.codehaus.jackson.map.ObjectMapper; public abstract class FileBackedByteIndexedStorage implements Serializable { private static final long serialVersionUID = -7297090745384302635L; @@ -30,6 +20,14 @@ public FileBackedByteIndexedStorage(Class keyClass, Class valueClass, File this.storage = new RandomAccessFile(this.storageFile, "rw"); } + public void updateStorageDirectory(File storageDirectory) { + if (!storageDirectory.isDirectory()) { + throw new IllegalArgumentException("storageDirectory is not a directory"); + } + String currentStoreageFilename = storageFile.getName(); + storageFile = new File(storageDirectory, currentStoreageFilename); + } + public Set keys(){ return index.keySet(); } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/FileBackedByteIndexedInfoStore.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/FileBackedByteIndexedInfoStore.java index a92f4aa2..7ba81edb 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/FileBackedByteIndexedInfoStore.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/FileBackedByteIndexedInfoStore.java @@ -140,5 +140,9 @@ private static boolean keyHasMultipleValues(String[] keys) { return x>1; } + public void updateStorageDirectory(File storageDirectory) { + allValues.updateStorageDirectory(storageDirectory); + } + } diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index b8c3af32..c844e857 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -182,6 +182,7 @@ private Map loadInfoStores(String direct ){ log.info("loading " + filename); FileBackedByteIndexedInfoStore infoStore = (FileBackedByteIndexedInfoStore) ois.readObject(); + infoStore.updateStorageDirectory(genomicDataDirectory); infoStores.put(filename.replace("_infoStore.javabin", ""), infoStore); ois.close(); } catch (IOException | ClassNotFoundException e) { From a141f9230f659c09a773bb7ee47b1bee1208a53b Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 21 Jun 2023 14:32:03 -0400 Subject: [PATCH 09/39] ALS-4461: Fix more issues with non-relative file paths, various refactoring --- .../FileBackedByteIndexedInfoStore.java | 18 +++++++-- .../data/genotype/VariantMetadataIndex.java | 3 +- .../hpds/data/genotype/VariantStore.java | 20 ++++++++++ .../etl/genotype/GenomicDatasetMerger.java | 37 ++----------------- .../hpds/etl/genotype/NewVCFLoader.java | 4 +- ...FPerPatientInfoStoreToFBBIISConverter.java | 17 +-------- .../hpds/processing/VariantService.java | 21 +---------- 7 files changed, 45 insertions(+), 75 deletions(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/FileBackedByteIndexedInfoStore.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/FileBackedByteIndexedInfoStore.java index 7ba81edb..76ef908d 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/FileBackedByteIndexedInfoStore.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/FileBackedByteIndexedInfoStore.java @@ -1,8 +1,6 @@ package edu.harvard.hms.dbmi.avillach.hpds.data.genotype; -import java.io.File; -import java.io.IOException; -import java.io.Serializable; +import java.io.*; import java.util.ArrayList; import java.util.List; import java.util.TreeMap; @@ -10,6 +8,7 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentSkipListSet; import java.util.stream.Collectors; +import java.util.zip.GZIPOutputStream; import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedByteIndexedStorage; import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJavaIndexedStorage; @@ -144,5 +143,18 @@ public void updateStorageDirectory(File storageDirectory) { allValues.updateStorageDirectory(storageDirectory); } + public void write(File outputFile) + throws IOException { + FileOutputStream fos = new FileOutputStream(outputFile); + GZIPOutputStream gzos = new GZIPOutputStream(fos); + ObjectOutputStream oos = new ObjectOutputStream(gzos); + oos.writeObject(this); + oos.flush(); + oos.close(); + gzos.flush(); + gzos.close(); + fos.flush(); + fos.close(); + } } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndex.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndex.java index 27a1f2f3..9ea75380 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndex.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndex.java @@ -201,7 +201,8 @@ public static VariantMetadataIndex createInstance(String metadataIndexPath) { return (VariantMetadataIndex) in.readObject(); } catch(Exception e) { // todo: handle exceptions better - log.error("No Metadata Index found at " + metadataIndexPath, e); + log.info("No Metadata Index found at " + metadataIndexPath); + log.debug("Error loading metadata index:", e); return null; } } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java index 0365c429..f095fd2c 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java @@ -18,6 +18,8 @@ public class VariantStore implements Serializable { private static Logger log = LoggerFactory.getLogger(VariantStore.class); public static final int BUCKET_SIZE = 1000; + public static final String VARIANT_SPEC_INDEX_FILE = "variantSpecIndex.javabin"; + private BigInteger emptyBitmask; private String[] patientIds; @@ -41,6 +43,9 @@ public static VariantStore deserializeInstance(String genomicDataDirectory) thro VariantStore variantStore = (VariantStore) ois.readObject(); ois.close(); variantStore.open(); + variantStore.getVariantMaskStorage().values().forEach(store -> { + store.updateStorageDirectory(new File(genomicDataDirectory)); + }); return variantStore; } else { //we still need an object to reference when checking the variant store, even if it's empty. @@ -202,4 +207,19 @@ public BigInteger emptyBitmask() { return emptyBitmask; } + public static String[] loadVariantIndexFromFile(String genomicDataDirectory) { + try (ObjectInputStream objectInputStream = new ObjectInputStream(new GZIPInputStream(new FileInputStream(genomicDataDirectory + "/" + VARIANT_SPEC_INDEX_FILE)));){ + + List variants = (List) objectInputStream.readObject(); + return variants.toArray(new String[0]); + + } catch (FileNotFoundException e) { + throw new RuntimeException(e); + } catch (IOException e) { + throw new RuntimeException(e); + } catch (ClassNotFoundException e) { + throw new RuntimeException(e); + } + } + } diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index c844e857..498b6ab9 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -84,8 +84,8 @@ public void mergeVariantStore(Map mergeVariantIndexes() throws IOException { - String[] variantIndex1 = loadVariantIndexFromFile(genomicDirectory1 + "variantSpecIndex.javabin"); - String[] variantIndex2 = loadVariantIndexFromFile(genomicDirectory2 + "variantSpecIndex.javabin"); + String[] variantIndex1 = VariantStore.loadVariantIndexFromFile(genomicDirectory1); + String[] variantIndex2 = VariantStore.loadVariantIndexFromFile(genomicDirectory2); Map variantSpecToIndexMap = new HashMap<>(); LinkedList variantSpecList = new LinkedList<>(Arrays.asList(variantIndex1)); @@ -142,7 +142,7 @@ public Map mergeVariantIndexes() throws infoStore.allValues = mergedInfoStoreValues; FileBackedByteIndexedInfoStore mergedStore = new FileBackedByteIndexedInfoStore(new File(outputDirectory), infoStore); mergedInfoStores.put(infoStores1Entry.getKey(), mergedStore); - writeStore(new File(outputDirectory + infoStore.column_key + "_infoStore.javabin"), mergedStore); + mergedStore.write(new File(outputDirectory + infoStore.column_key + "_infoStore.javabin")); } try (FileOutputStream fos = new FileOutputStream(new File(outputDirectory, "variantSpecIndex.javabin")); @@ -154,21 +154,6 @@ public Map mergeVariantIndexes() throws return mergedInfoStores; } - // todo: this is duplicated from VCFPerPatientInfoStoreToFBBIISConverter, refactor to common location - private void writeStore(File outputFile, FileBackedByteIndexedInfoStore fbbiis) - throws FileNotFoundException, IOException { - FileOutputStream fos = new FileOutputStream(outputFile); - GZIPOutputStream gzos = new GZIPOutputStream(fos); - ObjectOutputStream oos = new ObjectOutputStream(gzos); - oos.writeObject(fbbiis); - oos.flush(); - oos.close(); - gzos.flush(); - gzos.close(); - fos.flush(); - fos.close(); - } - private Map loadInfoStores(String directory) { Map infoStores = new HashMap<>(); File genomicDataDirectory = new File(directory); @@ -287,20 +272,4 @@ public BigInteger appendMask(BigInteger mask1, BigInteger mask2) { binaryMask2.substring(2); return new BigInteger(appendedString, 2); } - - // todo: duplicaed from variant service, refactor - public String[] loadVariantIndexFromFile(String variantSpecIndexFile) { - try (ObjectInputStream objectInputStream = new ObjectInputStream(new GZIPInputStream(new FileInputStream(variantSpecIndexFile)));){ - - List variants = (List) objectInputStream.readObject(); - return variants.toArray(new String[0]); - - } catch (FileNotFoundException e) { - throw new RuntimeException(e); - } catch (IOException e) { - throw new RuntimeException(e); - } catch (ClassNotFoundException e) { - throw new RuntimeException(e); - } - } } diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java index f85a4eb3..b0adc5eb 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java @@ -295,7 +295,7 @@ private static void flipChunk(String lastContigProcessed, int lastChunkProcessed try { if (variantMaskStorage_f.get(lastContigProcessed_f) == null) { String fileName = lastContigProcessed_f + "masks.bin"; - if ("chr".startsWith(fileName)) { + if (!fileName.startsWith("chr")) { fileName = "chr" + fileName; } @@ -386,7 +386,7 @@ private static void saveVariantIndex() throws IOException { private static ConcurrentHashMap convertLoadingMapToMaskMap( HashMap zygosityMaskStrings_f) { - ConcurrentHashMap maskMap = new ConcurrentHashMap<>(); + ConcurrentHashMap maskMap = new ConcurrentHashMap<>(zygosityMaskStrings_f.size()); zygosityMaskStrings_f.entrySet().parallelStream().forEach((entry) -> { maskMap.put(entry.getKey(), new VariantMasks(entry.getValue())); }); diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VCFPerPatientInfoStoreToFBBIISConverter.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VCFPerPatientInfoStoreToFBBIISConverter.java index 2b2b6053..cdfe3cdd 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VCFPerPatientInfoStoreToFBBIISConverter.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VCFPerPatientInfoStoreToFBBIISConverter.java @@ -5,7 +5,6 @@ import java.util.List; import java.util.concurrent.ExecutionException; import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,7 +40,7 @@ public static void convert(File outputFolder, File file) { if (store.allValues.size() > 0) { FileBackedByteIndexedInfoStore fbbiis = new FileBackedByteIndexedInfoStore(outputFolder, store); - writeStore(new File(outputFolder, file.getName()), fbbiis); + fbbiis.write(new File(outputFolder, file.getName())); logger.info("Completed converting InfoStore file: " + file.getAbsolutePath()); } else { logger.info("Skipping empty InfoStore file: " + file.getAbsolutePath() + ""); @@ -55,18 +54,4 @@ public static void convert(File outputFolder, File file) { } } - private static synchronized void writeStore(File outputFile, FileBackedByteIndexedInfoStore fbbiis) - throws FileNotFoundException, IOException { - FileOutputStream fos = new FileOutputStream(outputFile); - GZIPOutputStream gzos = new GZIPOutputStream(fos); - ObjectOutputStream oos = new ObjectOutputStream(gzos); - oos.writeObject(fbbiis); - oos.flush(); - oos.close(); - gzos.flush(); - gzos.close(); - fos.flush(); - fos.close(); - } - } \ No newline at end of file diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java index dbf9af30..c3c6efa9 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java @@ -31,7 +31,6 @@ public class VariantService { private final String VARIANT_INDEX_FBBIS_STORAGE_FILE; private final String VARIANT_INDEX_FBBIS_FILE; private final String BUCKET_INDEX_BY_SAMPLE_FILE; - private final String VARIANT_SPEC_INDEX_FILE; private final VariantStore variantStore; @@ -56,11 +55,10 @@ public Collection filterVariantSetForPatientSet(Set variantSet, } public VariantService() throws IOException, ClassNotFoundException, InterruptedException { - genomicDataDirectory = System.getProperty("HPDS_DATA_HOME", "/opt/local/hpds/all/"); + genomicDataDirectory = System.getProperty("HPDS_GENOMIC_DATA_DIRECTORY", "/opt/local/hpds/all/"); VARIANT_INDEX_FBBIS_STORAGE_FILE = genomicDataDirectory + "variantIndex_fbbis_storage.javabin"; VARIANT_INDEX_FBBIS_FILE = genomicDataDirectory + "variantIndex_fbbis.javabin"; BUCKET_INDEX_BY_SAMPLE_FILE = genomicDataDirectory + "BucketIndexBySample.javabin"; - VARIANT_SPEC_INDEX_FILE = genomicDataDirectory + "variantSpecIndex.javabin"; variantStore = VariantStore.deserializeInstance(genomicDataDirectory); try { @@ -77,27 +75,12 @@ public String[] loadVariantIndex() { return new String[0]; } - String[] variantIndex = loadVariantIndexFromFile(VARIANT_SPEC_INDEX_FILE); + String[] variantIndex = VariantStore.loadVariantIndexFromFile(genomicDataDirectory); log.info("Index created with " + variantIndex.length + " total variants."); return variantIndex; } - public static String[] loadVariantIndexFromFile(String variantSpecIndexFile) { - try (ObjectInputStream objectInputStream = new ObjectInputStream(new GZIPInputStream(new FileInputStream(variantSpecIndexFile)));){ - - List variants = (List) objectInputStream.readObject(); - return variants.toArray(new String[0]); - - } catch (FileNotFoundException e) { - throw new RuntimeException(e); - } catch (IOException e) { - throw new RuntimeException(e); - } catch (ClassNotFoundException e) { - throw new RuntimeException(e); - } - } - /** * This process takes a while (even after the cache is built), so let's spin it out into it's own thread. (not done yet) * @throws FileNotFoundException From 160bc879fa2b1eab4ad6e72aee44f64a65ab24ee Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 21 Jun 2023 14:35:18 -0400 Subject: [PATCH 10/39] ALS-4461: Fix more issues with non-relative file paths, various refactoring --- .../hms/dbmi/avillach/hpds/data/genotype/VariantStore.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java index f095fd2c..5a902caf 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java @@ -42,10 +42,10 @@ public static VariantStore deserializeInstance(String genomicDataDirectory) thro ObjectInputStream ois = new ObjectInputStream(new GZIPInputStream(new FileInputStream(genomicDataDirectory + "variantStore.javabin"))); VariantStore variantStore = (VariantStore) ois.readObject(); ois.close(); - variantStore.open(); variantStore.getVariantMaskStorage().values().forEach(store -> { store.updateStorageDirectory(new File(genomicDataDirectory)); }); + variantStore.open(); return variantStore; } else { //we still need an object to reference when checking the variant store, even if it's empty. From ee6ee2f8e0c6a0921f1fb359240a78c566f2f1d5 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 21 Jun 2023 15:57:22 -0400 Subject: [PATCH 11/39] ALS-4461: Parallelize chromosome mask merging --- .../hpds/etl/genotype/GenomicDatasetMerger.java | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index 498b6ab9..312da0b9 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -184,10 +184,14 @@ private String[] mergePatientIds() { } public Map>> mergeChromosomeMasks() throws FileNotFoundException { - Map>> mergedMaskStorage = new HashMap<>(); - for (String chromosome : variantStore1.getVariantMaskStorage().keySet()) { - mergedMaskStorage.put(chromosome, mergeChromosomeMask(chromosome)); - } + Map>> mergedMaskStorage = new ConcurrentHashMap<>(); + variantStore1.getVariantMaskStorage().keySet().parallelStream().forEach(chromosome -> { + try { + mergedMaskStorage.put(chromosome, mergeChromosomeMask(chromosome)); + } catch (FileNotFoundException e) { + throw new RuntimeException(e); + } + }); return mergedMaskStorage; } From df366d0c1207b115df9d496bc57119dddcf60706 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Thu, 22 Jun 2023 13:24:06 -0400 Subject: [PATCH 12/39] ALS-4461: Updated hpds version in dockerfile --- docker/pic-sure-hpds/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/pic-sure-hpds/Dockerfile b/docker/pic-sure-hpds/Dockerfile index 0b38a4de..64366119 100644 --- a/docker/pic-sure-hpds/Dockerfile +++ b/docker/pic-sure-hpds/Dockerfile @@ -6,6 +6,6 @@ RUN apk add --no-cache --purge -uU curl wget unzip RUN apk add --no-cache --purge openjdk11 -ADD hpds-war-1.0-SNAPSHOT-war-exec.jar /hpds.jar +ADD hpds-war-2.0.0-SNAPSHOT-war-exec.jar /hpds.jar EXPOSE 8080 From 5e3a93ef9ee926ab97cce4312e143ca68f191f00 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Mon, 26 Jun 2023 11:18:00 -0400 Subject: [PATCH 13/39] ALS-4461: Update genomic directory on loading for variant index stores --- .../hms/dbmi/avillach/hpds/processing/AbstractProcessor.java | 1 + 1 file changed, 1 insertion(+) diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessor.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessor.java index e73ea8b2..65a41a25 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessor.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessor.java @@ -126,6 +126,7 @@ public AbstractProcessor(PhenotypeMetaStore phenotypeMetaStore, VariantService v ){ log.info("loading " + filename); FileBackedByteIndexedInfoStore infoStore = (FileBackedByteIndexedInfoStore) ois.readObject(); + infoStore.updateStorageDirectory(genomicDataDirectory); infoStores.put(filename.replace("_infoStore.javabin", ""), infoStore); ois.close(); } catch (IOException | ClassNotFoundException e) { From 7433fdeb8845e18b929998abf91d96395b5990d4 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Thu, 29 Jun 2023 16:57:46 -0400 Subject: [PATCH 14/39] ALS-4461: Change type of variant index store from String (variant spec) to Integer (variant id) --- .../hpds/data/genotype/CompressedIndex.java | 2 +- .../FileBackedByteIndexedInfoStore.java | 40 +++++++++---------- .../hpds/data/genotype/InfoStore.java | 25 +++--------- .../FileBackedStorageVariantIndexImpl.java | 24 +++++++++++ .../etl/genotype/GenomicDatasetMerger.java | 14 +++---- .../hpds/etl/genotype/NewVCFLoader.java | 2 +- .../hpds/processing/VariantIndexCache.java | 16 ++++---- .../processing/AbstractProcessorTest.java | 4 +- 8 files changed, 67 insertions(+), 60 deletions(-) create mode 100644 data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/storage/FileBackedStorageVariantIndexImpl.java diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/CompressedIndex.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/CompressedIndex.java index 6f010c62..b0814e5f 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/CompressedIndex.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/CompressedIndex.java @@ -33,7 +33,7 @@ public class CompressedIndex implements Serializable { private HashMap, byte[]> compressedRangeMap; private int valueCount; - public TreeMap> buildContinuousValuesMap(FileBackedByteIndexedStorage allValues) { + public TreeMap> buildContinuousValuesMap(FileBackedByteIndexedStorage allValues) { TreeMap> continuousValueMap = new TreeMap<>(); for(String key : allValues.keys()) { try{ diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/FileBackedByteIndexedInfoStore.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/FileBackedByteIndexedInfoStore.java index 76ef908d..54a34d9f 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/FileBackedByteIndexedInfoStore.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/FileBackedByteIndexedInfoStore.java @@ -10,6 +10,7 @@ import java.util.stream.Collectors; import java.util.zip.GZIPOutputStream; +import edu.harvard.hms.dbmi.avillach.hpds.data.storage.FileBackedStorageVariantIndexImpl; import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedByteIndexedStorage; import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJavaIndexedStorage; @@ -21,12 +22,12 @@ public class FileBackedByteIndexedInfoStore implements Serializable { public boolean isContinuous; public Float min = Float.MAX_VALUE, max = Float.MIN_VALUE; - private FileBackedByteIndexedStorage allValues; + private FileBackedByteIndexedStorage allValues; public TreeMap> continuousValueMap; public CompressedIndex continuousValueIndex; - public FileBackedByteIndexedStorage getAllValues() { + public FileBackedByteIndexedStorage getAllValues() { return allValues; } @@ -41,8 +42,8 @@ public List search(String term) { } } - public void addEntry(String value, String[] variantSpecs) throws IOException { - allValues.put(value, variantSpecs); + public void addEntry(String value, Integer[] variantIds) throws IOException { + allValues.put(value, variantIds); } @@ -51,8 +52,7 @@ public void complete() { } public FileBackedByteIndexedInfoStore(File storageFolder, InfoStore infoStore) throws IOException { - this.allValues = new FileBackedJavaIndexedStorage<>(String.class, String[].class, - new File(storageFolder, infoStore.column_key + "_infoStoreStorage.javabin")); + this.allValues = new FileBackedStorageVariantIndexImpl(new File(storageFolder, infoStore.column_key + "_infoStoreStorage.javabin")); this.description = infoStore.description; this.column_key = infoStore.column_key; this.isContinuous = infoStore.isNumeric(); @@ -71,8 +71,8 @@ public FileBackedByteIndexedInfoStore(File storageFolder, InfoStore infoStore) t if(x%10000 == 0) { System.out.println(infoStore.column_key + " " + ((((double)x) / sortedKeys.size()) * 100) + "% done"); } - ConcurrentSkipListSet variantSpecs = infoStore.allValues.get(key); - addEntry(key, variantSpecs.toArray(new String[variantSpecs.size()])); + ConcurrentSkipListSet variantIds = infoStore.allValues.get(key); + addEntry(key, variantIds.toArray(new Integer[variantIds.size()])); x++; } } @@ -89,10 +89,10 @@ public FileBackedByteIndexedInfoStore(File storageFolder, InfoStore infoStore) t private static void normalizeNumericStore(InfoStore store) { TreeSet allKeys = new TreeSet(store.allValues.keySet()); - ConcurrentHashMap> normalizedValues = new ConcurrentHashMap<>(); + ConcurrentHashMap> normalizedValues = new ConcurrentHashMap<>(); for(String key : allKeys) { String[] keys = key.split(","); - ConcurrentSkipListSet variantSpecs = store.allValues.get(key); + ConcurrentSkipListSet variantIds = store.allValues.get(key); if(key.contentEquals(".")) { //don't add it }else if(keyHasMultipleValues(keys)) { @@ -100,26 +100,26 @@ private static void normalizeNumericStore(InfoStore store) { if(value.contentEquals(".")) { }else { - ConcurrentSkipListSet normalizedSpecs = normalizedValues.get(value); - if(normalizedSpecs == null) { - normalizedSpecs = variantSpecs; + ConcurrentSkipListSet normalizedVariantIds = normalizedValues.get(value); + if(normalizedVariantIds == null) { + normalizedVariantIds = variantIds; }else { - normalizedSpecs.addAll(variantSpecs); + normalizedVariantIds.addAll(variantIds); } - normalizedValues.put(value, normalizedSpecs); + normalizedValues.put(value, normalizedVariantIds); } } }else { if(key.contentEquals(".")) { }else { - ConcurrentSkipListSet normalizedSpecs = normalizedValues.get(key); - if(normalizedSpecs == null) { - normalizedSpecs = variantSpecs; + ConcurrentSkipListSet normalizedVariantIds = normalizedValues.get(key); + if(normalizedVariantIds == null) { + normalizedVariantIds = variantIds; }else { - normalizedSpecs.addAll(variantSpecs); + normalizedVariantIds.addAll(variantIds); } - normalizedValues.put(key, normalizedSpecs); + normalizedValues.put(key, normalizedVariantIds); } } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/InfoStore.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/InfoStore.java index b58373f2..a62e0591 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/InfoStore.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/InfoStore.java @@ -14,7 +14,7 @@ public class InfoStore implements Serializable { public final String column_key; public final String description; - public ConcurrentHashMap> allValues = new ConcurrentHashMap<>(); + public ConcurrentHashMap> allValues = new ConcurrentHashMap<>(); private String prefix; public List search(String term) { @@ -28,21 +28,6 @@ public List search(String term) { } } - public void processRecord(VariantSpec spec, String[] values) { - for(String value : values) { - if(value.startsWith(column_key + "=")) { - String valueWithoutkey = value.replaceFirst(column_key + "=", ""); - ConcurrentSkipListSet entriesForValue = allValues.get(valueWithoutkey); - if(entriesForValue == null) { - entriesForValue = new ConcurrentSkipListSet<>(); - allValues.put(valueWithoutkey, entriesForValue); - } - entriesForValue.add(spec.specNotation()); - } - } - } - - public InfoStore(String description, String delimiter, String key) { this.prefix = key + "="; this.description = description; @@ -53,7 +38,7 @@ public boolean isNumeric() { int nonNumericCount = 0; int numericCount = 0; System.out.println("Testing for numeric : " + this.column_key + " : " + allValues.size() + " values"); - KeySetView> allKeys = allValues.keySet(); + KeySetView> allKeys = allValues.keySet(); for(String key : allKeys){ try { Double.parseDouble(key); @@ -84,16 +69,16 @@ public boolean isNumeric() { return false; } - public void processRecord(String specNotation, String[] infoValues) { + public void processRecord(Integer variantId, String[] infoValues) { for(String value : infoValues) { if(value.startsWith(prefix)) { String valueWithoutkey = value.substring(prefix.length()); - ConcurrentSkipListSet entriesForValue = allValues.get(valueWithoutkey); + ConcurrentSkipListSet entriesForValue = allValues.get(valueWithoutkey); if(entriesForValue == null) { entriesForValue = new ConcurrentSkipListSet<>(); allValues.put(valueWithoutkey, entriesForValue); } - entriesForValue.add(specNotation); + entriesForValue.add(variantId); } } } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/storage/FileBackedStorageVariantIndexImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/storage/FileBackedStorageVariantIndexImpl.java new file mode 100644 index 00000000..f2ec9e48 --- /dev/null +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/storage/FileBackedStorageVariantIndexImpl.java @@ -0,0 +1,24 @@ +package edu.harvard.hms.dbmi.avillach.hpds.data.storage; + +import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJsonIndexStorage; +import org.codehaus.jackson.type.TypeReference; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.Serializable; + +public class FileBackedStorageVariantIndexImpl extends FileBackedJsonIndexStorage implements Serializable { + private static final long serialVersionUID = -893724459359928779L; + + public FileBackedStorageVariantIndexImpl(File storageFile) throws FileNotFoundException { + super(storageFile); + } + + private static final TypeReference typeRef + = new TypeReference() {}; + + @Override + public TypeReference getTypeReference() { + return typeRef; + } +} diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index 312da0b9..55335174 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -123,18 +123,18 @@ public Map mergeVariantIndexes() throws for (Map.Entry infoStores1Entry : infoStores1.entrySet()) { FileBackedByteIndexedInfoStore infoStore2 = infoStores2.get(infoStores1Entry.getKey()); - FileBackedByteIndexedStorage allValuesStore1 = infoStores1Entry.getValue().getAllValues(); - FileBackedByteIndexedStorage allValuesStore2 = infoStore2.getAllValues(); + FileBackedByteIndexedStorage allValuesStore1 = infoStores1Entry.getValue().getAllValues(); + FileBackedByteIndexedStorage allValuesStore2 = infoStore2.getAllValues(); //FileBackedByteIndexedStorage mergedIndexedStorage = new FileBackedJavaIndexedStorage<>(String.class, String[].class, new File(outputDirectory)); - ConcurrentHashMap> mergedInfoStoreValues = new ConcurrentHashMap<>(); + ConcurrentHashMap> mergedInfoStoreValues = new ConcurrentHashMap<>(); Sets.SetView allKeys = Sets.intersection(allValuesStore1.keys(), allValuesStore2.keys()); for (String key : allKeys) { - Set store1Values = new HashSet<>(Arrays.asList(allValuesStore1.getOrELse(key, new String[]{}))); - Set store2Values = new HashSet<>(Arrays.asList(allValuesStore2.getOrELse(key, new String[]{}))); - Set remappedValuesStore2 = store2Values.stream().map(Integer::parseInt).map(value -> remappedIndexes[value]).map(Object::toString).collect(Collectors.toSet()); + Set store1Values = new HashSet<>(Arrays.asList(allValuesStore1.getOrELse(key, new Integer[]{}))); + Set store2Values = new HashSet<>(Arrays.asList(allValuesStore2.getOrELse(key, new Integer[]{}))); + Set remappedValuesStore2 = store2Values.stream().map(value -> remappedIndexes[value]).collect(Collectors.toSet()); - Set mergedValues = Sets.union(store1Values, remappedValuesStore2); + Set mergedValues = Sets.union(store1Values, remappedValuesStore2); mergedInfoStoreValues.put(key, new ConcurrentSkipListSet<>(mergedValues)); } diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java index b0adc5eb..af466678 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java @@ -477,7 +477,7 @@ public void updateRecords(char[][] zygosityMaskStrings, ConcurrentHashMap { - infoStore.processRecord(Integer.toString(variantIndex), infoColumns); + infoStore.processRecord(variantIndex, infoColumns); }); } diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantIndexCache.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantIndexCache.java index 09b9dce5..a08ae2fd 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantIndexCache.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantIndexCache.java @@ -67,23 +67,21 @@ public VariantIndex load(String infoColumn_valueKey) throws IOException { log.debug("Calculating value for cache for key " + infoColumn_valueKey); long time = System.currentTimeMillis(); String[] column_and_value = infoColumn_valueKey.split(COLUMN_AND_KEY_DELIMITER); - String[] variantIndexStringArray = infoStores.get(column_and_value[0]).getAllValues().get(column_and_value[1]); + Integer[] variantIndexIntArray = infoStores.get(column_and_value[0]).getAllValues().get(column_and_value[1]); - if ((double)variantIndexStringArray.length / (double)variantIndex.length < MAX_SPARSE_INDEX_RATIO ) { + if ((double)variantIndexIntArray.length / (double)variantIndex.length < MAX_SPARSE_INDEX_RATIO ) { Set variantIds = new HashSet<>(); - for(String variantIndex : variantIndexStringArray) { - int variantIndexArrayIndex = Integer.parseInt(variantIndex); - variantIds.add(variantIndexArrayIndex); + for(Integer variantIndex : variantIndexIntArray) { + variantIds.add(variantIndex); } return new SparseVariantIndex(variantIds); } else { boolean[] variantIndexArray = new boolean[variantIndex.length]; int x = 0; - for(String variantIndex : variantIndexStringArray) { - int variantIndexArrayIndex = Integer.parseInt(variantIndex); + for(Integer variantIndex : variantIndexIntArray) { // todo: shouldn't this be greater than or equal to 0? 0 is a valid index - if (variantIndexArrayIndex > 0) { - variantIndexArray[variantIndexArrayIndex] = true; + if (variantIndex > 0) { + variantIndexArray[variantIndex] = true; } } log.debug("Cache value for key " + infoColumn_valueKey + " calculated in " + (System.currentTimeMillis() - time) + " ms"); diff --git a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessorTest.java b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessorTest.java index e22bea5e..9a9e63b4 100644 --- a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessorTest.java +++ b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AbstractProcessorTest.java @@ -42,12 +42,12 @@ public class AbstractProcessorTest { @Before public void setup() { FileBackedByteIndexedInfoStore mockInfoStore = mock(FileBackedByteIndexedInfoStore.class); - FileBackedByteIndexedStorage mockIndexedStorage = mock(FileBackedByteIndexedStorage.class); + FileBackedByteIndexedStorage mockIndexedStorage = mock(FileBackedByteIndexedStorage.class); when(mockIndexedStorage.keys()).thenReturn(new HashSet<>(EXAMPLE_GENES_WITH_VARIANT)); when(mockInfoStore.getAllValues()).thenReturn(mockIndexedStorage); FileBackedByteIndexedInfoStore mockInfoStore2 = mock(FileBackedByteIndexedInfoStore.class); - FileBackedByteIndexedStorage mockIndexedStorage2 = mock(FileBackedByteIndexedStorage.class); + FileBackedByteIndexedStorage mockIndexedStorage2 = mock(FileBackedByteIndexedStorage.class); when(mockIndexedStorage2.keys()).thenReturn(new HashSet<>(EXAMPLE_VARIANT_SEVERITIES)); when(mockInfoStore2.getAllValues()).thenReturn(mockIndexedStorage2); From f508062a8c05d724ba91e9b04718a7e7bcee97e9 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 5 Jul 2023 10:34:08 -0400 Subject: [PATCH 15/39] ALS-4461: Refactor duplicated variant store read/write code --- .../hpds/data/genotype/VariantStore.java | 15 +++- .../data/genotype/util/RemapPatientIds.java | 5 +- .../etl/genotype/GenomicDatasetMerger.java | 15 +--- .../hpds/etl/genotype/MultialleleCounter.java | 70 +++++++++---------- .../hpds/etl/genotype/NewVCFLoader.java | 6 +- .../hpds/etl/genotype/VariantCounter.java | 42 +++++------ .../genotype/BucketIndexBySampleTest.java | 8 +-- .../hpds/processing/VariantService.java | 2 +- 8 files changed, 74 insertions(+), 89 deletions(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java index 5a902caf..cfd2e098 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java @@ -12,6 +12,7 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.stream.Collectors; import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; public class VariantStore implements Serializable { private static final long serialVersionUID = -6970128712587609414L; @@ -37,7 +38,7 @@ public void setVariantMaskStorage(Map listVariants() { ArrayList allVariants = new ArrayList<>(); for (String key : variantMaskStorage.keySet()) { diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/util/RemapPatientIds.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/util/RemapPatientIds.java index cab26774..0bd73458 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/util/RemapPatientIds.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/util/RemapPatientIds.java @@ -51,10 +51,7 @@ public class RemapPatientIds { private static final int TEXT_VALUE = 3; public static void main(String[] args) throws ClassNotFoundException, FileNotFoundException, IOException { - - ObjectInputStream objectInputStream = new ObjectInputStream(new GZIPInputStream(new FileInputStream("/opt/local/hpds/all/variantStore.javabin"))); - VariantStore variantStore = (VariantStore) objectInputStream.readObject(); - objectInputStream.close(); + VariantStore variantStore = VariantStore.readInstance("/opt/local/hpds/all/"); String[] oldPatientIds = variantStore.getPatientIds(); String[] newPatientIds = new String[oldPatientIds.length]; diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index 55335174..7b8bde0e 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -33,8 +33,8 @@ public class GenomicDatasetMerger { public GenomicDatasetMerger(String genomicDirectory1, String genomicDirectory2, String outputDirectory) throws IOException, ClassNotFoundException, InterruptedException { this.genomicDirectory1 = genomicDirectory1; this.genomicDirectory2 = genomicDirectory2; - this.variantStore1 = VariantStore.deserializeInstance(genomicDirectory1); - this.variantStore2 = VariantStore.deserializeInstance(genomicDirectory2); + this.variantStore1 = VariantStore.readInstance(genomicDirectory1); + this.variantStore2 = VariantStore.readInstance(genomicDirectory2); validate(); this.outputDirectory = outputDirectory; @@ -71,16 +71,7 @@ public void mergeVariantStore(Map mergeVariantIndexes() throws IOException { diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/MultialleleCounter.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/MultialleleCounter.java index 13575e33..5f30fa38 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/MultialleleCounter.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/MultialleleCounter.java @@ -16,42 +16,38 @@ public class MultialleleCounter { - public static void main(String[] args) throws ClassNotFoundException, FileNotFoundException, IOException { - try(FileInputStream fis = new FileInputStream("/opt/local/hpds/all/variantStore.javabin"); - ){ - VariantStore variantStore = (VariantStore) new ObjectInputStream(new GZIPInputStream(fis)).readObject(); - variantStore.open(); - for(String contig : variantStore.getVariantMaskStorage().keySet()) { - System.out.println("Starting contig : " + contig); - FileBackedByteIndexedStorage> - currentChromosome = variantStore.getVariantMaskStorage().get(contig); - currentChromosome.keys().parallelStream().forEach((offsetBucket)->{ - System.out.println("Starting bucket : " + offsetBucket); - ConcurrentHashMap maskMap; - try { - maskMap = currentChromosome.get(offsetBucket); + public static void main(String[] args) throws ClassNotFoundException, IOException { + VariantStore variantStore = VariantStore.readInstance("/opt/local/hpds/all/"); + for (String contig : variantStore.getVariantMaskStorage().keySet()) { + System.out.println("Starting contig : " + contig); + FileBackedByteIndexedStorage> + currentChromosome = variantStore.getVariantMaskStorage().get(contig); + currentChromosome.keys().parallelStream().forEach((offsetBucket) -> { + System.out.println("Starting bucket : " + offsetBucket); + ConcurrentHashMap maskMap; + try { + maskMap = currentChromosome.get(offsetBucket); - TreeSet variantsSortedByOffset = new TreeSet(); - for(String variant : maskMap.keySet()) { - variantsSortedByOffset.add(new VariantSpec(variant)); - } - ArrayList variantsSortedByOffsetList = new ArrayList(variantsSortedByOffset); - for(int y = 1; y variantsSortedByOffset = new TreeSet(); + for (String variant : maskMap.keySet()) { + variantsSortedByOffset.add(new VariantSpec(variant)); + } + ArrayList variantsSortedByOffsetList = new ArrayList(variantsSortedByOffset); + for (int y = 1; y < variantsSortedByOffsetList.size(); y++) { + if (variantsSortedByOffsetList.get(y).metadata.offset.equals(variantsSortedByOffsetList.get(y - 1).metadata.offset)) { + try { + System.out.println("Matching offsets : " + variantsSortedByOffsetList.get(y - 1).specNotation() + " : " + variantsSortedByOffsetList.get(y).specNotation() + ":" + maskMap.get(variantsSortedByOffsetList.get(y - 1).specNotation()).heterozygousMask.toString(2) + ":" + ":" + maskMap.get(variantsSortedByOffsetList.get(y).specNotation()).heterozygousMask.toString(2)); + } catch (NullPointerException e) { + System.out.println("Matching offsets : " + variantsSortedByOffsetList.get(y - 1).specNotation() + " : " + variantsSortedByOffsetList.get(y).specNotation()); + } + } + } + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + System.out.println("Completed bucket : " + offsetBucket); + }); + } + } } diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java index af466678..c24986bb 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java @@ -325,11 +325,7 @@ private static void saveVariantStore(VariantStore store, if (storage != null) storage.complete(); } - try (FileOutputStream fos = new FileOutputStream(new File(storageDir, "variantStore.javabin")); - GZIPOutputStream gzos = new GZIPOutputStream(fos); - ObjectOutputStream oos = new ObjectOutputStream(gzos);) { - oos.writeObject(store); - } + store.writeInstance(storageDirStr); logger.debug("Done saving variant masks."); } diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VariantCounter.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VariantCounter.java index 7e14ab4c..57d318bb 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VariantCounter.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VariantCounter.java @@ -16,30 +16,26 @@ public class VariantCounter { - public static void main(String[] args) throws ClassNotFoundException, FileNotFoundException, IOException { - try(FileInputStream fis = new FileInputStream("/opt/local/hpds/all/variantStore.javabin"); - ){ - VariantStore variantStore = (VariantStore) new ObjectInputStream(new GZIPInputStream(fis)).readObject(); - variantStore.open(); - for(String contig : variantStore.getVariantMaskStorage().keySet()) { - int[] countOfVariants = {0}; - FileBackedByteIndexedStorage> - currentChromosome = variantStore.getVariantMaskStorage().get(contig); - currentChromosome.keys().parallelStream().forEach((offsetBucket)->{ - ConcurrentHashMap maskMap; - try { - maskMap = currentChromosome.get(offsetBucket); - if(maskMap!=null) { - countOfVariants[0]+=maskMap.size(); - } - - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); + public static void main(String[] args) throws ClassNotFoundException, IOException { + VariantStore variantStore = VariantStore.readInstance("/opt/local/hpds/all/"); + for(String contig : variantStore.getVariantMaskStorage().keySet()) { + int[] countOfVariants = {0}; + FileBackedByteIndexedStorage> + currentChromosome = variantStore.getVariantMaskStorage().get(contig); + currentChromosome.keys().parallelStream().forEach((offsetBucket)->{ + ConcurrentHashMap maskMap; + try { + maskMap = currentChromosome.get(offsetBucket); + if(maskMap!=null) { + countOfVariants[0]+=maskMap.size(); } - }); - System.out.println(contig + "," + countOfVariants[0]); - } + + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + }); + System.out.println(contig + "," + countOfVariants[0]); } } } diff --git a/etl/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySampleTest.java b/etl/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySampleTest.java index 69fbfae3..b7474ef9 100644 --- a/etl/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySampleTest.java +++ b/etl/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySampleTest.java @@ -75,12 +75,8 @@ public class BucketIndexBySampleTest { public static void initializeBinfile() throws Exception { //load variant data NewVCFLoader.main(new String[] {VCF_INDEX_FILE, STORAGE_DIR, MERGED_DIR}); - - //read in variantStore object created by VCFLoader - ObjectInputStream ois = new ObjectInputStream(new GZIPInputStream(new FileInputStream(STORAGE_DIR + "variantStore.javabin"))); - variantStore = (VariantStore) ois.readObject(); - ois.close(); - variantStore.open(); + + VariantStore variantStore = VariantStore.readInstance(STORAGE_DIR); //now use that object to initialize the BucketIndexBySample object bucketIndexBySample = new BucketIndexBySample(variantStore, STORAGE_DIR); diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java index c3c6efa9..1d50a2d4 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java @@ -60,7 +60,7 @@ public VariantService() throws IOException, ClassNotFoundException, InterruptedE VARIANT_INDEX_FBBIS_FILE = genomicDataDirectory + "variantIndex_fbbis.javabin"; BUCKET_INDEX_BY_SAMPLE_FILE = genomicDataDirectory + "BucketIndexBySample.javabin"; - variantStore = VariantStore.deserializeInstance(genomicDataDirectory); + variantStore = VariantStore.readInstance(genomicDataDirectory); try { loadGenomicCacheFiles(); } catch (Exception e) { From 8da85d0d50ae92c6e3be6f0bae8933d3c3c7c267 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 5 Jul 2023 10:55:49 -0400 Subject: [PATCH 16/39] ALS-4461: Refactor duplicated variant store read/write code --- .../hpds/data/genotype/VariantStore.java | 23 +++++++------------ .../hpds/processing/VariantService.java | 14 ++++++++++- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java index cfd2e098..cbcdeb6c 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java @@ -39,21 +39,14 @@ public void setVariantMaskStorage(Map { - store.updateStorageDirectory(new File(genomicDataDirectory)); - }); - variantStore.open(); - return variantStore; - } else { - //we still need an object to reference when checking the variant store, even if it's empty. - VariantStore variantStore = new VariantStore(); - variantStore.setPatientIds(new String[0]); - return variantStore; - } + ObjectInputStream ois = new ObjectInputStream(new GZIPInputStream(new FileInputStream(genomicDataDirectory + "variantStore.javabin"))); + VariantStore variantStore = (VariantStore) ois.readObject(); + ois.close(); + variantStore.getVariantMaskStorage().values().forEach(store -> { + store.updateStorageDirectory(new File(genomicDataDirectory)); + }); + variantStore.open(); + return variantStore; } public void writeInstance(String genomicDirectory) { diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java index 1d50a2d4..31731c67 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java @@ -60,7 +60,7 @@ public VariantService() throws IOException, ClassNotFoundException, InterruptedE VARIANT_INDEX_FBBIS_FILE = genomicDataDirectory + "variantIndex_fbbis.javabin"; BUCKET_INDEX_BY_SAMPLE_FILE = genomicDataDirectory + "BucketIndexBySample.javabin"; - variantStore = VariantStore.readInstance(genomicDataDirectory); + variantStore = loadVariantStore(); try { loadGenomicCacheFiles(); } catch (Exception e) { @@ -68,6 +68,18 @@ public VariantService() throws IOException, ClassNotFoundException, InterruptedE } } + private VariantStore loadVariantStore() { + VariantStore variantStore; + try { + variantStore = VariantStore.readInstance(genomicDataDirectory); + } catch (Exception e) { + variantStore = new VariantStore(); + variantStore.setPatientIds(new String[0]); + log.warn("Unable to load variant store"); + } + return variantStore; + } + public String[] loadVariantIndex() { //skip if we have no variants if(variantStore.getPatientIds().length == 0) { From 295f52e39e36798624b88845e07671b162e9458e Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Fri, 7 Jul 2023 10:40:54 -0400 Subject: [PATCH 17/39] ALS-4461: Fixing thread issues at startup --- .../avillach/hpds/data/genotype/BucketIndexBySample.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java index 5aa5a0c7..97d7a0e0 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java @@ -142,7 +142,7 @@ public void run() { } } }).start(); - + patientIds.parallelStream().forEach((patientId)->{ try { BigInteger patientMask = new BigInteger(new String(patientBucketCharMasks[patientIds.indexOf(patientId)]),2); @@ -153,6 +153,10 @@ public void run() { e.printStackTrace(); } processedPatients[0] += 1; + int processedPatientsCount = processedPatients[0]; + if (processedPatientsCount % 1000 == 0) { + log.info("wrote " + processedPatientsCount + " patient bucket masks"); + } }); patientBucketMasks.complete(); log.info("Done creating patient bucket masks"); From 36904a89a9eb7dcb90f0a199f6bae25985e4e9a5 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Fri, 7 Jul 2023 10:56:03 -0400 Subject: [PATCH 18/39] ALS-4461: Fixing thread issues at startup --- .../hpds/data/genotype/BucketIndexBySample.java | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java index 97d7a0e0..55466ec3 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java @@ -127,21 +127,6 @@ public BucketIndexBySample(VariantStore variantStore, String storageDir) throws //the process to write out the bucket masks takes a very long time. //Lets spin up another thread that occasionally logs progress int[] processedPatients = new int[1]; - processedPatients[0] = 0; - new Thread(new Runnable() { - @Override - public void run() { - log.info("writing patient bucket masks to backing store (this may take some time)."); - while(!patientBucketMasks.isComplete()) { - try { - Thread.sleep(5 * 1000 * 60); //log a message every 5 minutes - } catch (InterruptedException e) { - e.printStackTrace(); - } - log.info("wrote " + processedPatients[0] + " patient bucket masks"); - } - } - }).start(); patientIds.parallelStream().forEach((patientId)->{ try { From 9e22c1091a9b7ac1e775f7a38da9aea104b0d188 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Mon, 24 Jul 2023 12:19:12 -0400 Subject: [PATCH 19/39] Testing GET/POST bug --- .../harvard/hms/dbmi/avillach/hpds/service/PicSureService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/PicSureService.java b/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/PicSureService.java index 58b81399..20c247ea 100644 --- a/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/PicSureService.java +++ b/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/PicSureService.java @@ -286,7 +286,7 @@ public Response querySync(QueryRequest resultRequest) { } } - @GET + @POST @Path("/search/values/") @Override public PaginatedSearchResult searchGenomicConceptValues( From 747b0a19d8305481e2cc83a5d7d1dc1a147f17b6 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 26 Jul 2023 14:11:19 -0400 Subject: [PATCH 20/39] Revert --- .../harvard/hms/dbmi/avillach/hpds/service/PicSureService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/PicSureService.java b/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/PicSureService.java index 20c247ea..58b81399 100644 --- a/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/PicSureService.java +++ b/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/PicSureService.java @@ -286,7 +286,7 @@ public Response querySync(QueryRequest resultRequest) { } } - @POST + @GET @Path("/search/values/") @Override public PaginatedSearchResult searchGenomicConceptValues( From c7afde53c9d9ae801fca1cf321417401b626b2ea Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 26 Jul 2023 14:13:49 -0400 Subject: [PATCH 21/39] Revert --- .../harvard/hms/dbmi/avillach/hpds/service/PicSureService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/PicSureService.java b/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/PicSureService.java index 58b81399..20c247ea 100644 --- a/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/PicSureService.java +++ b/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/PicSureService.java @@ -286,7 +286,7 @@ public Response querySync(QueryRequest resultRequest) { } } - @GET + @POST @Path("/search/values/") @Override public PaginatedSearchResult searchGenomicConceptValues( From 5e56f09bb25f603e724e01ae3723b830e0211910 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Tue, 8 Aug 2023 09:35:55 -0400 Subject: [PATCH 22/39] ALS-4461: Fix error handling --- .../avillach/hpds/etl/genotype/NewVCFLoader.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java index c24986bb..4fa0a4c9 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java @@ -158,7 +158,7 @@ private static void loadVCFs(File indexFile) throws IOException { try { walker.nextLine(); } catch (IOException e) { - logger.error("Error reading nextline of VCF file [" + walker.vcfIndexLine.vcfPath + "]", e); + throw new UncheckedIOException(e); } }); zygosityMaskStrings.put(currentSpecNotation, maskStringsForVariantSpec[0]); @@ -206,7 +206,7 @@ private static void loadVCFs(File indexFile) throws IOException { logger.debug(variantSpec + " : homozygous : " + homoIdList); } } catch (IOException e) { - logger.error("an error occurred", e); + throw new UncheckedIOException(e); } } if (count[0] > 50) @@ -305,7 +305,7 @@ private static void flipChunk(String lastContigProcessed, int lastChunkProcessed variantMaskStorage_f.get(lastContigProcessed_f).put(lastChunkProcessed_f, convertLoadingMapToMaskMap(zygosityMaskStrings_f)); } catch (IOException e) { - logger.error("an error occurred", e); + throw new UncheckedIOException(e); } }); if (lastChunkProcessed % 100 == 0) { @@ -344,8 +344,8 @@ public static void splitInfoStoresByColumn() throws FileNotFoundException, IOExc logger.debug("Splitting" + (System.currentTimeMillis() - startTime) + " seconds"); try { VCFPerPatientInfoStoreSplitter.splitAll(storageDir, new File(mergedDirStr)); - } catch (ClassNotFoundException | InterruptedException | ExecutionException e) { - logger.error("Error splitting infostore's by column", e); + } catch (ClassNotFoundException | ExecutionException | InterruptedException e) { + throw new RuntimeException(e); } logger.debug("Split" + (System.currentTimeMillis() - startTime) + " seconds"); } @@ -354,8 +354,8 @@ public static void convertInfoStoresToByteIndexed() throws FileNotFoundException logger.debug("Converting" + (System.currentTimeMillis() - startTime) + " seconds"); try { VCFPerPatientInfoStoreToFBBIISConverter.convertAll(mergedDirStr, storageDirStr); - } catch (ClassNotFoundException | InterruptedException | ExecutionException e) { - logger.error("Error converting infostore to byteindexed", e); + } catch (ClassNotFoundException | ExecutionException | InterruptedException e) { + throw new RuntimeException(e); } logger.debug("Converted " + ((System.currentTimeMillis() - startTime) / 1000) + " seconds"); } @@ -461,7 +461,7 @@ public void updateRecords(char[][] zygosityMaskStrings, ConcurrentHashMap Date: Wed, 9 Aug 2023 07:57:12 -0400 Subject: [PATCH 23/39] ALS-4461: Rollback testing change --- .../harvard/hms/dbmi/avillach/hpds/service/PicSureService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/PicSureService.java b/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/PicSureService.java index 20c247ea..58b81399 100644 --- a/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/PicSureService.java +++ b/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/PicSureService.java @@ -286,7 +286,7 @@ public Response querySync(QueryRequest resultRequest) { } } - @POST + @GET @Path("/search/values/") @Override public PaginatedSearchResult searchGenomicConceptValues( From 15fe3bf65f278fda8495a5a8ae6a82eb173610fd Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 9 Aug 2023 08:44:56 -0400 Subject: [PATCH 24/39] ALS-4461: Clean up error handling in file backed storages --- .../storage/FileBackedByteIndexedStorage.java | 2 +- .../storage/FileBackedJavaIndexedStorage.java | 53 ++++++++++--------- .../storage/FileBackedJsonIndexStorage.java | 44 ++++++++------- 3 files changed, 49 insertions(+), 50 deletions(-) diff --git a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java index eed6286f..b01cf429 100644 --- a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java +++ b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java @@ -58,7 +58,7 @@ public void complete() { public boolean isComplete() { return this.completed; } - public abstract V get(K key) throws IOException; + public abstract V get(K key); public V getOrELse(K key, V defaultValue) throws IOException { V result = get(key); diff --git a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJavaIndexedStorage.java b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJavaIndexedStorage.java index afedbbfa..a1b67b97 100644 --- a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJavaIndexedStorage.java +++ b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJavaIndexedStorage.java @@ -38,37 +38,38 @@ private Long[] store(V value) throws IOException { return recordIndex; } - public V get(K key) throws IOException { - if(this.storage==null) { - synchronized(this) { - this.open(); - } - } - Long[] offsetsInStorage = index.get(key); - if(offsetsInStorage != null) { - Long offsetInStorage = index.get(key)[0]; - int offsetLength = index.get(key)[1].intValue(); - if(offsetInStorage != null && offsetLength>0) { - byte[] buffer = new byte[offsetLength]; - synchronized(storage) { - storage.seek(offsetInStorage); - storage.readFully(buffer); + public V get(K key) { + try { + if(this.storage==null) { + synchronized(this) { + this.open(); } - ObjectInputStream in = new ObjectInputStream(new GZIPInputStream(new ByteArrayInputStream(buffer))); + } + Long[] offsetsInStorage = index.get(key); + if(offsetsInStorage != null) { + Long offsetInStorage = index.get(key)[0]; + int offsetLength = index.get(key)[1].intValue(); + if(offsetInStorage != null && offsetLength>0) { + byte[] buffer = new byte[offsetLength]; + synchronized(storage) { + storage.seek(offsetInStorage); + storage.readFully(buffer); + } + ObjectInputStream in = new ObjectInputStream(new GZIPInputStream(new ByteArrayInputStream(buffer))); - try { - V readObject = (V) in.readObject(); - return readObject; - } catch (ClassNotFoundException e) { - throw new RuntimeException("This should never happen."); - } finally { - in.close(); + try { + V readObject = (V) in.readObject(); + return readObject; + } finally { + in.close(); + } } - }else { - return null; } - } else { return null; + } catch (IOException e) { + throw new UncheckedIOException(e) + } catch (ClassNotFoundException e) { + throw new RuntimeException(e); } } } diff --git a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJsonIndexStorage.java b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJsonIndexStorage.java index c4d751a1..87997767 100644 --- a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJsonIndexStorage.java +++ b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJsonIndexStorage.java @@ -40,35 +40,33 @@ private Long[] store(V value) throws IOException { return recordIndex; } - public V get(K key) throws IOException { - if(this.storage==null) { - synchronized(this) { - this.open(); - } - } - Long[] offsetsInStorage = index.get(key); - if(offsetsInStorage != null) { - Long offsetInStorage = index.get(key)[0]; - int offsetLength = index.get(key)[1].intValue(); - if(offsetInStorage != null && offsetLength>0) { - byte[] buffer = new byte[offsetLength]; - synchronized(storage) { - storage.seek(offsetInStorage); - storage.readFully(buffer); + public V get(K key) { + try { + if(this.storage==null) { + synchronized(this) { + this.open(); } - try { + } + Long[] offsetsInStorage = index.get(key); + if(offsetsInStorage != null) { + Long offsetInStorage = index.get(key)[0]; + int offsetLength = index.get(key)[1].intValue(); + if(offsetInStorage != null && offsetLength>0) { + byte[] buffer = new byte[offsetLength]; + synchronized(storage) { + storage.seek(offsetInStorage); + storage.readFully(buffer); + } V readObject = readObject(buffer); return readObject; - } catch (Exception e) { - System.out.println("Unable to deserialize, " + e.getMessage()); - System.out.println(new String(buffer)); - throw new RuntimeException(e); + }else { + return null; } - }else { + } else { return null; } - } else { - return null; + } catch (IOException e) { + throw new UncheckedIOException(e); } } From 52cc4a04111fa862748e127b8ef3a7226abdace5 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 9 Aug 2023 08:45:45 -0400 Subject: [PATCH 25/39] ALS-4461: Clean up error handling in file backed storages --- .../avillach/hpds/storage/FileBackedJavaIndexedStorage.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJavaIndexedStorage.java b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJavaIndexedStorage.java index a1b67b97..fced78e4 100644 --- a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJavaIndexedStorage.java +++ b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJavaIndexedStorage.java @@ -67,7 +67,7 @@ public V get(K key) { } return null; } catch (IOException e) { - throw new UncheckedIOException(e) + throw new UncheckedIOException(e); } catch (ClassNotFoundException e) { throw new RuntimeException(e); } From 88ea1c28a412cdc2ecd72961a301197dc98d7c61 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 9 Aug 2023 10:24:47 -0400 Subject: [PATCH 26/39] ALS-4461: Remove IOExceptions thrown from FBBIS --- .../storage/FileBackedByteIndexedStorage.java | 53 ++++++++++++++++- .../storage/FileBackedJavaIndexedStorage.java | 55 +++--------------- .../storage/FileBackedJsonIndexStorage.java | 56 +----------------- .../data/genotype/BucketIndexBySample.java | 52 +++++------------ .../data/genotype/VariantMetadataIndex.java | 3 +- .../hpds/data/genotype/VariantStore.java | 38 ++++++------ .../etl/genotype/GenomicDatasetMerger.java | 12 ++-- .../hpds/etl/genotype/MultialleleCounter.java | 31 +++++----- .../hpds/etl/genotype/NewVCFLoader.java | 58 ++++++++----------- .../hpds/etl/genotype/VariantCounter.java | 21 ++----- .../hpds/processing/VariantService.java | 17 ++---- 11 files changed, 146 insertions(+), 250 deletions(-) diff --git a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java index b01cf429..b3307f29 100644 --- a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java +++ b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java @@ -1,5 +1,7 @@ package edu.harvard.hms.dbmi.avillach.hpds.storage; +import org.apache.commons.io.output.ByteArrayOutputStream; + import java.io.*; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; @@ -32,7 +34,22 @@ public Set keys(){ return index.keySet(); } - public abstract void put(K key, V value) throws IOException; + public void put(K key, V value) throws IOException { + if(completed) { + throw new RuntimeException("A completed FileBackedByteIndexedStorage cannot be modified."); + } + Long[] recordIndex; + try (ByteArrayOutputStream out = writeObject(value)) { + recordIndex = new Long[2]; + synchronized (storage) { + storage.seek(storage.length()); + recordIndex[0] = storage.getFilePointer(); + storage.write(out.toByteArray()); + recordIndex[1] = storage.getFilePointer() - recordIndex[0]; + } + } + index.put(key, recordIndex); + } public void load(Iterable values, Function mapper) throws IOException { //make sure we start fresh @@ -58,7 +75,39 @@ public void complete() { public boolean isComplete() { return this.completed; } - public abstract V get(K key); + public V get(K key) { + try { + if(this.storage==null) { + synchronized(this) { + this.open(); + } + } + Long[] offsetsInStorage = index.get(key); + if(offsetsInStorage != null) { + Long offsetInStorage = index.get(key)[0]; + int offsetLength = index.get(key)[1].intValue(); + if(offsetInStorage != null && offsetLength>0) { + byte[] buffer = new byte[offsetLength]; + synchronized(storage) { + storage.seek(offsetInStorage); + storage.readFully(buffer); + } + V readObject = readObject(buffer); + return readObject; + }else { + return null; + } + } else { + return null; + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + protected abstract V readObject(byte[] buffer); + + protected abstract ByteArrayOutputStream writeObject(V value) throws IOException; public V getOrELse(K key, V defaultValue) throws IOException { V result = get(key); diff --git a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJavaIndexedStorage.java b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJavaIndexedStorage.java index fced78e4..50bf375e 100644 --- a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJavaIndexedStorage.java +++ b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJavaIndexedStorage.java @@ -11,61 +11,20 @@ public FileBackedJavaIndexedStorage(Class keyClass, Class valueClass, File super(keyClass, valueClass, storageFile); } - public void put(K key, V value) throws IOException { - if(completed) { - throw new RuntimeException("A completed FileBackedByteIndexedStorage cannot be modified."); - } - Long[] recordIndex = store(value); - index.put(key, recordIndex); - } - - private Long[] store(V value) throws IOException { - + protected ByteArrayOutputStream writeObject(V value) throws IOException { ByteArrayOutputStream out = new ByteArrayOutputStream(); ObjectOutputStream oos = new ObjectOutputStream(new GZIPOutputStream(out)); oos.writeObject(value); oos.flush(); oos.close(); - - Long[] recordIndex = new Long[2]; - synchronized(storage) { - storage.seek(storage.length()); - recordIndex[0] = storage.getFilePointer(); - storage.write(out.toByteArray()); - recordIndex[1] = storage.getFilePointer() - recordIndex[0]; -// maxStorageSize = storage.getFilePointer(); - } - return recordIndex; + return out; } - public V get(K key) { - try { - if(this.storage==null) { - synchronized(this) { - this.open(); - } - } - Long[] offsetsInStorage = index.get(key); - if(offsetsInStorage != null) { - Long offsetInStorage = index.get(key)[0]; - int offsetLength = index.get(key)[1].intValue(); - if(offsetInStorage != null && offsetLength>0) { - byte[] buffer = new byte[offsetLength]; - synchronized(storage) { - storage.seek(offsetInStorage); - storage.readFully(buffer); - } - ObjectInputStream in = new ObjectInputStream(new GZIPInputStream(new ByteArrayInputStream(buffer))); - - try { - V readObject = (V) in.readObject(); - return readObject; - } finally { - in.close(); - } - } - } - return null; + @Override + protected V readObject(byte[] buffer) { + try (ObjectInputStream in = new ObjectInputStream(new GZIPInputStream(new ByteArrayInputStream(buffer)));) { + V readObject = (V) in.readObject(); + return readObject; } catch (IOException e) { throw new UncheckedIOException(e); } catch (ClassNotFoundException e) { diff --git a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJsonIndexStorage.java b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJsonIndexStorage.java index 87997767..96f87337 100644 --- a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJsonIndexStorage.java +++ b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJsonIndexStorage.java @@ -17,57 +17,10 @@ public FileBackedJsonIndexStorage(File storageFile) throws FileNotFoundException super(null, null, storageFile); } - public void put(K key, V value) throws IOException { - if(completed) { - throw new RuntimeException("A completed FileBackedByteIndexedStorage cannot be modified."); - } - Long[] recordIndex = store(value); - index.put(key, recordIndex); - } - - private Long[] store(V value) throws IOException { + protected ByteArrayOutputStream writeObject(V value) throws IOException { ByteArrayOutputStream out = new ByteArrayOutputStream(); objectMapper.writeValue(new GZIPOutputStream(out), value); - - Long[] recordIndex = new Long[2]; - synchronized(storage) { - storage.seek(storage.length()); - recordIndex[0] = storage.getFilePointer(); - storage.write(out.toByteArray()); - recordIndex[1] = storage.getFilePointer() - recordIndex[0]; -// maxStorageSize = storage.getFilePointer(); - } - return recordIndex; - } - - public V get(K key) { - try { - if(this.storage==null) { - synchronized(this) { - this.open(); - } - } - Long[] offsetsInStorage = index.get(key); - if(offsetsInStorage != null) { - Long offsetInStorage = index.get(key)[0]; - int offsetLength = index.get(key)[1].intValue(); - if(offsetInStorage != null && offsetLength>0) { - byte[] buffer = new byte[offsetLength]; - synchronized(storage) { - storage.seek(offsetInStorage); - storage.readFully(buffer); - } - V readObject = readObject(buffer); - return readObject; - }else { - return null; - } - } else { - return null; - } - } catch (IOException e) { - throw new UncheckedIOException(e); - } + return out; } protected V readObject(byte[] buffer) { @@ -79,9 +32,4 @@ protected V readObject(byte[] buffer) { } public abstract TypeReference getTypeReference(); - - private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException { - in.defaultReadObject(); - objectMapper = new ObjectMapper(); - } } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java index e8fc14c1..3c39e5b3 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java @@ -87,22 +87,18 @@ public BucketIndexBySample(VariantStore variantStore, String storageDir) throws // Create a bitmask with 1 values for each patient who has any variant in this bucket BigInteger[] patientMaskForBucket = {variantStore.emptyBitmask()}; - try { - contigStore.get(bucket).values().forEach((VariantMasks masks)->{ - if(masks.heterozygousMask!=null) { - patientMaskForBucket[0] = patientMaskForBucket[0].or(masks.heterozygousMask); - } - //add hetreo no call bits to mask - if(masks.heterozygousNoCallMask!=null) { - patientMaskForBucket[0] = patientMaskForBucket[0].or(masks.heterozygousNoCallMask); - } - if(masks.homozygousMask!=null) { - patientMaskForBucket[0] = patientMaskForBucket[0].or(masks.homozygousMask); - } - }); - } catch (IOException e) { - throw new UncheckedIOException(e); - } + contigStore.get(bucket).values().forEach((VariantMasks masks)->{ + if(masks.heterozygousMask!=null) { + patientMaskForBucket[0] = patientMaskForBucket[0].or(masks.heterozygousMask); + } + //add hetreo no call bits to mask + if(masks.heterozygousNoCallMask!=null) { + patientMaskForBucket[0] = patientMaskForBucket[0].or(masks.heterozygousNoCallMask); + } + if(masks.homozygousMask!=null) { + patientMaskForBucket[0] = patientMaskForBucket[0].or(masks.homozygousMask); + } + }); // For each patient set the patientBucketCharMask entry to 0 or 1 if they have a variant in the bucket. int maxIndex = patientMaskForBucket[0].bitLength() - 1; @@ -161,14 +157,9 @@ public Collection filterVariantSetForPatientSet(Set variantSet, new BigInteger(new String(emptyBucketMaskChar()),2) : patientBucketMasks.get(patientSet.get(0)); BigInteger _defaultMask = patientBucketMask; - List patientBucketmasksForSet = patientSet.parallelStream().map((patientNum)->{ - try { - return patientBucketMasks.get(patientNum); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - //return _defaultMask; - }).collect(Collectors.toList()); + List patientBucketmasksForSet = patientSet.parallelStream() + .map((patientNum)-> patientBucketMasks.get(patientNum)) + .collect(Collectors.toList()); for(BigInteger patientMask : patientBucketmasksForSet) { patientBucketMask = patientMask.or(patientBucketMask); } @@ -205,17 +196,4 @@ private char[] emptyBucketMaskChar() { } return _emptyBucketMaskChar.clone(); } - - /** - * Use while debugging - */ - public void printPatientMasks() { - for(Integer patientID : patientBucketMasks.keys()) { - try { - log.info("BucketMask length for " + patientID + ":\t" + patientBucketMasks.get(patientID).toString(2).length()); - } catch (IOException e) { - log.error("FBBIS Error: ", e); - } - } - } } \ No newline at end of file diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndex.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndex.java index 9ea75380..1f6cdf0f 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndex.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndex.java @@ -92,7 +92,7 @@ public String[] findBySingleVariantSpec(String variantSpec, VariantBucketHolder< log.warn("No bucket found for spec " + variantSpec + " in bucket " + chrOffset); return new String[0]; - } catch (IOException e) { + } catch (UncheckedIOException e) { log.warn("IOException caught looking up variantSpec : " + variantSpec, e); return new String[0]; } @@ -113,7 +113,6 @@ public Map findByMultipleVariantSpec(Collection varien * have to write them to disk once. The data will be written to disk only when the flush() method is called. * * @param variantSpec - * @param array * @throws IOException */ public void put(String variantSpec, String metaData ) throws IOException { diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java index ae0100cd..fff09279 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java @@ -70,27 +70,23 @@ public Map countVariants() { .get(contig); storage.keys().stream().forEach((Integer key) -> { int[] contigCounts = counts.get(contig); - try { - Collection values = storage.get(key).values(); - contigCounts[0] += values.stream().collect(Collectors.summingInt((VariantMasks masks) -> { - return masks.heterozygousMask != null ? 1 : 0; - })); - contigCounts[1] += values.stream().collect(Collectors.summingInt((VariantMasks masks) -> { - return masks.homozygousMask != null ? 1 : 0; - })); - contigCounts[2] += values.stream().collect(Collectors.summingInt((VariantMasks masks) -> { - return masks.heterozygousNoCallMask != null ? 1 : 0; - })); - contigCounts[3] += values.stream().collect(Collectors.summingInt((VariantMasks masks) -> { - return masks.homozygousNoCallMask != null ? 1 : 0; - })); - contigCounts[4] += values.stream().collect(Collectors.summingInt((VariantMasks masks) -> { - return masks.heterozygousMask != null || masks.homozygousMask != null - || masks.heterozygousNoCallMask != null || masks.homozygousNoCallMask != null ? 1 : 0; - })); - } catch (IOException e) { - throw new UncheckedIOException(e); - } + Collection values = storage.get(key).values(); + contigCounts[0] += values.stream().collect(Collectors.summingInt((VariantMasks masks) -> { + return masks.heterozygousMask != null ? 1 : 0; + })); + contigCounts[1] += values.stream().collect(Collectors.summingInt((VariantMasks masks) -> { + return masks.homozygousMask != null ? 1 : 0; + })); + contigCounts[2] += values.stream().collect(Collectors.summingInt((VariantMasks masks) -> { + return masks.heterozygousNoCallMask != null ? 1 : 0; + })); + contigCounts[3] += values.stream().collect(Collectors.summingInt((VariantMasks masks) -> { + return masks.homozygousNoCallMask != null ? 1 : 0; + })); + contigCounts[4] += values.stream().collect(Collectors.summingInt((VariantMasks masks) -> { + return masks.heterozygousMask != null || masks.homozygousMask != null + || masks.heterozygousNoCallMask != null || masks.homozygousNoCallMask != null ? 1 : 0; + })); }); } return counts; diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index 7b8bde0e..ec9b1bc1 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -224,15 +224,11 @@ public FileBackedJsonIndexStorage mergedMasks = new ConcurrentHashMap<>(); variantMaskStorage2.keys().forEach(key -> { - try { - Map masks2 = variantMaskStorage2.get(key); - for (Map.Entry entry : masks2.entrySet()) { - if (!mergedMasks.containsKey(entry.getKey())) { - mergedMasks.put(entry.getKey(), append(new VariantMasks(), entry.getValue())); - } + Map masks2 = variantMaskStorage2.get(key); + for (Map.Entry entry : masks2.entrySet()) { + if (!mergedMasks.containsKey(entry.getKey())) { + mergedMasks.put(entry.getKey(), append(new VariantMasks(), entry.getValue())); } - } catch (IOException e) { - throw new RuntimeException(e); } }); return merged; diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/MultialleleCounter.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/MultialleleCounter.java index 9979dcab..fec6a83c 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/MultialleleCounter.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/MultialleleCounter.java @@ -4,7 +4,6 @@ import java.util.ArrayList; import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; -import java.util.zip.GZIPInputStream; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantSpec; @@ -22,26 +21,22 @@ public static void main(String[] args) throws ClassNotFoundException, IOExceptio currentChromosome.keys().parallelStream().forEach((offsetBucket) -> { System.out.println("Starting bucket : " + offsetBucket); ConcurrentHashMap maskMap; - try { - maskMap = currentChromosome.get(offsetBucket); + maskMap = currentChromosome.get(offsetBucket); - TreeSet variantsSortedByOffset = new TreeSet(); - for (String variant : maskMap.keySet()) { - variantsSortedByOffset.add(new VariantSpec(variant)); - } - ArrayList variantsSortedByOffsetList = new ArrayList(variantsSortedByOffset); - for (int y = 1; y < variantsSortedByOffsetList.size(); y++) { - if (variantsSortedByOffsetList.get(y).metadata.offset.equals(variantsSortedByOffsetList.get(y - 1).metadata.offset)) { - try { - System.out.println("Matching offsets : " + variantsSortedByOffsetList.get(y - 1).specNotation() + " : " + variantsSortedByOffsetList.get(y).specNotation() + ":" + maskMap.get(variantsSortedByOffsetList.get(y - 1).specNotation()).heterozygousMask.toString(2) + ":" + ":" + maskMap.get(variantsSortedByOffsetList.get(y).specNotation()).heterozygousMask.toString(2)); - } catch (NullPointerException e) { - System.out.println("Matching offsets : " + variantsSortedByOffsetList.get(y - 1).specNotation() + " : " + variantsSortedByOffsetList.get(y).specNotation()); - } + TreeSet variantsSortedByOffset = new TreeSet<>(); + for (String variant : maskMap.keySet()) { + variantsSortedByOffset.add(new VariantSpec(variant)); + } + ArrayList variantsSortedByOffsetList = new ArrayList<>(variantsSortedByOffset); + for (int y = 1; y < variantsSortedByOffsetList.size(); y++) { + if (variantsSortedByOffsetList.get(y).metadata.offset.equals(variantsSortedByOffsetList.get(y - 1).metadata.offset)) { + try { + System.out.println("Matching offsets : " + variantsSortedByOffsetList.get(y - 1).specNotation() + " : " + variantsSortedByOffsetList.get(y).specNotation() + ":" + maskMap.get(variantsSortedByOffsetList.get(y - 1).specNotation()).heterozygousMask.toString(2) + ":" + ":" + maskMap.get(variantsSortedByOffsetList.get(y).specNotation()).heterozygousMask.toString(2)); + } catch (NullPointerException e) { + System.out.println("Matching offsets : " + variantsSortedByOffsetList.get(y - 1).specNotation() + " : " + variantsSortedByOffsetList.get(y).specNotation()); } } - } catch (IOException e) { - throw new UncheckedIOException(e); - } + } System.out.println("Completed bucket : " + offsetBucket); }); } diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java index 4fa0a4c9..0bd3eb95 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java @@ -191,22 +191,18 @@ private static void loadVCFs(File indexFile) throws IOException { chunkIds.addAll(chromosomeStorage.keys()); for (Integer chunkId : chunkIds) { for (String variantSpec : chromosomeStorage.get(chunkId).keySet()) { - try { - count[0]++; - VariantMasks variantMasks = chromosomeStorage.get(chunkId).get(variantSpec); - if (variantMasks != null) { - BigInteger heterozygousMask = variantMasks.heterozygousMask; - String heteroIdList = sampleIdsForMask(allSampleIds, heterozygousMask); - BigInteger homozygousMask = variantMasks.homozygousMask; - String homoIdList = sampleIdsForMask(allSampleIds, homozygousMask); - - if (!heteroIdList.isEmpty() && heteroIdList.length() < 1000) - logger.debug(variantSpec + " : heterozygous : " + heteroIdList); - if (!homoIdList.isEmpty() && homoIdList.length() < 1000) - logger.debug(variantSpec + " : homozygous : " + homoIdList); - } - } catch (IOException e) { - throw new UncheckedIOException(e); + count[0]++; + VariantMasks variantMasks = chromosomeStorage.get(chunkId).get(variantSpec); + if (variantMasks != null) { + BigInteger heterozygousMask = variantMasks.heterozygousMask; + String heteroIdList = sampleIdsForMask(allSampleIds, heterozygousMask); + BigInteger homozygousMask = variantMasks.homozygousMask; + String homoIdList = sampleIdsForMask(allSampleIds, homozygousMask); + + if (!heteroIdList.isEmpty() && heteroIdList.length() < 1000) + logger.debug(variantSpec + " : heterozygous : " + heteroIdList); + if (!homoIdList.isEmpty() && homoIdList.length() < 1000) + logger.debug(variantSpec + " : homozygous : " + homoIdList); } } if (count[0] > 50) @@ -217,22 +213,18 @@ private static void loadVCFs(File indexFile) throws IOException { for (int x = chunkIds.size() - 1; x > 0; x--) { int chunkId = chunkIds.get(x); chromosomeStorage.get(chunkId).keySet().forEach((variantSpec) -> { - try { - count[0]++; - VariantMasks variantMasks = chromosomeStorage.get(chunkId).get(variantSpec); - if (variantMasks != null) { - BigInteger heterozygousMask = variantMasks.heterozygousMask; - String heteroIdList = sampleIdsForMask(allSampleIds, heterozygousMask); - BigInteger homozygousMask = variantMasks.homozygousMask; - String homoIdList = sampleIdsForMask(allSampleIds, homozygousMask); - - if (!heteroIdList.isEmpty() && heteroIdList.length() < 1000) - logger.debug(variantSpec + " : heterozygous : " + heteroIdList); - if (!homoIdList.isEmpty() && homoIdList.length() < 1000) - logger.debug(variantSpec + " : homozygous : " + homoIdList); - } - } catch (IOException e) { - logger.error("an error occurred", e); + count[0]++; + VariantMasks variantMasks = chromosomeStorage.get(chunkId).get(variantSpec); + if (variantMasks != null) { + BigInteger heterozygousMask = variantMasks.heterozygousMask; + String heteroIdList = sampleIdsForMask(allSampleIds, heterozygousMask); + BigInteger homozygousMask = variantMasks.homozygousMask; + String homoIdList = sampleIdsForMask(allSampleIds, homozygousMask); + + if (!heteroIdList.isEmpty() && heteroIdList.length() < 1000) + logger.debug(variantSpec + " : heterozygous : " + heteroIdList); + if (!homoIdList.isEmpty() && homoIdList.length() < 1000) + logger.debug(variantSpec + " : homozygous : " + homoIdList); } }); if (count[0] > 50) @@ -663,7 +655,7 @@ private static List parseVCFIndex(File vcfIndexFile) { } }); } catch (IOException e) { - throw new RuntimeException("IOException caught parsing vcfIndexFile", e); + throw new UncheckedIOException("IOException caught parsing vcfIndexFile", e); } return new ArrayList<>(vcfSet); } diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VariantCounter.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VariantCounter.java index a861fa5b..4d22c572 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VariantCounter.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VariantCounter.java @@ -1,16 +1,12 @@ package edu.harvard.hms.dbmi.avillach.hpds.etl.genotype; -import java.io.*; -import java.util.ArrayList; -import java.util.TreeSet; -import java.util.concurrent.ConcurrentHashMap; -import java.util.zip.GZIPInputStream; - import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantSpec; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantStore; import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedByteIndexedStorage; +import java.io.IOException; +import java.util.concurrent.ConcurrentHashMap; + public class VariantCounter { public static void main(String[] args) throws ClassNotFoundException, IOException { @@ -21,14 +17,9 @@ public static void main(String[] args) throws ClassNotFoundException, IOExceptio currentChromosome = variantStore.getVariantMaskStorage().get(contig); currentChromosome.keys().parallelStream().forEach((offsetBucket)->{ ConcurrentHashMap maskMap; - try { - maskMap = currentChromosome.get(offsetBucket); - if(maskMap!=null) { - countOfVariants[0]+=maskMap.size(); - } - - } catch (IOException e) { - throw new UncheckedIOException(e); + maskMap = currentChromosome.get(offsetBucket); + if(maskMap!=null) { + countOfVariants[0]+=maskMap.size(); } }); System.out.println(contig + "," + countOfVariants[0]); diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java index 739344df..f8f99c0b 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantService.java @@ -54,7 +54,7 @@ public Collection filterVariantSetForPatientSet(Set variantSet, } } - public VariantService() throws IOException, ClassNotFoundException, InterruptedException { + public VariantService() { genomicDataDirectory = System.getProperty("HPDS_GENOMIC_DATA_DIRECTORY", "/opt/local/hpds/all/"); VARIANT_INDEX_FBBIS_STORAGE_FILE = genomicDataDirectory + "variantIndex_fbbis_storage.javabin"; VARIANT_INDEX_FBBIS_FILE = genomicDataDirectory + "variantIndex_fbbis.javabin"; @@ -145,17 +145,10 @@ private void loadGenomicCacheFiles() throws FileNotFoundException, IOException, for( int i = 0; i < bucketCount; i++) { final int _i = i; - ex.submit(new Runnable() { - @Override - public void run() { - try { - String[] variantIndexBucket = indexStore.get(_i); - System.arraycopy(variantIndexBucket, 0, _varaiantIndex2, (_i * VARIANT_INDEX_BLOCK_SIZE), variantIndexBucket.length); - log.info("loaded " + (_i * VARIANT_INDEX_BLOCK_SIZE) + " block"); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } + ex.submit(() -> { + String[] variantIndexBucket = indexStore.get(_i); + System.arraycopy(variantIndexBucket, 0, _varaiantIndex2, (_i * VARIANT_INDEX_BLOCK_SIZE), variantIndexBucket.length); + log.info("loaded " + (_i * VARIANT_INDEX_BLOCK_SIZE) + " block"); }); } objectInputStream.close(); From df0300234caf3c645b9de8de5ce6743ffc3b2427 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 9 Aug 2023 10:51:04 -0400 Subject: [PATCH 27/39] ALS-4461: Remove IOExceptions thrown from FBBIS --- .../storage/FileBackedByteIndexedStorage.java | 10 ++-- .../data/genotype/BucketIndexBySample.java | 10 +--- .../etl/genotype/GenomicDatasetMerger.java | 46 +++++++++---------- 3 files changed, 27 insertions(+), 39 deletions(-) diff --git a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java index b3307f29..7530eb6b 100644 --- a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java +++ b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java @@ -13,7 +13,6 @@ public abstract class FileBackedByteIndexedStorage i protected ConcurrentHashMap index; protected File storageFile; protected boolean completed = false; - protected Long maxStorageSize; //leave this in to not break serialization public FileBackedByteIndexedStorage(Class keyClass, Class valueClass, File storageFile) throws FileNotFoundException { @@ -34,7 +33,7 @@ public Set keys(){ return index.keySet(); } - public void put(K key, V value) throws IOException { + public void put(K key, V value) { if(completed) { throw new RuntimeException("A completed FileBackedByteIndexedStorage cannot be modified."); } @@ -47,6 +46,8 @@ public void put(K key, V value) throws IOException { storage.write(out.toByteArray()); recordIndex[1] = storage.getFilePointer() - recordIndex[0]; } + } catch (IOException e) { + throw new UncheckedIOException(e); } index.put(key, recordIndex); } @@ -72,9 +73,6 @@ public void complete() { this.completed = true; } - public boolean isComplete() { - return this.completed; - } public V get(K key) { try { if(this.storage==null) { @@ -109,7 +107,7 @@ public V get(K key) { protected abstract ByteArrayOutputStream writeObject(V value) throws IOException; - public V getOrELse(K key, V defaultValue) throws IOException { + public V getOrELse(K key, V defaultValue) { V result = get(key); return result == null ? defaultValue : result; } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java index 3c39e5b3..ade0cfce 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java @@ -122,14 +122,8 @@ public BucketIndexBySample(VariantStore variantStore, String storageDir) throws int[] processedPatients = new int[1]; patientIds.parallelStream().forEach((patientId)->{ - try { - BigInteger patientMask = new BigInteger(new String(patientBucketCharMasks[patientIds.indexOf(patientId)]),2); - patientBucketMasks.put(patientId, patientMask); - }catch(NumberFormatException e) { - log.error("NFE caught for " + patientId, e); - } catch (IOException e) { - throw new UncheckedIOException(e); - } + BigInteger patientMask = new BigInteger(new String(patientBucketCharMasks[patientIds.indexOf(patientId)]),2); + patientBucketMasks.put(patientId, patientMask); processedPatients[0] += 1; int processedPatientsCount = processedPatients[0]; if (processedPatientsCount % 1000 == 0) { diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index ec9b1bc1..315bc494 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -162,7 +162,7 @@ private Map loadInfoStores(String direct infoStores.put(filename.replace("_infoStore.javabin", ""), infoStore); ois.close(); } catch (IOException | ClassNotFoundException e) { - e.printStackTrace(); + throw new RuntimeException(e); } }); } @@ -192,34 +192,30 @@ public FileBackedJsonIndexStorage> merged = new FileBackedStorageVariantMasksImpl(new File(outputDirectory + chromosome + "masks.bin")); variantMaskStorage1.keys().forEach(key -> { - try { - Map masks1 = variantMaskStorage1.get(key); - Map masks2 = variantMaskStorage2.get(key); - if (masks2 == null) { - masks2 = Map.of(); - } + Map masks1 = variantMaskStorage1.get(key); + Map masks2 = variantMaskStorage2.get(key); + if (masks2 == null) { + masks2 = Map.of(); + } - ConcurrentHashMap mergedMasks = new ConcurrentHashMap<>(); - for (Map.Entry entry : masks1.entrySet()) { - VariantMasks variantMasks2 = masks2.get(entry.getKey()); - if (variantMasks2 == null) { - // this will have all null masks, which will result in null when - // appended to a null, or be replaced with an empty bitmask otherwise - variantMasks2 = new VariantMasks(); - } - mergedMasks.put(entry.getKey(), append(entry.getValue(), variantMasks2)); + ConcurrentHashMap mergedMasks = new ConcurrentHashMap<>(); + for (Map.Entry entry : masks1.entrySet()) { + VariantMasks variantMasks2 = masks2.get(entry.getKey()); + if (variantMasks2 == null) { + // this will have all null masks, which will result in null when + // appended to a null, or be replaced with an empty bitmask otherwise + variantMasks2 = new VariantMasks(); } - // Any entry in the second set that is not in the merged set can be merged with an empty variant mask, - // if there were a corresponding entry in set 1, it would have been merged in the previous loop - for (Map.Entry entry : masks2.entrySet()) { - if (!mergedMasks.containsKey(entry.getKey())) { - mergedMasks.put(entry.getKey(), append(new VariantMasks(), entry.getValue())); - } + mergedMasks.put(entry.getKey(), append(entry.getValue(), variantMasks2)); + } + // Any entry in the second set that is not in the merged set can be merged with an empty variant mask, + // if there were a corresponding entry in set 1, it would have been merged in the previous loop + for (Map.Entry entry : masks2.entrySet()) { + if (!mergedMasks.containsKey(entry.getKey())) { + mergedMasks.put(entry.getKey(), append(new VariantMasks(), entry.getValue())); } - merged.put(key, mergedMasks); - } catch (IOException e) { - throw new RuntimeException(e); } + merged.put(key, mergedMasks); }); ConcurrentHashMap mergedMasks = new ConcurrentHashMap<>(); From b0d13ea5457b5cbea1ba6c55ffd2b7fc8cd4ae41 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 9 Aug 2023 15:10:58 -0400 Subject: [PATCH 28/39] ALS-4461: Fix deserialization issue --- .../hpds/storage/FileBackedByteIndexedStorage.java | 2 -- .../hpds/storage/FileBackedJavaIndexedStorage.java | 2 -- .../avillach/hpds/storage/FileBackedJsonIndexStorage.java | 7 ++++++- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java index 7530eb6b..2b930927 100644 --- a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java +++ b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java @@ -1,7 +1,5 @@ package edu.harvard.hms.dbmi.avillach.hpds.storage; -import org.apache.commons.io.output.ByteArrayOutputStream; - import java.io.*; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; diff --git a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJavaIndexedStorage.java b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJavaIndexedStorage.java index 50bf375e..26968729 100644 --- a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJavaIndexedStorage.java +++ b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJavaIndexedStorage.java @@ -1,7 +1,5 @@ package edu.harvard.hms.dbmi.avillach.hpds.storage; -import org.apache.commons.io.output.ByteArrayOutputStream; - import java.io.*; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; diff --git a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJsonIndexStorage.java b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJsonIndexStorage.java index 96f87337..f8f97c6f 100644 --- a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJsonIndexStorage.java +++ b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedJsonIndexStorage.java @@ -1,6 +1,5 @@ package edu.harvard.hms.dbmi.avillach.hpds.storage; -import org.apache.commons.io.output.ByteArrayOutputStream; import org.codehaus.jackson.map.ObjectMapper; import org.codehaus.jackson.type.TypeReference; @@ -31,5 +30,11 @@ protected V readObject(byte[] buffer) { } } + // Required to populate the objectMapper on deserialization + private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException { + in.defaultReadObject(); + objectMapper = new ObjectMapper(); + } + public abstract TypeReference getTypeReference(); } From 28b26721b6c5103a8388eaed295abb934320ff16 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Thu, 10 Aug 2023 11:47:38 -0400 Subject: [PATCH 29/39] ALS-4461: Add comment explaining chromosome index merging --- .../etl/genotype/GenomicDatasetMerger.java | 48 ++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index 315bc494..1eae5fda 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -174,7 +174,11 @@ private String[] mergePatientIds() { .toArray(String[]::new); } - public Map>> mergeChromosomeMasks() throws FileNotFoundException { + /** + * For each chromosome, call mergeChromosomeMask to merge the masks + * @return + */ + public Map>> mergeChromosomeMasks() { Map>> mergedMaskStorage = new ConcurrentHashMap<>(); variantStore1.getVariantMaskStorage().keySet().parallelStream().forEach(chromosome -> { try { @@ -186,6 +190,48 @@ public Map { + * "chr22,10001031,A,G" -> "10101010", + * "chr22,10001143,G,A" -> "10101010" + * }, + * 10002 -> { + * "chr22,10002031,A,G" -> "10101010", + * "chr22,10002143,G,A" -> "10101010" + * } + * } + * variantMaskStorage2: { + * 10001 -> { + * "chr22,10001031,A,G" -> "00001111", + * "chr22,10001213,A,G" -> "00001111" + * }, + * 10003 -> { + * "chr22,10003031,A,G" -> "00001111", + * "chr22,10003213,A,G" -> "00001111" + * } + * } + * + * mergedVariantMaskStorage: { + * 10001 -> { + * "chr22,10001031,A,G" -> "1010101000001111", + * "chr22,10001213,A,G" -> "0000000000001111", + * "chr22,10001143,G,A" -> "1010101000000000" + * }, + * 10002 -> { + * "chr22,10002031,A,G" -> "1010101000000000", + * "chr22,10002143,G,A" -> "1010101000000000" + * } + * 10003 -> { + * "chr22,10003031,A,G" -> "0000000000001111", + * "chr22,10003213,A,G" -> "0000000000001111" + * } + * } + */ public FileBackedJsonIndexStorage> mergeChromosomeMask(String chromosome) throws FileNotFoundException { FileBackedJsonIndexStorage> variantMaskStorage1 = variantStore1.getVariantMaskStorage().get(chromosome); FileBackedJsonIndexStorage> variantMaskStorage2 = variantStore2.getVariantMaskStorage().get(chromosome); From 9bc9e2189e7eec1398d9c1d3f7a5670957065832 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Thu, 10 Aug 2023 12:08:01 -0400 Subject: [PATCH 30/39] ALS-4461: Add comments --- .../etl/genotype/GenomicDatasetMerger.java | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index 1eae5fda..0e7e0ed7 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -74,6 +74,28 @@ public void mergeVariantStore(Map mergeVariantIndexes() throws IOException { String[] variantIndex1 = VariantStore.loadVariantIndexFromFile(genomicDirectory1); String[] variantIndex2 = VariantStore.loadVariantIndexFromFile(genomicDirectory2); @@ -160,7 +182,6 @@ private Map loadInfoStores(String direct FileBackedByteIndexedInfoStore infoStore = (FileBackedByteIndexedInfoStore) ois.readObject(); infoStore.updateStorageDirectory(genomicDataDirectory); infoStores.put(filename.replace("_infoStore.javabin", ""), infoStore); - ois.close(); } catch (IOException | ClassNotFoundException e) { throw new RuntimeException(e); } @@ -169,6 +190,11 @@ private Map loadInfoStores(String direct return infoStores; } + /** + * Merge patient ids from both variant stores. We are simply appending patients from store 2 to patients from store 1 + * + * @return the merged patient ids + */ private String[] mergePatientIds() { return Stream.concat(Arrays.stream(variantStore1.getPatientIds()), Arrays.stream(variantStore2.getPatientIds())) .toArray(String[]::new); From b22cc1bba4151d55f1dcd43ed9d52509bafffb4d Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Fri, 11 Aug 2023 12:41:01 -0400 Subject: [PATCH 31/39] ALS-4461: Changes per PR --- .../etl/genotype/GenomicDatasetMerger.java | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index 0e7e0ed7..235dbb29 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -20,6 +20,8 @@ public class GenomicDatasetMerger { + public static final String INFO_STORE_JAVABIN_SUFFIX = "infoStore.javabin"; + public static final String VARIANT_SPEC_INDEX_FILENAME = "variantSpecIndex.javabin"; private static Logger log = LoggerFactory.getLogger(GenomicDatasetMerger.class); private final VariantStore variantStore1; @@ -64,7 +66,7 @@ public static void main(String[] args) throws IOException, ClassNotFoundExceptio public void merge() throws IOException { Map>> mergedChromosomeMasks = mergeChromosomeMasks(); mergeVariantStore(mergedChromosomeMasks); - Map mergedVariantIndexes = mergeVariantIndexes(); + mergeVariantIndexes(); } public void mergeVariantStore(Map>> mergedChromosomeMasks) { @@ -138,13 +140,12 @@ public Map mergeVariantIndexes() throws FileBackedByteIndexedStorage allValuesStore1 = infoStores1Entry.getValue().getAllValues(); FileBackedByteIndexedStorage allValuesStore2 = infoStore2.getAllValues(); - //FileBackedByteIndexedStorage mergedIndexedStorage = new FileBackedJavaIndexedStorage<>(String.class, String[].class, new File(outputDirectory)); ConcurrentHashMap> mergedInfoStoreValues = new ConcurrentHashMap<>(); - Sets.SetView allKeys = Sets.intersection(allValuesStore1.keys(), allValuesStore2.keys()); + Sets.SetView allKeys = Sets.union(allValuesStore1.keys(), allValuesStore2.keys()); for (String key : allKeys) { - Set store1Values = new HashSet<>(Arrays.asList(allValuesStore1.getOrELse(key, new Integer[]{}))); - Set store2Values = new HashSet<>(Arrays.asList(allValuesStore2.getOrELse(key, new Integer[]{}))); + Set store1Values = Set.of(allValuesStore1.getOrELse(key, new Integer[]{})); + Set store2Values = Set.of(allValuesStore2.getOrELse(key, new Integer[]{})); Set remappedValuesStore2 = store2Values.stream().map(value -> remappedIndexes[value]).collect(Collectors.toSet()); Set mergedValues = Sets.union(store1Values, remappedValuesStore2); @@ -155,10 +156,10 @@ public Map mergeVariantIndexes() throws infoStore.allValues = mergedInfoStoreValues; FileBackedByteIndexedInfoStore mergedStore = new FileBackedByteIndexedInfoStore(new File(outputDirectory), infoStore); mergedInfoStores.put(infoStores1Entry.getKey(), mergedStore); - mergedStore.write(new File(outputDirectory + infoStore.column_key + "_infoStore.javabin")); + mergedStore.write(new File(outputDirectory + infoStore.column_key + "_" + INFO_STORE_JAVABIN_SUFFIX)); } - try (FileOutputStream fos = new FileOutputStream(new File(outputDirectory, "variantSpecIndex.javabin")); + try (FileOutputStream fos = new FileOutputStream(new File(outputDirectory, VARIANT_SPEC_INDEX_FILENAME)); GZIPOutputStream gzos = new GZIPOutputStream(fos); ObjectOutputStream oos = new ObjectOutputStream(gzos);) { oos.writeObject(variantSpecList); @@ -171,7 +172,7 @@ private Map loadInfoStores(String direct Map infoStores = new HashMap<>(); File genomicDataDirectory = new File(directory); if(genomicDataDirectory.exists() && genomicDataDirectory.isDirectory()) { - Arrays.stream(genomicDataDirectory.list((file, filename)->{return filename.endsWith("infoStore.javabin");})) + Arrays.stream(genomicDataDirectory.list((file, filename)->{return filename.endsWith(INFO_STORE_JAVABIN_SUFFIX);})) .forEach((String filename)->{ try ( FileInputStream fis = new FileInputStream(directory + filename); @@ -181,7 +182,7 @@ private Map loadInfoStores(String direct log.info("loading " + filename); FileBackedByteIndexedInfoStore infoStore = (FileBackedByteIndexedInfoStore) ois.readObject(); infoStore.updateStorageDirectory(genomicDataDirectory); - infoStores.put(filename.replace("_infoStore.javabin", ""), infoStore); + infoStores.put(filename.replace("_" + INFO_STORE_JAVABIN_SUFFIX, ""), infoStore); } catch (IOException | ClassNotFoundException e) { throw new RuntimeException(e); } From 15c9f00bc315fb7342f06e77495c25d1cfff9d6c Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Thu, 17 Aug 2023 13:52:20 -0400 Subject: [PATCH 32/39] ALS-4461: Refactor variant spec index to make testing easier --- .../hpds/data/genotype/VariantStore.java | 23 ++++++++++++--- .../etl/genotype/GenomicDatasetMerger.java | 29 +++++++++---------- .../hpds/etl/genotype/NewVCFLoader.java | 13 ++------- 3 files changed, 35 insertions(+), 30 deletions(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java index fff09279..6541c9dc 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantStore.java @@ -24,6 +24,8 @@ public class VariantStore implements Serializable { private BigInteger emptyBitmask; private String[] patientIds; + private transient String[] variantSpecIndex; + private Integer variantStorageSize; private String[] vcfHeaders = new String[24]; @@ -38,6 +40,13 @@ public void setVariantMaskStorage(Map>> mergedChromosomeMasks = mergeChromosomeMasks(); - mergeVariantStore(mergedChromosomeMasks); + VariantStore mergedVariantStore = mergeVariantStore(mergedChromosomeMasks); mergeVariantIndexes(); + return mergedVariantStore; } - public void mergeVariantStore(Map>> mergedChromosomeMasks) { - VariantStore mergedVariantStore = new VariantStore(); + public VariantStore mergeVariantStore(Map>> mergedChromosomeMasks) { mergedVariantStore.setVariantMaskStorage(mergedChromosomeMasks); mergedVariantStore.setPatientIds(mergePatientIds()); - mergedVariantStore.writeInstance(outputDirectory); + return mergedVariantStore; } /** @@ -159,12 +163,7 @@ public Map mergeVariantIndexes() throws mergedStore.write(new File(outputDirectory + infoStore.column_key + "_" + INFO_STORE_JAVABIN_SUFFIX)); } - try (FileOutputStream fos = new FileOutputStream(new File(outputDirectory, VARIANT_SPEC_INDEX_FILENAME)); - GZIPOutputStream gzos = new GZIPOutputStream(fos); - ObjectOutputStream oos = new ObjectOutputStream(gzos);) { - oos.writeObject(variantSpecList); - } - + mergedVariantStore.setVariantSpecIndex(variantSpecList.toArray(new String[0])); return mergedInfoStores; } diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java index 0bd3eb95..145667b2 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java @@ -171,8 +171,6 @@ private static void loadVCFs(File indexFile) throws IOException { shutdownChunkWriteExecutor(); - saveVariantStore(store, variantMaskStorage); - saveInfoStores(); splitInfoStoresByColumn(); @@ -234,7 +232,8 @@ private static void loadVCFs(File indexFile) throws IOException { } } - saveVariantIndex(); + store.setVariantSpecIndex(variantIndexBuilder.getVariantSpecIndex().toArray(new String[0])); + saveVariantStore(store, variantMaskStorage); } private static String sampleIdsForMask(String[] sampleIds, BigInteger heterozygousMask) { @@ -364,14 +363,6 @@ private static void shutdownChunkWriteExecutor() { } } - private static void saveVariantIndex() throws IOException { - try (FileOutputStream fos = new FileOutputStream(new File(storageDir, "variantSpecIndex.javabin")); - GZIPOutputStream gzos = new GZIPOutputStream(fos); - ObjectOutputStream oos = new ObjectOutputStream(gzos);) { - oos.writeObject(variantIndexBuilder.getVariantSpecIndex()); - } - } - private static ConcurrentHashMap convertLoadingMapToMaskMap( HashMap zygosityMaskStrings_f) { ConcurrentHashMap maskMap = new ConcurrentHashMap<>(zygosityMaskStrings_f.size()); From 0a5fde55dd9e647ad9ec39d0bba84d176fb9cb2e Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Thu, 17 Aug 2023 18:26:19 -0400 Subject: [PATCH 33/39] ALS-4461: Refactor genomic dataset merger to support testing --- .../FileBackedByteIndexedInfoStore.java | 21 +++-- .../etl/genotype/GenomicDatasetMerger.java | 66 +++------------- .../genotype/GenomicDatasetMergerRunner.java | 78 +++++++++++++++++++ 3 files changed, 99 insertions(+), 66 deletions(-) create mode 100644 etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/FileBackedByteIndexedInfoStore.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/FileBackedByteIndexedInfoStore.java index 54a34d9f..4fd4ae37 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/FileBackedByteIndexedInfoStore.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/FileBackedByteIndexedInfoStore.java @@ -143,18 +143,15 @@ public void updateStorageDirectory(File storageDirectory) { allValues.updateStorageDirectory(storageDirectory); } - public void write(File outputFile) - throws IOException { - FileOutputStream fos = new FileOutputStream(outputFile); - GZIPOutputStream gzos = new GZIPOutputStream(fos); - ObjectOutputStream oos = new ObjectOutputStream(gzos); - oos.writeObject(this); - oos.flush(); - oos.close(); - gzos.flush(); - gzos.close(); - fos.flush(); - fos.close(); + public void write(File outputFile) { + try( + FileOutputStream fos = new FileOutputStream(outputFile); + GZIPOutputStream gzos = new GZIPOutputStream(fos); + ObjectOutputStream oos = new ObjectOutputStream(gzos);) { + oos.writeObject(this); + } catch (IOException e) { + throw new UncheckedIOException(e); + } } } diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index 79e3fa88..c7982ed9 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -20,29 +20,26 @@ public class GenomicDatasetMerger { - public static final String INFO_STORE_JAVABIN_SUFFIX = "infoStore.javabin"; - public static final String VARIANT_SPEC_INDEX_FILENAME = "variantSpecIndex.javabin"; private static Logger log = LoggerFactory.getLogger(GenomicDatasetMerger.class); private final VariantStore variantStore1; private final VariantStore variantStore2; - private final VariantStore mergedVariantStore; - - private final String genomicDirectory1; - private final String genomicDirectory2; + private final Map infoStores1; + private final Map infoStores2; private final String outputDirectory; - public GenomicDatasetMerger(String genomicDirectory1, String genomicDirectory2, String outputDirectory) throws IOException, ClassNotFoundException, InterruptedException { - this.genomicDirectory1 = genomicDirectory1; - this.genomicDirectory2 = genomicDirectory2; - this.variantStore1 = VariantStore.readInstance(genomicDirectory1); - this.variantStore2 = VariantStore.readInstance(genomicDirectory2); - this.mergedVariantStore = new VariantStore(); + private final VariantStore mergedVariantStore; - validate(); + public GenomicDatasetMerger(VariantStore variantStore1, VariantStore variantStore2, Map infoStores1, Map infoStores2, String outputDirectory) { + this.variantStore1 = variantStore1; + this.variantStore2 = variantStore2; + this.mergedVariantStore = new VariantStore(); + this.infoStores1 = infoStores1; + this.infoStores2 = infoStores2; this.outputDirectory = outputDirectory; + validate(); } private void validate() { @@ -54,19 +51,6 @@ private void validate() { } } - /** - * args[0]: directory containing genomic dataset 1 - * args[1]: directory containing genomic dataset 2 - * args[2]: output directory - */ - public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { - String outputDirectory = args[2]; - GenomicDatasetMerger genomicDatasetMerger = new GenomicDatasetMerger(args[0], args[1], outputDirectory); - VariantStore mergedVariantStore = genomicDatasetMerger.merge(); - - mergedVariantStore.writeInstance(outputDirectory); - } - public VariantStore merge() throws IOException { Map>> mergedChromosomeMasks = mergeChromosomeMasks(); VariantStore mergedVariantStore = mergeVariantStore(mergedChromosomeMasks); @@ -103,8 +87,8 @@ public VariantStore mergeVariantStore(Map mergeVariantIndexes() throws IOException { - String[] variantIndex1 = VariantStore.loadVariantIndexFromFile(genomicDirectory1); - String[] variantIndex2 = VariantStore.loadVariantIndexFromFile(genomicDirectory2); + String[] variantIndex1 = variantStore1.getVariantSpecIndex(); + String[] variantIndex2 = variantStore2.getVariantSpecIndex(); Map variantSpecToIndexMap = new HashMap<>(); LinkedList variantSpecList = new LinkedList<>(Arrays.asList(variantIndex1)); @@ -132,8 +116,6 @@ public Map mergeVariantIndexes() throws } } - Map infoStores1 = loadInfoStores(genomicDirectory1); - Map infoStores2 = loadInfoStores(genomicDirectory2); Map mergedInfoStores = new HashMap<>(); if (!infoStores1.keySet().equals(infoStores2.keySet())) { @@ -160,36 +142,12 @@ public Map mergeVariantIndexes() throws infoStore.allValues = mergedInfoStoreValues; FileBackedByteIndexedInfoStore mergedStore = new FileBackedByteIndexedInfoStore(new File(outputDirectory), infoStore); mergedInfoStores.put(infoStores1Entry.getKey(), mergedStore); - mergedStore.write(new File(outputDirectory + infoStore.column_key + "_" + INFO_STORE_JAVABIN_SUFFIX)); } mergedVariantStore.setVariantSpecIndex(variantSpecList.toArray(new String[0])); return mergedInfoStores; } - private Map loadInfoStores(String directory) { - Map infoStores = new HashMap<>(); - File genomicDataDirectory = new File(directory); - if(genomicDataDirectory.exists() && genomicDataDirectory.isDirectory()) { - Arrays.stream(genomicDataDirectory.list((file, filename)->{return filename.endsWith(INFO_STORE_JAVABIN_SUFFIX);})) - .forEach((String filename)->{ - try ( - FileInputStream fis = new FileInputStream(directory + filename); - GZIPInputStream gis = new GZIPInputStream(fis); - ObjectInputStream ois = new ObjectInputStream(gis) - ){ - log.info("loading " + filename); - FileBackedByteIndexedInfoStore infoStore = (FileBackedByteIndexedInfoStore) ois.readObject(); - infoStore.updateStorageDirectory(genomicDataDirectory); - infoStores.put(filename.replace("_" + INFO_STORE_JAVABIN_SUFFIX, ""), infoStore); - } catch (IOException | ClassNotFoundException e) { - throw new RuntimeException(e); - } - }); - } - return infoStores; - } - /** * Merge patient ids from both variant stores. We are simply appending patients from store 2 to patients from store 1 * diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java new file mode 100644 index 00000000..8186a350 --- /dev/null +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java @@ -0,0 +1,78 @@ +package edu.harvard.hms.dbmi.avillach.hpds.etl.genotype; + +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.FileBackedByteIndexedInfoStore; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantStore; +import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJsonIndexStorage; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.zip.GZIPInputStream; + +public class GenomicDatasetMergerRunner { + + private static Logger log = LoggerFactory.getLogger(GenomicDatasetMerger.class); + + public static final String INFO_STORE_JAVABIN_SUFFIX = "infoStore.javabin"; + public static final String VARIANT_SPEC_INDEX_FILENAME = "variantSpecIndex.javabin"; + + private static String genomicDirectory1; + private static String genomicDirectory2; + + /** + * args[0]: directory containing genomic dataset 1 + * args[1]: directory containing genomic dataset 2 + * args[2]: output directory + */ + public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { + // todo :validation + genomicDirectory1 = args[0]; + genomicDirectory2 = args[1]; + String outputDirectory = args[2]; + + Map infoStores1 = loadInfoStores(genomicDirectory1); + Map infoStores2 = loadInfoStores(genomicDirectory2); + + GenomicDatasetMerger genomicDatasetMerger = new GenomicDatasetMerger(VariantStore.readInstance(genomicDirectory1),VariantStore.readInstance(genomicDirectory2), infoStores1, infoStores2, outputDirectory); + + Map>> mergedChromosomeMasks = genomicDatasetMerger.mergeChromosomeMasks(); + VariantStore mergedVariantStore = genomicDatasetMerger.mergeVariantStore(mergedChromosomeMasks); + Map variantIndexes = genomicDatasetMerger.mergeVariantIndexes(); + + mergedVariantStore.writeInstance(outputDirectory); + variantIndexes.values().forEach(variantIndex -> { + variantIndex.write(new File(outputDirectory + variantIndex.column_key + "_" + INFO_STORE_JAVABIN_SUFFIX)); + }); + } + + private static Map loadInfoStores(String directory) { + Map infoStores = new HashMap<>(); + File genomicDataDirectory = new File(directory); + if(genomicDataDirectory.exists() && genomicDataDirectory.isDirectory()) { + Arrays.stream(genomicDataDirectory.list((file, filename)->{return filename.endsWith(INFO_STORE_JAVABIN_SUFFIX);})) + .forEach((String filename)->{ + try ( + FileInputStream fis = new FileInputStream(directory + filename); + GZIPInputStream gis = new GZIPInputStream(fis); + ObjectInputStream ois = new ObjectInputStream(gis) + ){ + log.info("loading " + filename); + FileBackedByteIndexedInfoStore infoStore = (FileBackedByteIndexedInfoStore) ois.readObject(); + infoStore.updateStorageDirectory(genomicDataDirectory); + infoStores.put(filename.replace("_" + INFO_STORE_JAVABIN_SUFFIX, ""), infoStore); + } catch (IOException | ClassNotFoundException e) { + throw new RuntimeException(e); + } + }); + } + return infoStores; + } +} From abb2659dab69e7aeea09c7fe09b667a99f23d2a1 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Mon, 21 Aug 2023 10:51:44 -0400 Subject: [PATCH 34/39] ALS-4461: Add validation to prevent patient id duplicates --- .../dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index c7982ed9..35170967 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -49,6 +49,10 @@ private void validate() { log.error(String.join(", ", variantStore2.getVariantMaskStorage().keySet())); throw new IllegalStateException("Unable to merge variant stores with different numbers of chromosomes"); } + Sets.SetView patientIntersection = Sets.intersection(Sets.newHashSet(variantStore1.getPatientIds()), Sets.newHashSet(variantStore2.getPatientIds())); + if (!patientIntersection.isEmpty()) { + throw new IllegalStateException("Cannot merge genomic datasets containing the same patient id"); + } } public VariantStore merge() throws IOException { From 93c5a17a0d1fc66c8eb2d995fb960259f18c9f20 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Mon, 21 Aug 2023 10:52:43 -0400 Subject: [PATCH 35/39] ALS-4461: Fix GenomicDatasetMerger name in jar with dependencies config --- etl/pom.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/etl/pom.xml b/etl/pom.xml index 9853188e..d56669d6 100644 --- a/etl/pom.xml +++ b/etl/pom.xml @@ -327,19 +327,19 @@ - buildGenomicDatasetMerger + GenomicDatasetMergerRunner - edu.harvard.hms.dbmi.avillach.hpds.etl.genotype.GenomicDatasetMerger + edu.harvard.hms.dbmi.avillach.hpds.etl.genotype.GenomicDatasetMergerRunner ${project.basedir}/../docker/pic-sure-hpds-etl jar-with-dependencies - GenomicDatasetMerger - GenomicDatasetMerger + GenomicDatasetMergerRunner + GenomicDatasetMergerRunner package From 3bb4fe08e250d3c5e213a302df8d283fbb311a9a Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Mon, 21 Aug 2023 11:12:14 -0400 Subject: [PATCH 36/39] ALS-4461: Add main args validation --- .../hpds/etl/genotype/GenomicDatasetMergerRunner.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java index 8186a350..70565730 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMergerRunner.java @@ -1,5 +1,6 @@ package edu.harvard.hms.dbmi.avillach.hpds.etl.genotype; +import com.google.common.base.Preconditions; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.FileBackedByteIndexedInfoStore; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantStore; @@ -33,7 +34,9 @@ public class GenomicDatasetMergerRunner { * args[2]: output directory */ public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { - // todo :validation + if (args.length != 3) { + throw new IllegalArgumentException("Three arguments must be provided: source directory 1, source directory 2, output directory"); + } genomicDirectory1 = args[0]; genomicDirectory2 = args[1]; String outputDirectory = args[2]; From c2a0f38c4f5faa31f1df73ea0a5853291981d9c7 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Mon, 21 Aug 2023 14:09:14 -0400 Subject: [PATCH 37/39] ALS-4461: Remove unused method --- .../avillach/hpds/etl/genotype/GenomicDatasetMerger.java | 7 ------- 1 file changed, 7 deletions(-) diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index 35170967..e66f0c30 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -55,13 +55,6 @@ private void validate() { } } - public VariantStore merge() throws IOException { - Map>> mergedChromosomeMasks = mergeChromosomeMasks(); - VariantStore mergedVariantStore = mergeVariantStore(mergedChromosomeMasks); - mergeVariantIndexes(); - return mergedVariantStore; - } - public VariantStore mergeVariantStore(Map>> mergedChromosomeMasks) { mergedVariantStore.setVariantMaskStorage(mergedChromosomeMasks); mergedVariantStore.setPatientIds(mergePatientIds()); From ed9a5d48c45ff89aaec88fb5c3a56f48a5a3a868 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 23 Aug 2023 09:24:03 -0400 Subject: [PATCH 38/39] ALS-4461: Remove unused classes --- .../hpds/etl/genotype/MultialleleCounter.java | 44 ------------------- .../hpds/etl/genotype/VariantCounter.java | 28 ------------ 2 files changed, 72 deletions(-) delete mode 100644 etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/MultialleleCounter.java delete mode 100644 etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VariantCounter.java diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/MultialleleCounter.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/MultialleleCounter.java deleted file mode 100644 index fec6a83c..00000000 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/MultialleleCounter.java +++ /dev/null @@ -1,44 +0,0 @@ -package edu.harvard.hms.dbmi.avillach.hpds.etl.genotype; - -import java.io.*; -import java.util.ArrayList; -import java.util.TreeSet; -import java.util.concurrent.ConcurrentHashMap; - -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantSpec; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantStore; -import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedByteIndexedStorage; - -public class MultialleleCounter { - - public static void main(String[] args) throws ClassNotFoundException, IOException { - VariantStore variantStore = VariantStore.readInstance("/opt/local/hpds/all/"); - for (String contig : variantStore.getVariantMaskStorage().keySet()) { - System.out.println("Starting contig : " + contig); - FileBackedByteIndexedStorage> - currentChromosome = variantStore.getVariantMaskStorage().get(contig); - currentChromosome.keys().parallelStream().forEach((offsetBucket) -> { - System.out.println("Starting bucket : " + offsetBucket); - ConcurrentHashMap maskMap; - maskMap = currentChromosome.get(offsetBucket); - - TreeSet variantsSortedByOffset = new TreeSet<>(); - for (String variant : maskMap.keySet()) { - variantsSortedByOffset.add(new VariantSpec(variant)); - } - ArrayList variantsSortedByOffsetList = new ArrayList<>(variantsSortedByOffset); - for (int y = 1; y < variantsSortedByOffsetList.size(); y++) { - if (variantsSortedByOffsetList.get(y).metadata.offset.equals(variantsSortedByOffsetList.get(y - 1).metadata.offset)) { - try { - System.out.println("Matching offsets : " + variantsSortedByOffsetList.get(y - 1).specNotation() + " : " + variantsSortedByOffsetList.get(y).specNotation() + ":" + maskMap.get(variantsSortedByOffsetList.get(y - 1).specNotation()).heterozygousMask.toString(2) + ":" + ":" + maskMap.get(variantsSortedByOffsetList.get(y).specNotation()).heterozygousMask.toString(2)); - } catch (NullPointerException e) { - System.out.println("Matching offsets : " + variantsSortedByOffsetList.get(y - 1).specNotation() + " : " + variantsSortedByOffsetList.get(y).specNotation()); - } - } - } - System.out.println("Completed bucket : " + offsetBucket); - }); - } - } -} diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VariantCounter.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VariantCounter.java deleted file mode 100644 index 4d22c572..00000000 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/VariantCounter.java +++ /dev/null @@ -1,28 +0,0 @@ -package edu.harvard.hms.dbmi.avillach.hpds.etl.genotype; - -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantStore; -import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedByteIndexedStorage; - -import java.io.IOException; -import java.util.concurrent.ConcurrentHashMap; - -public class VariantCounter { - - public static void main(String[] args) throws ClassNotFoundException, IOException { - VariantStore variantStore = VariantStore.readInstance("/opt/local/hpds/all/"); - for(String contig : variantStore.getVariantMaskStorage().keySet()) { - int[] countOfVariants = {0}; - FileBackedByteIndexedStorage> - currentChromosome = variantStore.getVariantMaskStorage().get(contig); - currentChromosome.keys().parallelStream().forEach((offsetBucket)->{ - ConcurrentHashMap maskMap; - maskMap = currentChromosome.get(offsetBucket); - if(maskMap!=null) { - countOfVariants[0]+=maskMap.size(); - } - }); - System.out.println(contig + "," + countOfVariants[0]); - } - } -} From f6ea975d4923b2386edddb33c2d70ee8e263f458 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 23 Aug 2023 09:34:10 -0400 Subject: [PATCH 39/39] ALS-4461: Remove potential race condition --- .../avillach/hpds/storage/FileBackedByteIndexedStorage.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java index 2b930927..1ecc466e 100644 --- a/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java +++ b/common/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/storage/FileBackedByteIndexedStorage.java @@ -73,8 +73,9 @@ public void complete() { public V get(K key) { try { - if(this.storage==null) { - synchronized(this) { + // todo: make this class immutable and remove this lock/check altogether + synchronized(this) { + if(this.storage==null) { this.open(); } }