Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge 24.7 to develop #319

Merged
merged 14 commits into from
Oct 9, 2024
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -61,4 +61,6 @@ public interface ReadData extends Serializable
String getSra_accession();

boolean isArchived();

boolean isPairedEnd();
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/
package org.labkey.api.sequenceanalysis.pipeline;

import htsjdk.samtools.SAMSequenceDictionary;
import org.jetbrains.annotations.NotNull;

import java.io.File;
Expand Down Expand Up @@ -87,6 +88,8 @@ public interface ReferenceGenome extends Serializable
*/
File getSequenceDictionary();

SAMSequenceDictionary extractDictionary();

/**
* @return True if this is a genome not defined in the main database, such as a job using an ad hoc FASTA file or genome based on querying the NT records
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ enum ScatterGatherMethod
none(false),
contig(false),
chunked(true),
fixedJobs(false);
fixedJobs(false),
specificInternals(false);

private final boolean _mayRequireSort;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,14 @@ Ext4.define('SequenceAnalysis.panel.AnalysisSectionPanel', {
handler: function(btn){
btn.up('window').close();
}
}]
}],
listeners: {
show: function(win){
if (win.getHeight() > Ext4.getBody().getHeight()) {
win.alignTo(Ext4.getBody(), 't-t?');
}
}
}
}).show(btn);
},

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,17 @@ Ext4.define('SequenceAnalysis.panel.VariantScatterGatherPanel', {
value: 10
});
}
else if (val === 'specificIntervals') {
toAdd.push({
xtype: 'sequenceanalysis-intervalfield',
labelWidth: this.labelWidth,
name: 'scatterGather.specificIntervals',
label: 'Intervals to Process',
helpPopup: 'The intervals to process. They should be in the form: chr01:102-20394',
allowBlank: false,
defaultValue: null
});
}

if (toAdd.length) {
panel.add(toAdd);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -307,10 +307,11 @@ public void cacheForRemoteServer()
}
}

@Override
@Transient // derived from fileId2, not a persisted column of its own
public boolean isPairedEnd()
{
    // Paired-end data is indicated by the presence of a second file ID.
    // NOTE(review): reads the raw FK via getFileId2() rather than resolving
    // the file with getFile2() — presumably so this works even when the
    // ExpData has not been resolved/cached yet; confirm against callers.
    return getFileId2() != null;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
import org.labkey.api.writer.PrintWriters;
import org.labkey.sequenceanalysis.ScatterGatherUtils;
import org.labkey.sequenceanalysis.SequenceAnalysisModule;
import org.labkey.sequenceanalysis.pipeline.JobContextImpl;
import org.labkey.sequenceanalysis.pipeline.ProcessVariantsHandler;
import org.labkey.sequenceanalysis.pipeline.VariantProcessingJob;
import org.labkey.sequenceanalysis.run.util.AbstractGenomicsDBImportHandler;
Expand All @@ -50,7 +49,6 @@
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ public boolean isJobComplete(PipelineJob job)
knownExpDatas.addAll(new TableSelector(us.getTable(SequenceAnalysisSchema.TABLE_READ_DATA, null), PageFlowUtil.set("fileid2"),null, null).getArrayList(Integer.class));
knownExpDatas.addAll(new TableSelector(us.getTable(SequenceAnalysisSchema.TABLE_ANALYSES, null), PageFlowUtil.set("alignmentfile"),null, null).getArrayList(Integer.class));
knownExpDatas.addAll(new TableSelector(us.getTable(SequenceAnalysisSchema.TABLE_OUTPUTFILES, null), PageFlowUtil.set("dataId"),null, null).getArrayList(Integer.class));
knownExpDatas.remove(null);
knownExpDatas = Collections.unmodifiableSet(knownExpDatas);
//messages.add("## total registered sequence ExpData: " + knownExpDatas.size());

Expand Down Expand Up @@ -239,7 +240,7 @@ public boolean isJobComplete(PipelineJob job)
writer.println("set -e");
writer.println("set -x");
writer.println("");
probableDeletes.forEach(f -> writer.println("rm -Rf " + f.getPath()));
probableDeletes.forEach(f -> writer.println("rm -Rf '" + f.getPath() + "'"));
}
catch (IOException e)
{
Expand Down Expand Up @@ -541,6 +542,14 @@ private void getOrphanFilesForDirectory(Set<Integer> knownExpDatas, Map<URI, Set

if (f.isDirectory())
{
if (f.getName().endsWith(".gdb"))
{
if (!dataMap.containsKey(new File(f, "__tiledb_workspace.tdb").toURI()))
{
orphanSequenceFiles.add(f);
}
}

getOrphanFilesForDirectory(knownExpDatas, dataMap, f, orphanSequenceFiles, orphanIndexes);
}
else
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package org.labkey.sequenceanalysis.pipeline;

import com.fasterxml.jackson.annotation.JsonIgnore;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.utils.SAMSequenceDictionaryExtractor;
import org.apache.logging.log4j.Logger;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
Expand Down Expand Up @@ -110,6 +112,13 @@ public File getSequenceDictionary()
return getWorkingFastaFile() == null ? null : new File(FileUtil.getBaseName(getWorkingFastaFile().getPath()) + ".dict");
}

@JsonIgnore
@Override
public SAMSequenceDictionary extractDictionary()
{
    // Resolve the dictionary file exactly once so the null-check and the
    // extraction are guaranteed to see the same value (the original called
    // getSequenceDictionary() twice, which recomputes the path each time).
    File dictFile = getSequenceDictionary();

    // Returns null when no working FASTA (and hence no .dict file) is available.
    return dictFile == null ? null : SAMSequenceDictionaryExtractor.extractDictionary(dictFile.toPath());
}

@Override
public String getName()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1941,7 +1941,7 @@ else if (sraIDs.contains(rd.getSra_accession()))
RestoreSraDataHandler.FastqDumpWrapper sra = new RestoreSraDataHandler.FastqDumpWrapper(getJob().getLogger());
if (doneFile.exists())
{
rdi.setFile(new File(outDir, rd.getSra_accession() + "_1.fastq.gz"), 1);
rdi.setFile(new File(outDir, rd.getSra_accession() + (rd.isPairedEnd() ? "_1" : "") + ".fastq.gz"), 1);
if (rd.getFileId2() != null)
{
rdi.setFile(new File(outDir, rd.getSra_accession() + "_2.fastq.gz"), 2);
Expand All @@ -1954,7 +1954,7 @@ else if (sraIDs.contains(rd.getSra_accession()))
outDir.mkdirs();
}

Pair<File, File> downloaded = sra.downloadSra(rd.getSra_accession(), outDir);
Pair<File, File> downloaded = sra.downloadSra(rd.getSra_accession(), outDir, rd.isPairedEnd());
rdi.setFile(downloaded.first, 1);
rdi.setFile(downloaded.second, 2);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStep;
import org.labkey.api.writer.PrintWriters;
import org.labkey.sequenceanalysis.util.ScatterGatherUtils;
import org.labkey.sequenceanalysis.util.SequenceUtil;

import java.io.File;
import java.io.IOException;
Expand Down Expand Up @@ -164,6 +165,22 @@ else if (_scatterGatherMethod == VariantProcessingStep.ScatterGatherMethod.fixed
getLogger().info("Creating " + numJobs + " jobs with approximate size: " + jobSize + " bp.");
ret = ScatterGatherUtils.divideGenome(dict, jobSize, true, -1, false);
}
else if (_scatterGatherMethod == VariantProcessingStep.ScatterGatherMethod.specificInternals)
{
try
{
String intervalsRaw = StringUtils.trimToNull(getParameterJson().getString("scatterGather.specificIntervals"));
String[] intervals = intervalsRaw.split(";");
List<Interval> values = SequenceUtil.validateAndParseIntervals(intervals, dict);

ret = new LinkedHashMap<>();
ret.put("Job1", values);
}
catch (PipelineJobException e)
{
throw new IllegalArgumentException(e);
}
}
else
{
throw new IllegalArgumentException("Unknown scatter type: " + _scatterGatherMethod.name());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ public void processFilesRemote(List<Readset> readsets, JobContext ctx) throws Un
File expectedFile2 = rd.getFileId2() == null ? null : ctx.getSequenceSupport().getCachedData(rd.getFileId2());

FastqDumpWrapper wrapper = new FastqDumpWrapper(ctx.getLogger());
Pair<File, File> files = wrapper.downloadSra(accession, ctx.getOutputDir());
Pair<File, File> files = wrapper.downloadSra(accession, ctx.getOutputDir(), rd.isPairedEnd());

long lines1 = SequenceUtil.getLineCount(files.first) / 4;
ctx.getJob().getLogger().debug("Reads in " + files.first.getName() + ": " + lines1);
Expand Down Expand Up @@ -459,7 +459,7 @@ public FastqDumpWrapper(@Nullable Logger logger)
super(logger);
}

public Pair<File, File> downloadSra(String dataset, File outDir) throws PipelineJobException
public Pair<File, File> downloadSra(String dataset, File outDir, boolean expectPaired) throws PipelineJobException
{
List<String> args = new ArrayList<>();
args.add(getExe().getPath());
Expand Down Expand Up @@ -491,7 +491,7 @@ public Pair<File, File> downloadSra(String dataset, File outDir) throws Pipeline

List<File> files = new ArrayList<>(Arrays.asList(Objects.requireNonNull(outDir.listFiles((dir, name) -> name.startsWith(dataset)))));

File file1 = new File(outDir, dataset + "_1.fastq");
File file1 = new File(outDir, dataset + (expectPaired ? "_1" : "") + ".fastq");
if (!file1.exists())
{
throw new PipelineJobException("Missing file: " + file1.getPath());
Expand All @@ -500,7 +500,12 @@ public Pair<File, File> downloadSra(String dataset, File outDir) throws Pipeline
files.remove(file1);

File file2 = new File(outDir, dataset + "_2.fastq");
if (!file2.exists())
if (expectPaired & !file2.exists())
{
throw new PipelineJobException("Missing file: " + file2.getPath());
}

if (!expectPaired)
{
file2 = null;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -271,16 +271,17 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
BamIterator bi = new BamIterator(inputBam, referenceGenome.getWorkingFastaFile(), getPipelineCtx().getLogger());

List<AlignmentAggregator> aggregators = new ArrayList<>();
File workDir = new File(getPipelineCtx().getSourceDirectory(), FileUtil.getBaseName(inputBam));
File sbtOutputLog = new File(workDir, FileUtil.getBaseName(inputBam) + ".sbt.txt.gz");

SequenceBasedTypingAlignmentAggregator agg = new SequenceBasedTypingAlignmentAggregator(getPipelineCtx().getLogger(), referenceGenome.getWorkingFastaFile(), avgBaseQualityAggregator, toolParams);
if (getProvider().getParameterByName("writeLog").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Boolean.class, false))
{
File workDir = new File(getPipelineCtx().getSourceDirectory(), FileUtil.getBaseName(inputBam));
if (!workDir.exists())
{
workDir.mkdirs();
}
File outputLog = new File(workDir, FileUtil.getBaseName(inputBam) + ".sbt.txt.gz");
agg.setOutputLog(outputLog);
agg.setOutputLog(sbtOutputLog);
}

File lineageMapFile = new File(getPipelineCtx().getSourceDirectory(), referenceGenome.getGenomeId() + "_lineageMap.txt");
Expand Down Expand Up @@ -311,6 +312,8 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
//write output as TSV
agg.writeTable(getSBTSummaryFile(outputDir, inputBam));

output.addSequenceOutput(sbtOutputLog, "SBT Results: " + inputBam.getName(), "SBT Results", rs.getReadsetId(), null, referenceGenome.getGenomeId(), null);

//optionally output FASTQ of unmapped reads
Double exportThreshold = getProvider().getParameterByName(EXPORT_UNMAPPED).extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Double.class);
if (exportThreshold != null)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
if (intervalString != null)
{
String[] intervals = intervalString.split(";");
validateIntervals(intervals);
SequenceUtil.validateAndParseIntervals(intervals, rg.extractDictionary());
for (String i : intervals)
{
extraArgs.add("-L");
Expand Down Expand Up @@ -225,25 +225,4 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
ctx.getFileManager().addSequenceOutput(so);
}
}

/**
 * Checks that each interval string is syntactically valid. Accepted forms are
 * a bare contig name (e.g. "chr1") or a contig plus a start-end coordinate
 * pair (e.g. "chr1:100-200").
 *
 * @param intervals the raw interval strings to check
 * @throws PipelineJobException if any interval is malformed
 */
public static void validateIntervals(String[] intervals) throws PipelineJobException
{
    for (String interval : intervals)
    {
        // Split off the optional coordinate suffix. The contig name itself
        // may contain a hyphen, so only the text after the colon is inspected.
        String[] parts = interval.split(":");
        if (parts.length > 2)
        {
            throw new PipelineJobException("Invalid interval: " + interval);
        }

        // When coordinates are present they must form exactly one start-end pair.
        if (parts.length == 2 && parts[1].split("-").length != 2)
        {
            throw new PipelineJobException("Invalid interval: " + interval);
        }
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.ValidationStringency;
Expand Down Expand Up @@ -661,4 +663,45 @@ public static String getLegalReadGroupName(String rsName)
{
return rsName.replaceAll(" ", "_");
}

/**
 * Parses and validates interval strings against a sequence dictionary.
 * Accepted forms are a bare contig name (e.g. "chr1"), which expands to the
 * full sequence, or a contig plus coordinates (e.g. "chr1:100-200").
 *
 * @param intervals the raw interval strings, e.g. from user input
 * @param dict the dictionary used to resolve contig names and lengths
 * @return one {@link Interval} per input string, in input order
 * @throws PipelineJobException if an interval is malformed, names an unknown
 *         contig, or has coordinates outside the contig's bounds
 */
public static List<Interval> validateAndParseIntervals(String[] intervals, SAMSequenceDictionary dict) throws PipelineJobException
{
    List<Interval> ret = new ArrayList<>();
    for (String i : intervals)
    {
        //NOTE: the contig name can contain hyphen..
        String[] tokens = i.split(":");
        if (tokens.length > 2)
        {
            throw new PipelineJobException("Invalid interval: " + i);
        }

        // The contig must exist in the dictionary in both forms. The original
        // only checked the bare-contig branch, so coordinate intervals with a
        // bogus contig slipped through unvalidated.
        SAMSequenceRecord rec = dict.getSequence(tokens[0]);
        if (rec == null)
        {
            throw new PipelineJobException("Unable to find sequence: " + tokens[0]);
        }

        if (tokens.length == 2)
        {
            String[] coords = tokens[1].split("-");
            if (coords.length != 2)
            {
                throw new PipelineJobException("Invalid interval: " + i);
            }

            int start;
            int end;
            try
            {
                start = Integer.parseInt(coords[0]);
                end = Integer.parseInt(coords[1]);
            }
            catch (NumberFormatException e)
            {
                // Surface a checked, user-readable error instead of letting
                // the unchecked NumberFormatException escape.
                throw new PipelineJobException("Non-numeric coordinates in interval: " + i, e);
            }

            // Intervals are 1-based and closed; reject inverted or out-of-range spans.
            if (start < 1 || end < start || end > rec.getSequenceLength())
            {
                throw new PipelineJobException("Coordinates out of range for interval: " + i);
            }

            ret.add(new Interval(tokens[0], start, end));
        }
        else
        {
            // Bare contig name: expand to the whole sequence.
            ret.add(new Interval(tokens[0], 1, rec.getSequenceLength()));
        }
    }

    return ret;
}
}
Loading