Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge 24.7 to develop #319

Merged
merged 14 commits into from
Oct 9, 2024
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -61,4 +61,6 @@ public interface ReadData extends Serializable
String getSra_accession();

boolean isArchived();

boolean isPairedEnd();
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/
package org.labkey.api.sequenceanalysis.pipeline;

import htsjdk.samtools.SAMSequenceDictionary;
import org.jetbrains.annotations.NotNull;

import java.io.File;
Expand Down Expand Up @@ -87,6 +88,8 @@ public interface ReferenceGenome extends Serializable
*/
File getSequenceDictionary();

SAMSequenceDictionary extractDictionary();

/**
* @return True if this is a genome not defined in the main database, such as a job using an ad hoc FASTA file or genome based on querying the NT records
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ enum ScatterGatherMethod
none(false),
contig(false),
chunked(true),
fixedJobs(false);
fixedJobs(false),
specificInternals(false);

private final boolean _mayRequireSort;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,14 @@ Ext4.define('SequenceAnalysis.panel.AnalysisSectionPanel', {
handler: function(btn){
btn.up('window').close();
}
}]
}],
listeners: {
show: function(win){
if (win.getHeight() > Ext4.getBody().getHeight()) {
win.alignTo(Ext4.getBody(), 't-t?');
}
}
}
}).show(btn);
},

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,17 @@ Ext4.define('SequenceAnalysis.panel.VariantScatterGatherPanel', {
value: 10
});
}
else if (val === 'specificIntervals') {
toAdd.push({
xtype: 'sequenceanalysis-intervalfield',
labelWidth: this.labelWidth,
name: 'scatterGather.specificIntervals',
label: 'Intervals to Process',
helpPopup: 'The intervals to process. They should be in the form: chr01:102-20394',
allowBlank: false,
defaultValue: null
});
}

if (toAdd.length) {
panel.add(toAdd);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -307,10 +307,11 @@ public void cacheForRemoteServer()
}
}

@Override
@Transient // derived from fileId2, not a persisted column of its own
public boolean isPairedEnd()
{
    // Paired-end data is indicated by the presence of a second file ID.
    // NOTE(review): reads the raw FK via getFileId2() rather than resolving
    // the file with getFile2() — presumably so this works even when the
    // ExpData has not been resolved/cached yet; confirm against callers.
    return getFileId2() != null;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
import org.labkey.api.writer.PrintWriters;
import org.labkey.sequenceanalysis.ScatterGatherUtils;
import org.labkey.sequenceanalysis.SequenceAnalysisModule;
import org.labkey.sequenceanalysis.pipeline.JobContextImpl;
import org.labkey.sequenceanalysis.pipeline.ProcessVariantsHandler;
import org.labkey.sequenceanalysis.pipeline.VariantProcessingJob;
import org.labkey.sequenceanalysis.run.util.AbstractGenomicsDBImportHandler;
Expand All @@ -50,7 +49,6 @@
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ public boolean isJobComplete(PipelineJob job)
knownExpDatas.addAll(new TableSelector(us.getTable(SequenceAnalysisSchema.TABLE_READ_DATA, null), PageFlowUtil.set("fileid2"),null, null).getArrayList(Integer.class));
knownExpDatas.addAll(new TableSelector(us.getTable(SequenceAnalysisSchema.TABLE_ANALYSES, null), PageFlowUtil.set("alignmentfile"),null, null).getArrayList(Integer.class));
knownExpDatas.addAll(new TableSelector(us.getTable(SequenceAnalysisSchema.TABLE_OUTPUTFILES, null), PageFlowUtil.set("dataId"),null, null).getArrayList(Integer.class));
knownExpDatas.remove(null);
knownExpDatas = Collections.unmodifiableSet(knownExpDatas);
//messages.add("## total registered sequence ExpData: " + knownExpDatas.size());

Expand Down Expand Up @@ -239,7 +240,7 @@ public boolean isJobComplete(PipelineJob job)
writer.println("set -e");
writer.println("set -x");
writer.println("");
probableDeletes.forEach(f -> writer.println("rm -Rf " + f.getPath()));
probableDeletes.forEach(f -> writer.println("rm -Rf '" + f.getPath() + "'"));
}
catch (IOException e)
{
Expand Down Expand Up @@ -541,6 +542,14 @@ private void getOrphanFilesForDirectory(Set<Integer> knownExpDatas, Map<URI, Set

if (f.isDirectory())
{
if (f.getName().endsWith(".gdb"))
{
if (!dataMap.containsKey(new File(f, "__tiledb_workspace.tdb").toURI()))
{
orphanSequenceFiles.add(f);
}
}

getOrphanFilesForDirectory(knownExpDatas, dataMap, f, orphanSequenceFiles, orphanIndexes);
}
else
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package org.labkey.sequenceanalysis.pipeline;

import com.fasterxml.jackson.annotation.JsonIgnore;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.utils.SAMSequenceDictionaryExtractor;
import org.apache.logging.log4j.Logger;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
Expand Down Expand Up @@ -110,6 +112,13 @@ public File getSequenceDictionary()
return getWorkingFastaFile() == null ? null : new File(FileUtil.getBaseName(getWorkingFastaFile().getPath()) + ".dict");
}

@JsonIgnore
@Override
public SAMSequenceDictionary extractDictionary()
{
    // Resolve the dictionary file exactly once so the null-check and the
    // extraction are guaranteed to see the same value (the original called
    // getSequenceDictionary() twice, which recomputes the path each time).
    File dictFile = getSequenceDictionary();

    // Returns null when no working FASTA (and hence no .dict file) is available.
    return dictFile == null ? null : SAMSequenceDictionaryExtractor.extractDictionary(dictFile.toPath());
}

@Override
public String getName()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1941,7 +1941,7 @@ else if (sraIDs.contains(rd.getSra_accession()))
RestoreSraDataHandler.FastqDumpWrapper sra = new RestoreSraDataHandler.FastqDumpWrapper(getJob().getLogger());
if (doneFile.exists())
{
rdi.setFile(new File(outDir, rd.getSra_accession() + "_1.fastq.gz"), 1);
rdi.setFile(new File(outDir, rd.getSra_accession() + (rd.isPairedEnd() ? "_1" : "") + ".fastq.gz"), 1);
if (rd.getFileId2() != null)
{
rdi.setFile(new File(outDir, rd.getSra_accession() + "_2.fastq.gz"), 2);
Expand All @@ -1954,7 +1954,7 @@ else if (sraIDs.contains(rd.getSra_accession()))
outDir.mkdirs();
}

Pair<File, File> downloaded = sra.downloadSra(rd.getSra_accession(), outDir);
Pair<File, File> downloaded = sra.downloadSra(rd.getSra_accession(), outDir, rd.isPairedEnd());
rdi.setFile(downloaded.first, 1);
rdi.setFile(downloaded.second, 2);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStep;
import org.labkey.api.writer.PrintWriters;
import org.labkey.sequenceanalysis.util.ScatterGatherUtils;
import org.labkey.sequenceanalysis.util.SequenceUtil;

import java.io.File;
import java.io.IOException;
Expand Down Expand Up @@ -164,6 +165,22 @@ else if (_scatterGatherMethod == VariantProcessingStep.ScatterGatherMethod.fixed
getLogger().info("Creating " + numJobs + " jobs with approximate size: " + jobSize + " bp.");
ret = ScatterGatherUtils.divideGenome(dict, jobSize, true, -1, false);
}
else if (_scatterGatherMethod == VariantProcessingStep.ScatterGatherMethod.specificInternals)
{
try
{
String intervalsRaw = StringUtils.trimToNull(getParameterJson().getString("scatterGather.specificIntervals"));
String[] intervals = intervalsRaw.split(";");
List<Interval> values = SequenceUtil.validateAndParseIntervals(intervals, dict);

ret = new LinkedHashMap<>();
ret.put("Job1", values);
}
catch (PipelineJobException e)
{
throw new IllegalArgumentException(e);
}
}
else
{
throw new IllegalArgumentException("Unknown scatter type: " + _scatterGatherMethod.name());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ public void processFilesRemote(List<Readset> readsets, JobContext ctx) throws Un
File expectedFile2 = rd.getFileId2() == null ? null : ctx.getSequenceSupport().getCachedData(rd.getFileId2());

FastqDumpWrapper wrapper = new FastqDumpWrapper(ctx.getLogger());
Pair<File, File> files = wrapper.downloadSra(accession, ctx.getOutputDir());
Pair<File, File> files = wrapper.downloadSra(accession, ctx.getOutputDir(), rd.isPairedEnd());

long lines1 = SequenceUtil.getLineCount(files.first) / 4;
ctx.getJob().getLogger().debug("Reads in " + files.first.getName() + ": " + lines1);
Expand Down Expand Up @@ -459,7 +459,7 @@ public FastqDumpWrapper(@Nullable Logger logger)
super(logger);
}

public Pair<File, File> downloadSra(String dataset, File outDir) throws PipelineJobException
public Pair<File, File> downloadSra(String dataset, File outDir, boolean expectPaired) throws PipelineJobException
{
List<String> args = new ArrayList<>();
args.add(getExe().getPath());
Expand Down Expand Up @@ -491,7 +491,7 @@ public Pair<File, File> downloadSra(String dataset, File outDir) throws Pipeline

List<File> files = new ArrayList<>(Arrays.asList(Objects.requireNonNull(outDir.listFiles((dir, name) -> name.startsWith(dataset)))));

File file1 = new File(outDir, dataset + "_1.fastq");
File file1 = new File(outDir, dataset + (expectPaired ? "_1" : "") + ".fastq");
if (!file1.exists())
{
throw new PipelineJobException("Missing file: " + file1.getPath());
Expand All @@ -500,7 +500,12 @@ public Pair<File, File> downloadSra(String dataset, File outDir) throws Pipeline
files.remove(file1);

File file2 = new File(outDir, dataset + "_2.fastq");
if (!file2.exists())
if (expectPaired & !file2.exists())
{
throw new PipelineJobException("Missing file: " + file2.getPath());
}

if (!expectPaired)
{
file2 = null;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -271,16 +271,17 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
BamIterator bi = new BamIterator(inputBam, referenceGenome.getWorkingFastaFile(), getPipelineCtx().getLogger());

List<AlignmentAggregator> aggregators = new ArrayList<>();
File workDir = new File(getPipelineCtx().getSourceDirectory(), FileUtil.getBaseName(inputBam));
File sbtOutputLog = new File(workDir, FileUtil.getBaseName(inputBam) + ".sbt.txt.gz");

SequenceBasedTypingAlignmentAggregator agg = new SequenceBasedTypingAlignmentAggregator(getPipelineCtx().getLogger(), referenceGenome.getWorkingFastaFile(), avgBaseQualityAggregator, toolParams);
if (getProvider().getParameterByName("writeLog").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Boolean.class, false))
{
File workDir = new File(getPipelineCtx().getSourceDirectory(), FileUtil.getBaseName(inputBam));
if (!workDir.exists())
{
workDir.mkdirs();
}
File outputLog = new File(workDir, FileUtil.getBaseName(inputBam) + ".sbt.txt.gz");
agg.setOutputLog(outputLog);
agg.setOutputLog(sbtOutputLog);
}

File lineageMapFile = new File(getPipelineCtx().getSourceDirectory(), referenceGenome.getGenomeId() + "_lineageMap.txt");
Expand Down Expand Up @@ -311,6 +312,8 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
//write output as TSV
agg.writeTable(getSBTSummaryFile(outputDir, inputBam));

output.addSequenceOutput(sbtOutputLog, "SBT Results: " + inputBam.getName(), "SBT Results", rs.getReadsetId(), null, referenceGenome.getGenomeId(), null);

//optionally output FASTQ of unmapped reads
Double exportThreshold = getProvider().getParameterByName(EXPORT_UNMAPPED).extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Double.class);
if (exportThreshold != null)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
if (intervalString != null)
{
String[] intervals = intervalString.split(";");
validateIntervals(intervals);
SequenceUtil.validateAndParseIntervals(intervals, rg.extractDictionary());
for (String i : intervals)
{
extraArgs.add("-L");
Expand Down Expand Up @@ -225,25 +225,4 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
ctx.getFileManager().addSequenceOutput(so);
}
}

/**
 * Checks that each interval string is syntactically valid. Accepted forms are
 * a bare contig name (e.g. "chr1") or a contig plus a start-end coordinate
 * pair (e.g. "chr1:100-200").
 *
 * @param intervals the raw interval strings to check
 * @throws PipelineJobException if any interval is malformed
 */
public static void validateIntervals(String[] intervals) throws PipelineJobException
{
    for (String interval : intervals)
    {
        // Split off the optional coordinate suffix. The contig name itself
        // may contain a hyphen, so only the text after the colon is inspected.
        String[] parts = interval.split(":");
        if (parts.length > 2)
        {
            throw new PipelineJobException("Invalid interval: " + interval);
        }

        // When coordinates are present they must form exactly one start-end pair.
        if (parts.length == 2 && parts[1].split("-").length != 2)
        {
            throw new PipelineJobException("Invalid interval: " + interval);
        }
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.ValidationStringency;
Expand Down Expand Up @@ -661,4 +663,45 @@ public static String getLegalReadGroupName(String rsName)
{
return rsName.replaceAll(" ", "_");
}

/**
 * Parses and validates interval strings against a sequence dictionary.
 * Accepted forms are a bare contig name (e.g. "chr1"), which expands to the
 * full sequence, or a contig plus coordinates (e.g. "chr1:100-200").
 *
 * @param intervals the raw interval strings, e.g. from user input
 * @param dict the dictionary used to resolve contig names and lengths
 * @return one {@link Interval} per input string, in input order
 * @throws PipelineJobException if an interval is malformed, names an unknown
 *         contig, or has coordinates outside the contig's bounds
 */
public static List<Interval> validateAndParseIntervals(String[] intervals, SAMSequenceDictionary dict) throws PipelineJobException
{
    List<Interval> ret = new ArrayList<>();
    for (String i : intervals)
    {
        //NOTE: the contig name can contain hyphen..
        String[] tokens = i.split(":");
        if (tokens.length > 2)
        {
            throw new PipelineJobException("Invalid interval: " + i);
        }

        // The contig must exist in the dictionary in both forms. The original
        // only checked the bare-contig branch, so coordinate intervals with a
        // bogus contig slipped through unvalidated.
        SAMSequenceRecord rec = dict.getSequence(tokens[0]);
        if (rec == null)
        {
            throw new PipelineJobException("Unable to find sequence: " + tokens[0]);
        }

        if (tokens.length == 2)
        {
            String[] coords = tokens[1].split("-");
            if (coords.length != 2)
            {
                throw new PipelineJobException("Invalid interval: " + i);
            }

            int start;
            int end;
            try
            {
                start = Integer.parseInt(coords[0]);
                end = Integer.parseInt(coords[1]);
            }
            catch (NumberFormatException e)
            {
                // Surface a checked, user-readable error instead of letting
                // the unchecked NumberFormatException escape.
                throw new PipelineJobException("Non-numeric coordinates in interval: " + i, e);
            }

            // Intervals are 1-based and closed; reject inverted or out-of-range spans.
            if (start < 1 || end < start || end > rec.getSequenceLength())
            {
                throw new PipelineJobException("Coordinates out of range for interval: " + i);
            }

            ret.add(new Interval(tokens[0], start, end));
        }
        else
        {
            // Bare contig name: expand to the whole sequence.
            ret.add(new Interval(tokens[0], 1, rec.getSequenceLength()));
        }
    }

    return ret;
}
}
Loading