From f363406f8f87c0ae0c7b9567a959ad63b9dcc5af Mon Sep 17 00:00:00 2001 From: WardDeb Date: Tue, 25 Apr 2023 16:23:50 +0200 Subject: [PATCH 01/14] automated P5 RC in Miseqs run if all samples are empty --- .pre-commit-config.yaml | 2 +- ChangeLog | 4 +++ src/dissectBCL/classes.py | 6 +++- src/dissectBCL/demux.py | 63 ++++++++++++++++++++++----------------- src/dissectBCL/dissect.py | 1 + src/dissectBCL/drHouse.py | 60 ++++++++++++++++++++++++++++++++++++- 6 files changed, 105 insertions(+), 31 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7c0cdc7..9a9fe5a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,4 +3,4 @@ repos: rev: "6.0.0" hooks: - id: flake8 - exclude: ^(build|docs|tests) + exclude: ^(build|docs|tests) \ No newline at end of file diff --git a/ChangeLog b/ChangeLog index e49ad1c..3f78b30 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,10 @@ CHANGES ======= +v0.2.3 +------ + +* sometimes projects are omitted from a flow cell (budgetary regions). Escape Nonetypes in the mqc building * escape nones in libtypes * include non-ommitted empty samples in kraken for email * flake fix/precommit version boost diff --git a/src/dissectBCL/classes.py b/src/dissectBCL/classes.py index 1280253..2f33c18 100644 --- a/src/dissectBCL/classes.py +++ b/src/dissectBCL/classes.py @@ -402,6 +402,9 @@ def spaceGood(freeSpace): message += "return {}: {}\n".format( subkey, self.exitStats[key][subkey] ) + if self.P5RC: + message += "\nNote that the P5s have been reverse complemented automatically!\n" + # undetermined table undtableHead = ["P7", "P5", "# reads (M)", "% of und. 
Reads"] undtableCont = [] @@ -502,7 +505,8 @@ def __init__( barcodeMask, mismatch, transferTime, - exitStats + exitStats, + P5RC ): self.undetermined = undetermined self.totalReads = totalReads diff --git a/src/dissectBCL/demux.py b/src/dissectBCL/demux.py index 42c2e1d..9ba1ec1 100644 --- a/src/dissectBCL/demux.py +++ b/src/dissectBCL/demux.py @@ -1,6 +1,7 @@ from dissectBCL.misc import joinLis, hamming, lenMask from dissectBCL.misc import P5Seriesret, matchingSheets from dissectBCL.fakeNews import mailHome +from dissectBCL.drHouse import differentialDiagnosis from itertools import combinations import os from subprocess import Popen, PIPE @@ -9,6 +10,7 @@ import pandas as pd import numpy as np import logging +import shutil def hamming2Mismatch(minVal): @@ -56,7 +58,8 @@ def detMask(seqRecipe, sampleSheetDF, outputFolder): scATACl = [ "scATAC-Seq 10xGenomics", "NextGEM_Multiome_ATAC", - "Next GEM Single Cell ATAC" + "Next GEM Single Cell ATAC", + "MUXscATAC-seq v3" ] # initialize variables mask = [] @@ -532,36 +535,39 @@ def demux(sampleSheet, flowcell, config): Path( os.path.join(outputFolder, 'bclconvert.done') ).touch() - elif exitcode == -6: - # known bug async, related to I/O lag over network (?). - # Illumina tech support wasn't sure, ticket still open. - bclRunner = Popen( - bclOpts, - stdout=PIPE - ) - exitcode = bclRunner.wait() - if exitcode == 0: - logging.info( - "bclConvert exit {} after second try.".format( - exitcode + if flowcell.sequencer == 'MiSeq': + if differentialDiagnosis( + outputFolder, + sampleSheet.ssDic[outLane]['dualIx'], + ): + logging.info("P5 RC triggered.") + # Purge existing reports. 
+ logging.info("Purge existing Reports folder") + shutil.rmtree( + os.path.join(outputFolder, 'Reports') ) - ) - Path( - os.path.join(outputFolder, 'bclconvert.done') - ).touch() + # + bclRunner = Popen( + bclOpts, + stdout=PIPE + ) + exitcode = bclRunner.wait() + logging.info( + "bclConvert P5fix exit {}".format(exitcode) + ) + # Update the sampleSheet with proper RC'ed indices. + manual_mask, manual_df, manual_dualIx, man_mmdic = readDemuxSheet( + demuxOut + ) + sampleSheet.ssDic[outLane]['sampleSheet'] = matchingSheets( + sampleSheet.ssDic[outLane]['sampleSheet'], + manual_df + ) + sampleSheet.ssDic[outLane]['P5RC'] = True else: - logging.critical("bclConvert exit {}".format(exitcode)) - mailHome( - outLane, - 'BCL-convert exit {}. Investigate.'.format( - exitcode - ), - config, - toCore=True - ) - sys.exit(1) + sampleSheet.ssDic[outLane]['P5RC'] = False else: - logging.critical("bclConvert exit {}".format(exitcode)) + logging.critical("bclConvert exit {}".format(exitcode)) mailHome( outLane, 'BCL-convert exit {}. 
Investigate.'.format( @@ -571,6 +577,7 @@ def demux(sampleSheet, flowcell, config): toCore=True ) sys.exit(1) + logging.info("Parsing stats for {}".format(outLane)) sampleSheet.ssDic[outLane]['sampleSheet'] = parseStats( outputFolder, diff --git a/src/dissectBCL/dissect.py b/src/dissectBCL/dissect.py index 6ecccf1..9a7e5e1 100644 --- a/src/dissectBCL/dissect.py +++ b/src/dissectBCL/dissect.py @@ -95,6 +95,7 @@ def main(config): flowcell.lanes, config ) + inspect(sampleSheet) exitStats['premux'] = prepConvert( flowcell, sampleSheet, diff --git a/src/dissectBCL/drHouse.py b/src/dissectBCL/drHouse.py index ff4dcbe..1f98a40 100644 --- a/src/dissectBCL/drHouse.py +++ b/src/dissectBCL/drHouse.py @@ -6,6 +6,8 @@ import glob import datetime import logging +import sys +from Bio.Seq import Seq def getDiskSpace(outputDir): @@ -66,6 +68,61 @@ def matchOptdupsReqs(optDups, ssdf): return (sorted(_optDups, key=lambda x: x[1])) +def differentialDiagnosis(outPath, dualIx): + ''' + Takes the path for an outlane, + find out if all samples are empty + if that is the case, and the run is dualindexed, + rerun bclConvert with all P5s RC'ed. + ''' + # Known barcodes + KBCPath = os.path.join( + outPath, + 'Reports', + 'Demultiplex_Stats.csv' + ) + kbcDF = pd.read_csv(KBCPath) + # Test if > 90% of samples are virtually empty. + numLowreadSamples = len(kbcDF[kbcDF['# Reads'] < 1000]) + totalSamples = len(kbcDF[kbcDF['SampleID'] != 'Undetermined']) + if not numLowreadSamples/totalSamples == 1: + return (False) + logging.warning( + 'More then 90% samples empty. Attempting to salvage by RC the P5.' + ) + if not dualIx: # Only RC P5 operations for now. 
+ return (False) + + # Read demuxSheet + demuxSheetPath = os.path.join( + outPath, 'demuxSheet.csv' + ) + demuxHeaders = [] + demuxSheet = [] + with open(demuxSheetPath) as f: + headStatus = True + for line in f: + if headStatus: + demuxHeaders.append(line.strip().split(',')) + else: + demuxSheetLine = line.strip().split(',') + ixPos = colnames.index('index2') + oldIx = demuxSheetLine[ixPos] + newIx = str(Seq(oldIx).reverse_complement()) + demuxSheetLine[ixPos] = newIx + demuxSheet.append(demuxSheetLine) + if 'Sample_ID' in line.strip(): + headStatus = False + colnames = line.strip().split(',') + shutil.move( + demuxSheetPath, + demuxSheetPath+'.bak' + ) + with open(demuxSheetPath, 'w') as f: + for l in demuxHeaders + demuxSheet: + f.write(','.join(l) +'\n') + return (True) + def initClass( outPath, initTime, flowcellID, ssDic, transferTime, exitStats, solPath ): @@ -220,5 +277,6 @@ def initClass( mismatch=mismatch, barcodeMask=barcodeMask, transferTime=transferTime, - exitStats=exitStats + exitStats=exitStats, + P5RC=ssDic['P5RC'] )) From fe0dceee4a12610aa5be955ee6d3746470c737c4 Mon Sep 17 00:00:00 2001 From: WardDeb Date: Tue, 25 Apr 2023 17:01:13 +0200 Subject: [PATCH 02/14] update email with some red --- src/dissectBCL/classes.py | 10 ++++++++-- src/dissectBCL/demux.py | 3 +++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/dissectBCL/classes.py b/src/dissectBCL/classes.py index 2f33c18..7900ae9 100644 --- a/src/dissectBCL/classes.py +++ b/src/dissectBCL/classes.py @@ -402,8 +402,6 @@ def spaceGood(freeSpace): message += "return {}: {}\n".format( subkey, self.exitStats[key][subkey] ) - if self.P5RC: - message += "\nNote that the P5s have been reverse complemented automatically!\n" # undetermined table undtableHead = ["P7", "P5", "# reads (M)", "% of und. 
Reads"] @@ -481,7 +479,14 @@ def optDupRet(optDup): parkourOrg # parkourOrg ] ) + if not self.P5RC: + P5RCstr = '' + else: + P5RCstr = '\n\nNote that the P5s have been reverse complemented automatically !' + P5RCstr += '\nThe multiQC report contains the barcodes as they are used for demultiplexing.\n' + msg = _html.render() +\ + P5RCstr +\ '

Top unknown barcodes

' +\ tabulate(undtableCont, undtableHead, tablefmt="html") +\ '

Samples

' +\ @@ -522,3 +527,4 @@ def __init__( self.mismatch = mismatch self.transferTime = transferTime self.exitStats = exitStats + self.P5RC = P5RC diff --git a/src/dissectBCL/demux.py b/src/dissectBCL/demux.py index 9ba1ec1..0de26bc 100644 --- a/src/dissectBCL/demux.py +++ b/src/dissectBCL/demux.py @@ -505,6 +505,9 @@ def demux(sampleSheet, flowcell, config): sampleSheet.ssDic[outLane]['sampleSheet'], manual_df ) + # Check for 'bak file' existence. + if os.path.exists(demuxOut + '.bak'): + sampleSheet.ssDic[outLane]['P5RC'] = True # Don't run bcl-convert if we have the touched flag. if not os.path.exists( os.path.join(outputFolder, 'bclconvert.done') From 5dbe8d9634de65a6c76e5e8bda23d9d9f0ae6773 Mon Sep 17 00:00:00 2001 From: WardDeb Date: Wed, 26 Apr 2023 11:49:03 +0200 Subject: [PATCH 03/14] flake minor formatting --- src/dissectBCL/classes.py | 10 +++++++--- src/dissectBCL/demux.py | 19 ++++++++++--------- src/dissectBCL/drHouse.py | 29 +++++++++++++++-------------- 3 files changed, 32 insertions(+), 26 deletions(-) diff --git a/src/dissectBCL/classes.py b/src/dissectBCL/classes.py index 7900ae9..389e79d 100644 --- a/src/dissectBCL/classes.py +++ b/src/dissectBCL/classes.py @@ -482,9 +482,13 @@ def optDupRet(optDup): if not self.P5RC: P5RCstr = '' else: - P5RCstr = '\n\nNote that the P5s have been reverse complemented automatically !' - P5RCstr += '\nThe multiQC report contains the barcodes as they are used for demultiplexing.\n' - + P5RCstr = '\n\n ' + P5RCstr += 'Note that the P5s have been reverse complemented ' + P5RCstr += 'automatically. \n\n' + P5RCstr += 'The multiqc report contains ' + P5RCstr += 'the index sequences ' + P5RCstr += 'as they are used for demultiplexing.' + msg = _html.render() +\ P5RCstr +\ '

Top unknown barcodes

' +\ diff --git a/src/dissectBCL/demux.py b/src/dissectBCL/demux.py index 0de26bc..572824c 100644 --- a/src/dissectBCL/demux.py +++ b/src/dissectBCL/demux.py @@ -322,7 +322,7 @@ def writeDemuxSheet(demuxOut, ssDic, laneSplitStatus): f.write('{}\n'.format(line)) -def readDemuxSheet(demuxSheet): +def readDemuxSheet(demuxSheet, what='all'): ''' In case of manual intervention. We want to have the correct info in reports / emails. @@ -371,7 +371,10 @@ def readDemuxSheet(demuxSheet): mask except NameError: mask = None - return (mask, df, dualIx, mmdic) + if what == 'all': + return (mask, df, dualIx, mmdic) + elif what == 'df': + return (df) def parseStats(outputFolder, ssdf): @@ -538,7 +541,7 @@ def demux(sampleSheet, flowcell, config): Path( os.path.join(outputFolder, 'bclconvert.done') ).touch() - if flowcell.sequencer == 'MiSeq': + if flowcell.sequencer == 'MiSeq': if differentialDiagnosis( outputFolder, sampleSheet.ssDic[outLane]['dualIx'], @@ -549,7 +552,6 @@ def demux(sampleSheet, flowcell, config): shutil.rmtree( os.path.join(outputFolder, 'Reports') ) - # bclRunner = Popen( bclOpts, stdout=PIPE @@ -559,12 +561,11 @@ def demux(sampleSheet, flowcell, config): "bclConvert P5fix exit {}".format(exitcode) ) # Update the sampleSheet with proper RC'ed indices. - manual_mask, manual_df, manual_dualIx, man_mmdic = readDemuxSheet( - demuxOut - ) - sampleSheet.ssDic[outLane]['sampleSheet'] = matchingSheets( + sampleSheet.ssDic[outLane][ + 'sampleSheet' + ] = matchingSheets( sampleSheet.ssDic[outLane]['sampleSheet'], - manual_df + readDemuxSheet(demuxOut, what='df') ) sampleSheet.ssDic[outLane]['P5RC'] = True else: diff --git a/src/dissectBCL/drHouse.py b/src/dissectBCL/drHouse.py index 1f98a40..533cb5e 100644 --- a/src/dissectBCL/drHouse.py +++ b/src/dissectBCL/drHouse.py @@ -6,7 +6,6 @@ import glob import datetime import logging -import sys from Bio.Seq import Seq @@ -90,39 +89,41 @@ def differentialDiagnosis(outPath, dualIx): logging.warning( 'More then 90% samples empty. 
Attempting to salvage by RC the P5.' ) - if not dualIx: # Only RC P5 operations for now. + if not dualIx: # Only RC P5 operations for now. return (False) # Read demuxSheet demuxSheetPath = os.path.join( outPath, 'demuxSheet.csv' ) - demuxHeaders = [] demuxSheet = [] with open(demuxSheetPath) as f: headStatus = True for line in f: - if headStatus: - demuxHeaders.append(line.strip().split(',')) - else: - demuxSheetLine = line.strip().split(',') - ixPos = colnames.index('index2') - oldIx = demuxSheetLine[ixPos] - newIx = str(Seq(oldIx).reverse_complement()) - demuxSheetLine[ixPos] = newIx - demuxSheet.append(demuxSheetLine) if 'Sample_ID' in line.strip(): headStatus = False colnames = line.strip().split(',') + demuxSheet.append(colnames) + if headStatus: + demuxSheet.append(line.strip().split(',')) + else: + if 'Sample_ID' not in line.strip(): + demuxSheetLine = line.strip().split(',') + ixPos = colnames.index('index2') + oldIx = demuxSheetLine[ixPos] + newIx = str(Seq(oldIx).reverse_complement()) + demuxSheetLine[ixPos] = newIx + demuxSheet.append(demuxSheetLine) shutil.move( demuxSheetPath, demuxSheetPath+'.bak' ) with open(demuxSheetPath, 'w') as f: - for l in demuxHeaders + demuxSheet: - f.write(','.join(l) +'\n') + for _l in demuxSheet: + f.write(','.join(_l) + '\n') return (True) + def initClass( outPath, initTime, flowcellID, ssDic, transferTime, exitStats, solPath ): From ee8f23ecd8f95aa6a0e9080db8c6e6bae58e0774 Mon Sep 17 00:00:00 2001 From: WardDeb Date: Wed, 17 May 2023 14:54:46 +0200 Subject: [PATCH 04/14] purge apostrofs --- src/dissectBCL/misc.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/dissectBCL/misc.py b/src/dissectBCL/misc.py index 4caf847..d137daa 100644 --- a/src/dissectBCL/misc.py +++ b/src/dissectBCL/misc.py @@ -364,6 +364,7 @@ def umlautDestroyer(germanWord): _o = 'ö'.encode() _O = 'Ö'.encode() _ss = 'ß'.encode() + _apstrf = "'".encode() _string = germanWord.encode() _string = _string.replace(_u, b'u') @@ -374,6 +375,7 @@ def 
umlautDestroyer(germanWord): _string = _string.replace(_o, b'o') _string = _string.replace(_O, b'O') _string = _string.replace(_ss, b'ss') + _string = _string.replace(_apstrf, b'') return (_string.decode('utf-8').replace(' ', '')) From 51e59bca65807f1efb2c76ec80c7290b7f04a317 Mon Sep 17 00:00:00 2001 From: WardDeb Date: Tue, 25 Apr 2023 16:23:50 +0200 Subject: [PATCH 05/14] automated P5 RC in Miseqs run if all samples are empty --- .pre-commit-config.yaml | 2 +- ChangeLog | 4 +++ src/dissectBCL/classes.py | 6 +++- src/dissectBCL/demux.py | 63 ++++++++++++++++++++++----------------- src/dissectBCL/dissect.py | 1 + src/dissectBCL/drHouse.py | 60 ++++++++++++++++++++++++++++++++++++- 6 files changed, 105 insertions(+), 31 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7c0cdc7..9a9fe5a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,4 +3,4 @@ repos: rev: "6.0.0" hooks: - id: flake8 - exclude: ^(build|docs|tests) + exclude: ^(build|docs|tests) \ No newline at end of file diff --git a/ChangeLog b/ChangeLog index e49ad1c..3f78b30 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,10 @@ CHANGES ======= +v0.2.3 +------ + +* sometimes projects are omitted from a flow cell (budgetary regions). Escape Nonetypes in the mqc building * escape nones in libtypes * include non-ommitted empty samples in kraken for email * flake fix/precommit version boost diff --git a/src/dissectBCL/classes.py b/src/dissectBCL/classes.py index 1280253..2f33c18 100644 --- a/src/dissectBCL/classes.py +++ b/src/dissectBCL/classes.py @@ -402,6 +402,9 @@ def spaceGood(freeSpace): message += "return {}: {}\n".format( subkey, self.exitStats[key][subkey] ) + if self.P5RC: + message += "\nNote that the P5s have been reverse complemented automatically!\n" + # undetermined table undtableHead = ["P7", "P5", "# reads (M)", "% of und. 
Reads"] undtableCont = [] @@ -502,7 +505,8 @@ def __init__( barcodeMask, mismatch, transferTime, - exitStats + exitStats, + P5RC ): self.undetermined = undetermined self.totalReads = totalReads diff --git a/src/dissectBCL/demux.py b/src/dissectBCL/demux.py index 42c2e1d..9ba1ec1 100644 --- a/src/dissectBCL/demux.py +++ b/src/dissectBCL/demux.py @@ -1,6 +1,7 @@ from dissectBCL.misc import joinLis, hamming, lenMask from dissectBCL.misc import P5Seriesret, matchingSheets from dissectBCL.fakeNews import mailHome +from dissectBCL.drHouse import differentialDiagnosis from itertools import combinations import os from subprocess import Popen, PIPE @@ -9,6 +10,7 @@ import pandas as pd import numpy as np import logging +import shutil def hamming2Mismatch(minVal): @@ -56,7 +58,8 @@ def detMask(seqRecipe, sampleSheetDF, outputFolder): scATACl = [ "scATAC-Seq 10xGenomics", "NextGEM_Multiome_ATAC", - "Next GEM Single Cell ATAC" + "Next GEM Single Cell ATAC", + "MUXscATAC-seq v3" ] # initialize variables mask = [] @@ -532,36 +535,39 @@ def demux(sampleSheet, flowcell, config): Path( os.path.join(outputFolder, 'bclconvert.done') ).touch() - elif exitcode == -6: - # known bug async, related to I/O lag over network (?). - # Illumina tech support wasn't sure, ticket still open. - bclRunner = Popen( - bclOpts, - stdout=PIPE - ) - exitcode = bclRunner.wait() - if exitcode == 0: - logging.info( - "bclConvert exit {} after second try.".format( - exitcode + if flowcell.sequencer == 'MiSeq': + if differentialDiagnosis( + outputFolder, + sampleSheet.ssDic[outLane]['dualIx'], + ): + logging.info("P5 RC triggered.") + # Purge existing reports. 
+ logging.info("Purge existing Reports folder") + shutil.rmtree( + os.path.join(outputFolder, 'Reports') ) - ) - Path( - os.path.join(outputFolder, 'bclconvert.done') - ).touch() + # + bclRunner = Popen( + bclOpts, + stdout=PIPE + ) + exitcode = bclRunner.wait() + logging.info( + "bclConvert P5fix exit {}".format(exitcode) + ) + # Update the sampleSheet with proper RC'ed indices. + manual_mask, manual_df, manual_dualIx, man_mmdic = readDemuxSheet( + demuxOut + ) + sampleSheet.ssDic[outLane]['sampleSheet'] = matchingSheets( + sampleSheet.ssDic[outLane]['sampleSheet'], + manual_df + ) + sampleSheet.ssDic[outLane]['P5RC'] = True else: - logging.critical("bclConvert exit {}".format(exitcode)) - mailHome( - outLane, - 'BCL-convert exit {}. Investigate.'.format( - exitcode - ), - config, - toCore=True - ) - sys.exit(1) + sampleSheet.ssDic[outLane]['P5RC'] = False else: - logging.critical("bclConvert exit {}".format(exitcode)) + logging.critical("bclConvert exit {}".format(exitcode)) mailHome( outLane, 'BCL-convert exit {}. 
Investigate.'.format( @@ -571,6 +577,7 @@ def demux(sampleSheet, flowcell, config): toCore=True ) sys.exit(1) + logging.info("Parsing stats for {}".format(outLane)) sampleSheet.ssDic[outLane]['sampleSheet'] = parseStats( outputFolder, diff --git a/src/dissectBCL/dissect.py b/src/dissectBCL/dissect.py index 6ecccf1..9a7e5e1 100644 --- a/src/dissectBCL/dissect.py +++ b/src/dissectBCL/dissect.py @@ -95,6 +95,7 @@ def main(config): flowcell.lanes, config ) + inspect(sampleSheet) exitStats['premux'] = prepConvert( flowcell, sampleSheet, diff --git a/src/dissectBCL/drHouse.py b/src/dissectBCL/drHouse.py index ff4dcbe..1f98a40 100644 --- a/src/dissectBCL/drHouse.py +++ b/src/dissectBCL/drHouse.py @@ -6,6 +6,8 @@ import glob import datetime import logging +import sys +from Bio.Seq import Seq def getDiskSpace(outputDir): @@ -66,6 +68,61 @@ def matchOptdupsReqs(optDups, ssdf): return (sorted(_optDups, key=lambda x: x[1])) +def differentialDiagnosis(outPath, dualIx): + ''' + Takes the path for an outlane, + find out if all samples are empty + if that is the case, and the run is dualindexed, + rerun bclConvert with all P5s RC'ed. + ''' + # Known barcodes + KBCPath = os.path.join( + outPath, + 'Reports', + 'Demultiplex_Stats.csv' + ) + kbcDF = pd.read_csv(KBCPath) + # Test if > 90% of samples are virtually empty. + numLowreadSamples = len(kbcDF[kbcDF['# Reads'] < 1000]) + totalSamples = len(kbcDF[kbcDF['SampleID'] != 'Undetermined']) + if not numLowreadSamples/totalSamples == 1: + return (False) + logging.warning( + 'More then 90% samples empty. Attempting to salvage by RC the P5.' + ) + if not dualIx: # Only RC P5 operations for now. 
+ return (False) + + # Read demuxSheet + demuxSheetPath = os.path.join( + outPath, 'demuxSheet.csv' + ) + demuxHeaders = [] + demuxSheet = [] + with open(demuxSheetPath) as f: + headStatus = True + for line in f: + if headStatus: + demuxHeaders.append(line.strip().split(',')) + else: + demuxSheetLine = line.strip().split(',') + ixPos = colnames.index('index2') + oldIx = demuxSheetLine[ixPos] + newIx = str(Seq(oldIx).reverse_complement()) + demuxSheetLine[ixPos] = newIx + demuxSheet.append(demuxSheetLine) + if 'Sample_ID' in line.strip(): + headStatus = False + colnames = line.strip().split(',') + shutil.move( + demuxSheetPath, + demuxSheetPath+'.bak' + ) + with open(demuxSheetPath, 'w') as f: + for l in demuxHeaders + demuxSheet: + f.write(','.join(l) +'\n') + return (True) + def initClass( outPath, initTime, flowcellID, ssDic, transferTime, exitStats, solPath ): @@ -220,5 +277,6 @@ def initClass( mismatch=mismatch, barcodeMask=barcodeMask, transferTime=transferTime, - exitStats=exitStats + exitStats=exitStats, + P5RC=ssDic['P5RC'] )) From 9f333ebfc22919867e555db06598f54c9773acd9 Mon Sep 17 00:00:00 2001 From: WardDeb Date: Tue, 25 Apr 2023 17:01:13 +0200 Subject: [PATCH 06/14] update email with some red --- src/dissectBCL/classes.py | 10 ++++++++-- src/dissectBCL/demux.py | 3 +++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/dissectBCL/classes.py b/src/dissectBCL/classes.py index 2f33c18..7900ae9 100644 --- a/src/dissectBCL/classes.py +++ b/src/dissectBCL/classes.py @@ -402,8 +402,6 @@ def spaceGood(freeSpace): message += "return {}: {}\n".format( subkey, self.exitStats[key][subkey] ) - if self.P5RC: - message += "\nNote that the P5s have been reverse complemented automatically!\n" # undetermined table undtableHead = ["P7", "P5", "# reads (M)", "% of und. 
Reads"] @@ -481,7 +479,14 @@ def optDupRet(optDup): parkourOrg # parkourOrg ] ) + if not self.P5RC: + P5RCstr = '' + else: + P5RCstr = '\n\nNote that the P5s have been reverse complemented automatically !' + P5RCstr += '\nThe multiQC report contains the barcodes as they are used for demultiplexing.\n' + msg = _html.render() +\ + P5RCstr +\ '

Top unknown barcodes

' +\ tabulate(undtableCont, undtableHead, tablefmt="html") +\ '

Samples

' +\ @@ -522,3 +527,4 @@ def __init__( self.mismatch = mismatch self.transferTime = transferTime self.exitStats = exitStats + self.P5RC = P5RC diff --git a/src/dissectBCL/demux.py b/src/dissectBCL/demux.py index 9ba1ec1..0de26bc 100644 --- a/src/dissectBCL/demux.py +++ b/src/dissectBCL/demux.py @@ -505,6 +505,9 @@ def demux(sampleSheet, flowcell, config): sampleSheet.ssDic[outLane]['sampleSheet'], manual_df ) + # Check for 'bak file' existence. + if os.path.exists(demuxOut + '.bak'): + sampleSheet.ssDic[outLane]['P5RC'] = True # Don't run bcl-convert if we have the touched flag. if not os.path.exists( os.path.join(outputFolder, 'bclconvert.done') From 5f1e46c775a9e04af3c6050a96775549003d1a20 Mon Sep 17 00:00:00 2001 From: WardDeb Date: Wed, 26 Apr 2023 11:49:03 +0200 Subject: [PATCH 07/14] flake minor formatting --- src/dissectBCL/classes.py | 10 +++++++--- src/dissectBCL/demux.py | 19 ++++++++++--------- src/dissectBCL/drHouse.py | 29 +++++++++++++++-------------- 3 files changed, 32 insertions(+), 26 deletions(-) diff --git a/src/dissectBCL/classes.py b/src/dissectBCL/classes.py index 7900ae9..389e79d 100644 --- a/src/dissectBCL/classes.py +++ b/src/dissectBCL/classes.py @@ -482,9 +482,13 @@ def optDupRet(optDup): if not self.P5RC: P5RCstr = '' else: - P5RCstr = '\n\nNote that the P5s have been reverse complemented automatically !' - P5RCstr += '\nThe multiQC report contains the barcodes as they are used for demultiplexing.\n' - + P5RCstr = '\n\n ' + P5RCstr += 'Note that the P5s have been reverse complemented ' + P5RCstr += 'automatically. \n\n' + P5RCstr += 'The multiqc report contains ' + P5RCstr += 'the index sequences ' + P5RCstr += 'as they are used for demultiplexing.' + msg = _html.render() +\ P5RCstr +\ '

Top unknown barcodes

' +\ diff --git a/src/dissectBCL/demux.py b/src/dissectBCL/demux.py index 0de26bc..572824c 100644 --- a/src/dissectBCL/demux.py +++ b/src/dissectBCL/demux.py @@ -322,7 +322,7 @@ def writeDemuxSheet(demuxOut, ssDic, laneSplitStatus): f.write('{}\n'.format(line)) -def readDemuxSheet(demuxSheet): +def readDemuxSheet(demuxSheet, what='all'): ''' In case of manual intervention. We want to have the correct info in reports / emails. @@ -371,7 +371,10 @@ def readDemuxSheet(demuxSheet): mask except NameError: mask = None - return (mask, df, dualIx, mmdic) + if what == 'all': + return (mask, df, dualIx, mmdic) + elif what == 'df': + return (df) def parseStats(outputFolder, ssdf): @@ -538,7 +541,7 @@ def demux(sampleSheet, flowcell, config): Path( os.path.join(outputFolder, 'bclconvert.done') ).touch() - if flowcell.sequencer == 'MiSeq': + if flowcell.sequencer == 'MiSeq': if differentialDiagnosis( outputFolder, sampleSheet.ssDic[outLane]['dualIx'], @@ -549,7 +552,6 @@ def demux(sampleSheet, flowcell, config): shutil.rmtree( os.path.join(outputFolder, 'Reports') ) - # bclRunner = Popen( bclOpts, stdout=PIPE @@ -559,12 +561,11 @@ def demux(sampleSheet, flowcell, config): "bclConvert P5fix exit {}".format(exitcode) ) # Update the sampleSheet with proper RC'ed indices. - manual_mask, manual_df, manual_dualIx, man_mmdic = readDemuxSheet( - demuxOut - ) - sampleSheet.ssDic[outLane]['sampleSheet'] = matchingSheets( + sampleSheet.ssDic[outLane][ + 'sampleSheet' + ] = matchingSheets( sampleSheet.ssDic[outLane]['sampleSheet'], - manual_df + readDemuxSheet(demuxOut, what='df') ) sampleSheet.ssDic[outLane]['P5RC'] = True else: diff --git a/src/dissectBCL/drHouse.py b/src/dissectBCL/drHouse.py index 1f98a40..533cb5e 100644 --- a/src/dissectBCL/drHouse.py +++ b/src/dissectBCL/drHouse.py @@ -6,7 +6,6 @@ import glob import datetime import logging -import sys from Bio.Seq import Seq @@ -90,39 +89,41 @@ def differentialDiagnosis(outPath, dualIx): logging.warning( 'More then 90% samples empty. 
Attempting to salvage by RC the P5.' ) - if not dualIx: # Only RC P5 operations for now. + if not dualIx: # Only RC P5 operations for now. return (False) # Read demuxSheet demuxSheetPath = os.path.join( outPath, 'demuxSheet.csv' ) - demuxHeaders = [] demuxSheet = [] with open(demuxSheetPath) as f: headStatus = True for line in f: - if headStatus: - demuxHeaders.append(line.strip().split(',')) - else: - demuxSheetLine = line.strip().split(',') - ixPos = colnames.index('index2') - oldIx = demuxSheetLine[ixPos] - newIx = str(Seq(oldIx).reverse_complement()) - demuxSheetLine[ixPos] = newIx - demuxSheet.append(demuxSheetLine) if 'Sample_ID' in line.strip(): headStatus = False colnames = line.strip().split(',') + demuxSheet.append(colnames) + if headStatus: + demuxSheet.append(line.strip().split(',')) + else: + if 'Sample_ID' not in line.strip(): + demuxSheetLine = line.strip().split(',') + ixPos = colnames.index('index2') + oldIx = demuxSheetLine[ixPos] + newIx = str(Seq(oldIx).reverse_complement()) + demuxSheetLine[ixPos] = newIx + demuxSheet.append(demuxSheetLine) shutil.move( demuxSheetPath, demuxSheetPath+'.bak' ) with open(demuxSheetPath, 'w') as f: - for l in demuxHeaders + demuxSheet: - f.write(','.join(l) +'\n') + for _l in demuxSheet: + f.write(','.join(_l) + '\n') return (True) + def initClass( outPath, initTime, flowcellID, ssDic, transferTime, exitStats, solPath ): From 115a4fc3d763131ea550feef0d7ce58fcf5ba448 Mon Sep 17 00:00:00 2001 From: WardDeb Date: Wed, 17 May 2023 14:54:46 +0200 Subject: [PATCH 08/14] purge apostrofs --- src/dissectBCL/misc.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/dissectBCL/misc.py b/src/dissectBCL/misc.py index 4caf847..d137daa 100644 --- a/src/dissectBCL/misc.py +++ b/src/dissectBCL/misc.py @@ -364,6 +364,7 @@ def umlautDestroyer(germanWord): _o = 'ö'.encode() _O = 'Ö'.encode() _ss = 'ß'.encode() + _apstrf = "'".encode() _string = germanWord.encode() _string = _string.replace(_u, b'u') @@ -374,6 +375,7 @@ def 
umlautDestroyer(germanWord): _string = _string.replace(_o, b'o') _string = _string.replace(_O, b'O') _string = _string.replace(_ss, b'ss') + _string = _string.replace(_apstrf, b'') return (_string.decode('utf-8').replace(' ', '')) From 233863d3a5467f5456a433ab26be7da9e0132566 Mon Sep 17 00:00:00 2001 From: WardDeb Date: Wed, 17 May 2023 16:15:27 +0200 Subject: [PATCH 09/14] fly rename contam --- contaminome.yml | 2 +- src/tools/prep_contaminome.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/contaminome.yml b/contaminome.yml index d6b4486..51f10be 100644 --- a/contaminome.yml +++ b/contaminome.yml @@ -11,7 +11,7 @@ eukaryotes: taxid: 10090 Drosophila melanogaster: URL: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/GCF_000001215.4_Release_6_plus_ISO1_MT_genomic.fna.gz - vulgarname: drosophila + vulgarname: fly accession: GCF_000001215.4 taxid: 7227 Aedes aegypti: diff --git a/src/tools/prep_contaminome.py b/src/tools/prep_contaminome.py index db103dc..5aa844c 100644 --- a/src/tools/prep_contaminome.py +++ b/src/tools/prep_contaminome.py @@ -11,14 +11,14 @@ ignore_chrs = { 'human': ['NC_012920.1'], # human mito 'mouse': ['NC_005089.1'], # mouse mito - 'drosophila': ['NC_024511.2'], # fly mito + 'fly': ['NC_024511.2'], # fly mito 'aedes-aegypti': ['NC_035159.1'] # aedes mito } rrna_mask = [ ('human', 'humanrrna'), ('mouse', 'mouserrna'), - ('drosophila', 'flyrrna'), + ('fly', 'flyrrna'), ('aedes-aegypti', 'aedesaegyptirrna') ] @@ -39,7 +39,7 @@ 'progrp': [14, 5, 'family'], 'human': [9606, 9, 'species'], 'mouse': [10090, 10, 'species'], - 'drosophila': [7227, 11, 'species'], + 'fly': [7227, 11, 'species'], 'aedes-aegypti': [7159, 13, 'species'], 'sea-lamprey': [7757, 13, 'species'], 'japanese-medaka': [8090, 13, 'species'], From 914f7e011d03afd0313a7abf896e023ec4e748c0 Mon Sep 17 00:00:00 2001 From: WardDeb Date: Fri, 19 May 2023 15:24:11 +0200 Subject: [PATCH 10/14] include kraken 
explanation from ini file in mqc as feature --- dissectBCL.ini | 1 + src/dissectBCL/fakeNews.py | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/dissectBCL.ini b/dissectBCL.ini index 29cd72d..80ad3b5 100644 --- a/dissectBCL.ini +++ b/dissectBCL.ini @@ -27,6 +27,7 @@ kraken2db=/path/to/kraken2_contaminome/contaminomedb [misc] mpiImg=/path/to/multiqc_headerimg.jpg +krakenExpl=" Kraken is used to classify the reads and to detect contamination.
For this we use a *custom* database, with a simplified taxonomical hierarchy (that no longer resembles any true taxonomical classification).
In brief, by default we screen for:
  • eukaryotes (human, mouse, fly, mosquito, lamprey, medaka, c-elegans, yeast, zebrafish and the moss-piglet)
  • prokaryotes (Ecoli, pseudomonas, mycoplasma and haemophilus influenza)
  • viruses (sars-cov2, influenza A,B & C, norwalk virus, rhinoviruses, drosophila C virus, phiX and lambda phage )
  • custom databases (ERCC spikes, univec core DB)
  • Note that for human, mouse, fly and mosquito we scan for mitochondrial and ribosomal contamination separately.
    Only the top (most abundant) five hits and unclassified hits are shown, all other hits are grouped under an 'other' tag.
    " [communication] deepSeq=email@seqfacility.de diff --git a/src/dissectBCL/fakeNews.py b/src/dissectBCL/fakeNews.py index 92e33ef..c1ff58a 100644 --- a/src/dissectBCL/fakeNews.py +++ b/src/dissectBCL/fakeNews.py @@ -312,6 +312,7 @@ def multiQC_yaml(config, flowcell, ssDic, project, laneFolder): ) except TypeError: sumReqRound = 'NA' + mqcyml = { "title": project, "custom_logo": config["misc"]["mpiImg"], @@ -343,7 +344,11 @@ def multiQC_yaml(config, flowcell, ssDic, project, laneFolder): 0 ) )} - ] + ], + "section_comments": { + "kraken": config["misc"]['krakenExpl'] + } + } return (mqcyml, mqcData, seqreportData, indexreportData) From 79204782a64fd5eb5d709570fd271543601b4e79 Mon Sep 17 00:00:00 2001 From: WardDeb Date: Tue, 23 May 2023 08:55:01 +0200 Subject: [PATCH 11/14] CompletionStatus check and initiation of failedrun --- src/dissectBCL/classes.py | 21 ++- src/dissectBCL/demux.py | 302 +++++++++++++++++++++----------------- src/dissectBCL/dissect.py | 98 +++++++------ src/dissectBCL/misc.py | 4 + 4 files changed, 240 insertions(+), 185 deletions(-) diff --git a/src/dissectBCL/classes.py b/src/dissectBCL/classes.py index 389e79d..1cadce8 100644 --- a/src/dissectBCL/classes.py +++ b/src/dissectBCL/classes.py @@ -31,7 +31,8 @@ def filesExist(self): self.origSS, self.runInfo, self.inBaseDir, - self.outBaseDir + self.outBaseDir, + self.runCompletionStatus ]: logging.info("Checking {}".format(f)) if not os.path.exists(f): @@ -79,12 +80,26 @@ def parseRunInfo(self): flowcellID = i.text return seqRecipe, lanes, instrument, flowcellID + # Validate successful run. + def validateRunCompletion(self): + """ + validates succesfull completion status in xml. 
+ """ + logging.info("Parsing RunCompletionStatus.xml") + tree = ET.parse(self.runCompletionStatus) + root = tree.getroot() + for i in root.iter(): + if i.tag == 'CompletionStatus': + _status = i.text + return (_status) + def __init__( self, name, bclPath, origSS, runInfo, + runCompStat, inBaseDir, outBaseDir, logFile, @@ -101,12 +116,14 @@ def __init__( self.bclPath = bclPath self.origSS = origSS self.runInfo = runInfo + self.runCompletionStatus = runCompStat self.inBaseDir = inBaseDir self.outBaseDir = outBaseDir self.logFile = logFile self.config = config # Run filesChecks self.filesExist() + self.succesfullrun = self.validateRunCompletion() # populate runInfo vars. self.seqRecipe, \ self.lanes, \ @@ -121,6 +138,8 @@ def asdict(self): 'bclPath': self.bclPath, 'original sampleSheet': self.origSS, 'runInfo': self.runInfo, + 'runCompletionStatus': self.runCompletionStatus, + 'sucessfulRun': self.succesfullrun, 'inBaseDir': self.inBaseDir, 'outBaseDir': self.outBaseDir, 'dissect logFile': self.logFile, diff --git a/src/dissectBCL/demux.py b/src/dissectBCL/demux.py index 572824c..b373955 100644 --- a/src/dissectBCL/demux.py +++ b/src/dissectBCL/demux.py @@ -441,150 +441,178 @@ def parseStats(outputFolder, ssdf): def demux(sampleSheet, flowcell, config): logging.warning("Demux module") - for outLane in sampleSheet.ssDic: - logging.info("Demuxing {}".format(outLane)) - # Check outDir - outputFolder = os.path.join(flowcell.outBaseDir, outLane) - if not os.path.exists(outputFolder): - os.mkdir(outputFolder) - logging.info("{} created.".format(outputFolder)) - else: - logging.info("{} already exists. 
Moving on.".format(outputFolder)) - # Write the demuxSheet in the outputfolder - demuxOut = os.path.join(outputFolder, "demuxSheet.csv") - # Don't remake if demuxSheet exist - if not os.path.exists(demuxOut): - logging.info("Writing demuxSheet for {}".format(outLane)) - writeDemuxSheet( - demuxOut, - sampleSheet.ssDic[outLane], - sampleSheet.laneSplitStatus - ) - else: - logging.warning( - "demuxSheet for {} already exists!".format(outLane) + # Double check for run failure + if flowcell.succesfullrun != 'SuccessfullyCompleted': + for outLane in sampleSheet.ssDic: + outputFolder = os.path.join( + flowcell.outBaseDir, outLane ) - manual_mask, manual_df, manual_dualIx, man_mmdic = readDemuxSheet( - demuxOut - ) - if ( - sampleSheet.ssDic[outLane]['mismatch'] != man_mmdic - ): - logging.info( - "mismatch dic is changed from {} into {}".format( - sampleSheet.ssDic[outLane]['mismatch'], - man_mmdic - ) - ) - sampleSheet.ssDic[outLane]['mismatch'] = man_mmdic - # if mask is changed, update: - # Mask - if ( - 'mask' in sampleSheet.ssDic[outLane] - and manual_mask != sampleSheet.ssDic[outLane]['mask'] - ): - logging.info( - "Mask is changed from {} into {}.".format( - sampleSheet.ssDic[outLane]['mask'], - manual_mask - ) - ) - sampleSheet.ssDic[outLane]['mask'] = manual_mask - # dualIx status - if ( - 'dualIx' in sampleSheet.ssDic[outLane] - and manual_dualIx != sampleSheet.ssDic[outLane]['dualIx'] - ): + if not os.path.exists(outputFolder): + os.mkdir(outputFolder) + Path( + os.path.join(outputFolder, 'run.failed') + ).touch() + return ('sequencingfailed') + else: + for outLane in sampleSheet.ssDic: + logging.info("Demuxing {}".format(outLane)) + # Check outDir + outputFolder = os.path.join(flowcell.outBaseDir, outLane) + if not os.path.exists(outputFolder): + os.mkdir(outputFolder) + logging.info("{} created.".format(outputFolder)) + else: logging.info( - "dualIx is changed from {} into {}.".format( - sampleSheet.ssDic[outLane]['dualIx'], - manual_dualIx - ) + "{} already 
exists. Moving on.".format(outputFolder) ) - sampleSheet.ssDic[outLane]['dualIx'] = manual_dualIx - - # sampleSheet - sampleSheet.ssDic[outLane]['sampleSheet'] = matchingSheets( - sampleSheet.ssDic[outLane]['sampleSheet'], - manual_df - ) - # Check for 'bak file' existence. - if os.path.exists(demuxOut + '.bak'): - sampleSheet.ssDic[outLane]['P5RC'] = True - # Don't run bcl-convert if we have the touched flag. - if not os.path.exists( - os.path.join(outputFolder, 'bclconvert.done') - ): - # Run bcl-convert - bclOpts = [ - config['software']['bclconvert'], - '--output-directory', outputFolder, - '--force', - '--bcl-input-directory', flowcell.bclPath, - '--sample-sheet', demuxOut, - '--bcl-num-conversion-threads', "20", - '--bcl-num-compression-threads', "20", - "--bcl-sampleproject-subdirectories", "true", - ] - if not sampleSheet.laneSplitStatus: - bclOpts.append('--no-lane-splitting') - bclOpts.append('true') - logging.info("Starting BCLConvert") - logging.info(" ".join(bclOpts)) - bclRunner = Popen( - bclOpts, - stdout=PIPE - ) - exitcode = bclRunner.wait() - if exitcode == 0: - logging.info("bclConvert exit {}".format(exitcode)) + if flowcell.succesfullrun != 'SuccessfullyCompleted': + print("In failure if.") Path( - os.path.join(outputFolder, 'bclconvert.done') + os.path.join(outputFolder, 'run.failed') ).touch() - if flowcell.sequencer == 'MiSeq': - if differentialDiagnosis( - outputFolder, - sampleSheet.ssDic[outLane]['dualIx'], - ): - logging.info("P5 RC triggered.") - # Purge existing reports. 
- logging.info("Purge existing Reports folder") - shutil.rmtree( - os.path.join(outputFolder, 'Reports') - ) - bclRunner = Popen( - bclOpts, - stdout=PIPE + mailHome( + "{} ignored".format(flowcell.name), + "RunCompletionStatus is not successfullycompleted.\n" + + "Marked for failure and ignored for the future.", + config, + toCore=True + ) + break + # Write the demuxSheet in the outputfolder + demuxOut = os.path.join(outputFolder, "demuxSheet.csv") + # Don't remake if demuxSheet exist + if not os.path.exists(demuxOut): + logging.info("Writing demuxSheet for {}".format(outLane)) + writeDemuxSheet( + demuxOut, + sampleSheet.ssDic[outLane], + sampleSheet.laneSplitStatus + ) + else: + logging.warning( + "demuxSheet for {} already exists!".format(outLane) + ) + man_mask, man_df, man_dualIx, man_mmdic = readDemuxSheet( + demuxOut + ) + if ( + sampleSheet.ssDic[outLane]['mismatch'] != man_mmdic + ): + logging.info( + "mismatch dic is changed from {} into {}".format( + sampleSheet.ssDic[outLane]['mismatch'], + man_mmdic ) - exitcode = bclRunner.wait() - logging.info( - "bclConvert P5fix exit {}".format(exitcode) + ) + sampleSheet.ssDic[outLane]['mismatch'] = man_mmdic + # if mask is changed, update: + # Mask + if ( + 'mask' in sampleSheet.ssDic[outLane] + and man_mask != sampleSheet.ssDic[outLane]['mask'] + ): + logging.info( + "Mask is changed from {} into {}.".format( + sampleSheet.ssDic[outLane]['mask'], + man_mask ) - # Update the sampleSheet with proper RC'ed indices. 
- sampleSheet.ssDic[outLane][ - 'sampleSheet' - ] = matchingSheets( - sampleSheet.ssDic[outLane]['sampleSheet'], - readDemuxSheet(demuxOut, what='df') + ) + sampleSheet.ssDic[outLane]['mask'] = man_mask + # dualIx status + if ( + 'dualIx' in sampleSheet.ssDic[outLane] + and man_dualIx != sampleSheet.ssDic[outLane]['dualIx'] + ): + logging.info( + "dualIx is changed from {} into {}.".format( + sampleSheet.ssDic[outLane]['dualIx'], + man_dualIx ) - sampleSheet.ssDic[outLane]['P5RC'] = True - else: - sampleSheet.ssDic[outLane]['P5RC'] = False - else: - logging.critical("bclConvert exit {}".format(exitcode)) - mailHome( - outLane, - 'BCL-convert exit {}. Investigate.'.format( - exitcode - ), - config, - toCore=True ) - sys.exit(1) + sampleSheet.ssDic[outLane]['dualIx'] = man_dualIx - logging.info("Parsing stats for {}".format(outLane)) - sampleSheet.ssDic[outLane]['sampleSheet'] = parseStats( - outputFolder, - sampleSheet.ssDic[outLane]['sampleSheet'] - ) - return (0) + # sampleSheet + sampleSheet.ssDic[outLane]['sampleSheet'] = matchingSheets( + sampleSheet.ssDic[outLane]['sampleSheet'], + man_df + ) + # Check for 'bak file' existence. + if os.path.exists(demuxOut + '.bak'): + sampleSheet.ssDic[outLane]['P5RC'] = True + # Don't run bcl-convert if we have the touched flag. 
+ if not os.path.exists( + os.path.join(outputFolder, 'bclconvert.done') + ): + # Run bcl-convert + bclOpts = [ + config['software']['bclconvert'], + '--output-directory', outputFolder, + '--force', + '--bcl-input-directory', flowcell.bclPath, + '--sample-sheet', demuxOut, + '--bcl-num-conversion-threads', "20", + '--bcl-num-compression-threads', "20", + "--bcl-sampleproject-subdirectories", "true", + ] + if not sampleSheet.laneSplitStatus: + bclOpts.append('--no-lane-splitting') + bclOpts.append('true') + logging.info("Starting BCLConvert") + logging.info(" ".join(bclOpts)) + bclRunner = Popen( + bclOpts, + stdout=PIPE + ) + exitcode = bclRunner.wait() + if exitcode == 0: + logging.info("bclConvert exit {}".format(exitcode)) + Path( + os.path.join(outputFolder, 'bclconvert.done') + ).touch() + if flowcell.sequencer == 'MiSeq': + if differentialDiagnosis( + outputFolder, + sampleSheet.ssDic[outLane]['dualIx'], + ): + logging.info("P5 RC triggered.") + # Purge existing reports. + logging.info("Purge existing Reports folder") + shutil.rmtree( + os.path.join(outputFolder, 'Reports') + ) + bclRunner = Popen( + bclOpts, + stdout=PIPE + ) + exitcode = bclRunner.wait() + logging.info( + "bclConvert P5fix exit {}".format(exitcode) + ) + # Update the sampleSheet with proper RC'ed indices. + sampleSheet.ssDic[outLane][ + 'sampleSheet' + ] = matchingSheets( + sampleSheet.ssDic[outLane]['sampleSheet'], + readDemuxSheet(demuxOut, what='df') + ) + sampleSheet.ssDic[outLane]['P5RC'] = True + else: + sampleSheet.ssDic[outLane]['P5RC'] = False + else: + logging.critical("bclConvert exit {}".format(exitcode)) + mailHome( + outLane, + 'BCL-convert exit {}. 
Investigate.'.format( + exitcode + ), + config, + toCore=True + ) + sys.exit(1) + + logging.info("Parsing stats for {}".format(outLane)) + sampleSheet.ssDic[outLane]['sampleSheet'] = parseStats( + outputFolder, + sampleSheet.ssDic[outLane]['sampleSheet'] + ) + return (0) diff --git a/src/dissectBCL/dissect.py b/src/dissectBCL/dissect.py index 9a7e5e1..0754ae7 100644 --- a/src/dissectBCL/dissect.py +++ b/src/dissectBCL/dissect.py @@ -84,6 +84,9 @@ def main(config): outBaseDir=config['Dirs']['outputDir'], origSS=os.path.join(flowcellDir, 'SampleSheet.csv'), runInfo=os.path.join(flowcellDir, 'RunInfo.xml'), + runCompStat=os.path.join( + flowcellDir, 'RunCompletionStatus.xml' + ), logFile=logFile, config=config ) @@ -108,59 +111,60 @@ def main(config): config ) inspect(sampleSheet) - - # postmux - exitStats['postmux'] = postmux( - flowcell, - sampleSheet, - config - ) - - # transfer data - for outLane in sampleSheet.ssDic: - # Copy over files. - transferTime, shipDic = fakeNews.shipFiles( - os.path.join( - flowcell.outBaseDir, - outLane - ), - config - ) - exitStats[outLane] = shipDic - # Push stats to parkour. - exitStats[outLane]['pushParkour'] = fakeNews.pushParkour( - flowcell.flowcellID, + # Break if the sequencing failed. + if not exitStats['demux'] == 'sequencingfailed': + # postmux + exitStats['postmux'] = postmux( + flowcell, sampleSheet, - config, - flowcell.bclPath + config ) - # Create diagnosis + parse QC stats - drHouse = initClass( - os.path.join( - flowcell.outBaseDir, - outLane - ), - flowcell.startTime, - sampleSheet.flowcell, - sampleSheet.ssDic[outLane], - transferTime, - exitStats, - config['Dirs']['baseDir'] + + # transfer data + for outLane in sampleSheet.ssDic: + # Copy over files. + transferTime, shipDic = fakeNews.shipFiles( + os.path.join( + flowcell.outBaseDir, + outLane + ), + config + ) + exitStats[outLane] = shipDic + # Push stats to parkour. 
+ exitStats[outLane]['pushParkour'] = fakeNews.pushParkour( + flowcell.flowcellID, + sampleSheet, + config, + flowcell.bclPath ) - inspect(drHouse) - # Create email. - subject, _html = drHouse.prepMail() - # Send it. - mailHome(subject, _html, config) - Path( + # Create diagnosis + parse QC stats + drHouse = initClass( os.path.join( flowcell.outBaseDir, - outLane, - 'communication.done' + outLane + ), + flowcell.startTime, + sampleSheet.flowcell, + sampleSheet.ssDic[outLane], + transferTime, + exitStats, + config['Dirs']['baseDir'] ) - ).touch() - # Fix logs. - fakeNews.organiseLogs(flowcell, sampleSheet) + inspect(drHouse) + # Create email. + subject, _html = drHouse.prepMail() + # Send it. + mailHome(subject, _html, config) + Path( + os.path.join( + flowcell.outBaseDir, + outLane, + 'communication.done' + ) + ).touch() + # Fix logs. + fakeNews.organiseLogs(flowcell, sampleSheet) else: print("No flowcells found. Go back to sleep.") sleep(60*60) diff --git a/src/dissectBCL/misc.py b/src/dissectBCL/misc.py index d137daa..67a99a8 100644 --- a/src/dissectBCL/misc.py +++ b/src/dissectBCL/misc.py @@ -119,6 +119,10 @@ def getNewFlowCell(config, fPath=None): os.path.join( outBaseDir, flowcellName + '*', 'fastq.made' ) + ) and not glob.glob( + os.path.join( + outBaseDir, flowcellName + '*', 'run.failed' + ) ): return (flowcellName, flowcellDir) return (None, None) From 7abdf8fbfc6b5db1da081cebb6fb9f6237604118 Mon Sep 17 00:00:00 2001 From: WardDeb Date: Wed, 24 May 2023 13:53:26 +0200 Subject: [PATCH 12/14] follow new seqfac structure --- src/dissectBCL/fakeNews.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/dissectBCL/fakeNews.py b/src/dissectBCL/fakeNews.py index c1ff58a..46156fc 100644 --- a/src/dissectBCL/fakeNews.py +++ b/src/dissectBCL/fakeNews.py @@ -18,6 +18,7 @@ import numpy as np import interop import logging +import pathlib def getDiskSpace(outputDir): @@ -511,12 +512,20 @@ def shipFiles(outPath, config): os.system(fexer) 
shipDic[project] = shipDicStat # Ship multiQC reports. + ''' + seqFacdir/Sequence_Quality_yyyy/Illumina_yyyy/outlane + ''' + yrstr = '20' + outLane[:2] seqFacDir = os.path.join( config['Dirs']['seqFacDir'], + 'Sequence_Quality_{}'.format(yrstr), + 'Illumina_{}'.format(yrstr), outLane ) if not os.path.exists(seqFacDir): - os.mkdir(seqFacDir) + pathlib.Path( + seqFacDir + ).mkdir(parents=True, exist_ok=True) for qcRepo in glob.glob( os.path.join(outPath, 'Project_*', 'multiqc_report.html') ): From 2457872d3bbd8771c4ac8d2f4b2ee64608c554ae Mon Sep 17 00:00:00 2001 From: WardDeb Date: Thu, 25 May 2023 08:23:16 +0200 Subject: [PATCH 13/14] escape validation w/ novaseq --- src/dissectBCL/classes.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/dissectBCL/classes.py b/src/dissectBCL/classes.py index 1cadce8..9f23a5b 100644 --- a/src/dissectBCL/classes.py +++ b/src/dissectBCL/classes.py @@ -31,8 +31,7 @@ def filesExist(self): self.origSS, self.runInfo, self.inBaseDir, - self.outBaseDir, - self.runCompletionStatus + self.outBaseDir ]: logging.info("Checking {}".format(f)) if not os.path.exists(f): @@ -85,12 +84,16 @@ def validateRunCompletion(self): """ validates succesfull completion status in xml. """ - logging.info("Parsing RunCompletionStatus.xml") - tree = ET.parse(self.runCompletionStatus) - root = tree.getroot() - for i in root.iter(): - if i.tag == 'CompletionStatus': - _status = i.text + logging.info("validateRunCompletion") + if self.sequencer == 'Miseq': + tree = ET.parse(self.runCompletionStatus) + root = tree.getroot() + for i in root.iter(): + if i.tag == 'CompletionStatus': + _status = i.text + else: + # no RunCompletionStatus.xml in novaseq, assume succes. 
+ _status = 'SuccessfullyCompleted' return (_status) def __init__( From b431b6b25c76b68f9a77f18f4b15a3e5d28bc4ba Mon Sep 17 00:00:00 2001 From: WardDeb Date: Thu, 25 May 2023 10:55:37 +0200 Subject: [PATCH 14/14] bugfix rerun w/ existing samplesheet (P5RC flag missing if no .bak) --- src/dissectBCL/demux.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/dissectBCL/demux.py b/src/dissectBCL/demux.py index b373955..75a9695 100644 --- a/src/dissectBCL/demux.py +++ b/src/dissectBCL/demux.py @@ -539,6 +539,8 @@ def demux(sampleSheet, flowcell, config): # Check for 'bak file' existence. if os.path.exists(demuxOut + '.bak'): sampleSheet.ssDic[outLane]['P5RC'] = True + else: + sampleSheet.ssDic[outLane]['P5RC'] = False # Don't run bcl-convert if we have the touched flag. if not os.path.exists( os.path.join(outputFolder, 'bclconvert.done')