diff --git a/ChangeLog b/ChangeLog
index 8f2a4f0..9377de6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,10 +1,29 @@
 CHANGES
 =======
 
+* split up zebrafish contamination into mito - rrna - zebrafish
+* or it wasn't found on fexList // let's just omit info, code is the doc (?)
+* autoreformatted for flake8 rule
+* fixes
+* works in my server :P
+* FutureWarning: Calling int on a single element Series is deprecated and will raise a TypeError in the future
+* flake8 fix
+* URL should be complete
+* use 1 config param (URL) instead of 3
+* flake8 E302
+* get\_contact\_details API endpoint // deprecated userList text file
+* Lanesplit samples (#173)
+* make lanesplit check on sampleIDs rather then samplenames, and lane aware
+* Mycoplasma implement, email update (#172)
+* Contam emails (#171)
+* ChangeLog
 * actual seq\_data dir in email
 * mycoplasma include in prep
 * include mycoplasma hyorhinis
+* docs updates (#170)
+* auto version for docs, fix readthedocs yaml (#169)
 * auto version for docs, fix readthedocs yaml
+* Docs (#168)
 * update changelog
 * include authors
 * make sure doc pytest includes reqs from the doc folder
diff --git a/contaminome.yml b/contaminome.yml
index 26c90ed..6a11283 100644
--- a/contaminome.yml
+++ b/contaminome.yml
@@ -162,6 +162,11 @@ rrna:
     vulgarname: aedesaegyptirrna
     accession: NC_035159.1
     taxid: 71591111
+  Zebrafish rRNA:
+    URL: https://raw.githubusercontent.com/WardDeb/customcontamination/main/fna/zebrarRNA.fna.gz
+    vulgarname: zebrafishrrna
+    accession: NR_145818.1
+    taxid: 79551111
 mito:
   Homo sapiens mitochondrion:
     URL: https://raw.githubusercontent.com/WardDeb/customcontamination/main/fna/humanmito.fna.gz
@@ -182,4 +187,9 @@ mito:
     URL: https://raw.githubusercontent.com/WardDeb/customcontamination/main/fna/aedesaegyptimito.fna.gz
     vulgarname: aedesaegyptimito
     accession: NC_035159.1
-    taxid: 71592222
\ No newline at end of file
+    taxid: 71592222
+  Zebrafish mitochondrion:
+    URL: https://raw.githubusercontent.com/WardDeb/customcontamination/main/fna/zebramito.fna.gz
+    vulgarname: zebrafishmito
+    accession: NC_002333.2
+    taxid: 79552222
diff --git a/dissectBCL.ini b/dissectBCL.ini
index 80ad3b5..1eaf806 100644
--- a/dissectBCL.ini
+++ b/dissectBCL.ini
@@ -13,12 +13,10 @@
 seqDir=seqfolderstr
 fex=False
 [parkour]
-pullURL=parkour.pull.url/api
-pushURL=parkour.push.url/api
 user=parkourUser
 password=parkourPw
 cert=/path/to/cert.pem
-userList=filename_with_parkour_users
+URL=parkour.domain.tld
 
 [software]
 bclconvert=/path/to/bclconvert
@@ -30,6 +28,7 @@
 mpiImg=/path/to/multiqc_headerimg.jpg
 krakenExpl="<font size="2"> Kraken is used to classify the reads and to detect contamination. <br> For this we use a *custom* database, with a simplified taxonomical hierarchy (that no longer resembles any true taxonomical classification. <br> In brief, by default we screen for: <li><b>eukaryotes</b> (human, mouse, fly, mosquito, lamprey, medaka, c-elegans, yeast, zebrafish and the moss-piglet)</li> <li><b>prokaryotes</b> (Ecoli, pseudomonas, mycoplasma and haemophilus influenza)</li> <li><b>viruses</b> (sars-cov2, influenza A,B & C, norwalk virus, rhinoviruses, drosophila C virus, phiX and lambda phage )</li> <li><b>custom databases</b> (ERCC spikes, univec core DB)</li> Note that for human, mouse, fly and mosquito we scan for mitochondrial and ribosomal contamination separately). <br> Only the top (most abundant) five hits and unclassified hits are shown, all other hits are grouped under an 'other' tag.</font>"
 [communication]
+subject=dissectBCL
 deepSeq=email@seqfacility.de
 bioinfoCore=email@bioinfocore.de
 fromAddress=sender@dissectbcl.de
diff --git a/docs/config.rst b/docs/config.rst
index 6e04f8c..5e5fe9a 100644
--- a/docs/config.rst
+++ b/docs/config.rst
@@ -63,14 +63,10 @@ parkour
 The *parkour block* contains all necessary information to communicate with `parkour <https://github.com/maxplanck-ie/parkour2>`.
 Note that this block contains sensitive information.
 
-#. pullURL: the URL to pull flowcell information from. Is parkoururl/api/analysis_list/analysis_list
-#. pushURL: the URL to push flowcell statistics to. Is parkoururl/api/run_statistics/upload
 #. user: the username for API requests
 #. pw: the password for API requests
 #. cert: the pem certificate for API requests
-#. userList: a headerless tsv file containing firstname lastname emailaddress lines.
-
-Note that the userList is used implicitly for the email command to notify end users.
+#. URL: the URL to Parkour2, e.g. `https://parkour.yourdomain.tld`.
 
 
 .. _software:
@@ -128,7 +124,7 @@ example
     user=parkourUser
     password=parkourPw
     cert=/path/to/cert.pem
-    userList=filename_with_parkour_users
+    URL=parkour.domain.tld
 
     [software]
     bclconvert=/path/to/bclconvert
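The three parkour settings that used to be configured separately (pullURL, pushURL, userList) are now derived in code from the single URL value. A minimal sketch of that pattern, not part of the patch: the config path and flowcell ID are placeholders, the endpoint paths match the pullParkour/pushParkour changes further down, and the URL is assumed to include the scheme as in the docs example; how the flowcell ID is attached to the request is purely illustrative.

import configparser
import requests

config = configparser.ConfigParser()
config.read('dissectBCL.ini')  # placeholder path

base = config['parkour']['URL']  # assumed to include the scheme, e.g. https://parkour.domain.tld
auth = (config['parkour']['user'], config['parkour']['password'])
cert = config['parkour']['cert']

# flowcell info comes from <URL>/api/analysis_list/analysis_list/ (pullParkour),
# run statistics go to <URL>/api/run_statistics/upload/ (pushParkour)
res = requests.get(
    base + '/api/analysis_list/analysis_list/',
    auth=auth,
    params={'flowcell_id': 'HWLVKDRXY'},  # made-up flowcell ID
    verify=cert
)
print(res.status_code)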
diff --git a/src/dissectBCL/classes.py b/src/dissectBCL/classes.py
index 280ff00..76f6f54 100644
--- a/src/dissectBCL/classes.py
+++ b/src/dissectBCL/classes.py
@@ -179,11 +179,18 @@ def decideSplit(self):
         laneSplitStatus = True
         # Do we need lane splitting or not ?
         # If there is at least one sample in more then 1 lane, we cannot split:
-        if sum(self.fullSS['Sample_Name'].value_counts() > 1) > 0:
-            logging.info(
-                "No lane splitting: >= 1 sample in multiple lanes."
-            )
-            laneSplitStatus = False
+        samples = list(self.fullSS['Sample_ID'].unique())
+        for _s in samples:
+            if len(
+                list(self.fullSS[
+                    self.fullSS['Sample_ID'] == _s
+                ]['Lane'].unique()
+                )
+            ) > 1:
+                logging.info(
+                    "No lane splitting: >= 1 sample in multiple lanes."
+                )
+                laneSplitStatus = False
         # If one project is split over multiple lanes, we also don't split:
         projects = list(self.fullSS['Sample_Project'].unique())
         for project in projects:
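For reference, a vectorized equivalent of the per-sample loop above, on a toy sample sheet; this is an illustration of the lane-aware check on Sample_ID, not the code the patch adds.

import pandas as pd

fullSS = pd.DataFrame({
    'Sample_ID': ['sample1', 'sample1', 'sample2'],
    'Lane': [1, 2, 2],
})
laneSplitStatus = True
# any Sample_ID seen in more than one lane disables lane splitting
if fullSS.groupby('Sample_ID')['Lane'].nunique().max() > 1:
    laneSplitStatus = False  # sample1 sits in lanes 1 and 2
print(laneSplitStatus)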
diff --git a/src/dissectBCL/drHouse.py b/src/dissectBCL/drHouse.py
index 533cb5e..4e2df8f 100644
--- a/src/dissectBCL/drHouse.py
+++ b/src/dissectBCL/drHouse.py
@@ -142,7 +142,11 @@ def initClass(
     muxDF = pd.read_csv(muxPath)
     totalReads = int(muxDF['# Reads'].sum())
     if len(muxDF[muxDF['SampleID'] == 'Undetermined']) == 1:
-        undReads = int(muxDF[muxDF['SampleID'] == 'Undetermined']['# Reads'])
+        undReads = int(
+            muxDF[
+                muxDF['SampleID'] == 'Undetermined'
+            ]['# Reads'].iloc[0]
+        )
     else:
         undDic = dict(
             muxDF[
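Background for the .iloc[0] change, as a toy example with made-up numbers: recent pandas versions deprecate calling int() on a single-element Series, which is the FutureWarning mentioned in the ChangeLog.

import pandas as pd

muxDF = pd.DataFrame({'SampleID': ['Undetermined'], '# Reads': [12345]})
sub = muxDF[muxDF['SampleID'] == 'Undetermined']['# Reads']
# undReads = int(sub)       # single-element Series: FutureWarning now, TypeError later
undReads = int(sub.iloc[0])  # explicit positional access keeps it a plain scalar
print(undReads)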
diff --git a/src/dissectBCL/fakeNews.py b/src/dissectBCL/fakeNews.py
index 8613452..62db139 100644
--- a/src/dissectBCL/fakeNews.py
+++ b/src/dissectBCL/fakeNews.py
@@ -48,7 +48,7 @@ def pullParkour(flowcellID, config):
     )
     d = {'flowcell_id': FID}
     res = requests.get(
-        config['parkour']['pullURL'],
+        config['parkour']['URL'] + '/api/analysis_list/analysis_list/',
         auth=(
             config['parkour']['user'],
             config['parkour']['password']
@@ -185,7 +185,7 @@ def pushParkour(flowcellID, sampleSheet, config, flowcellBase):
     d['matrix'] = json.dumps(list(laneDict.values()))
     logging.info("Pushing FID with dic {} {}".format(FID, d))
     pushParkStat = requests.post(
-        config.get("parkour", "pushURL"),
+        config.get("parkour", "URL") + '/api/run_statistics/upload/',
         auth=(
             config.get("parkour", "user"),
             config.get("parkour", "password")
@@ -360,7 +360,8 @@ def multiQC_yaml(config, flowcell, ssDic, project, laneFolder):
 
 def mailHome(subject, _html, config, toCore=False):
     mailer = MIMEMultipart('alternative')
-    mailer['Subject'] = '[dissectBCL] [{}] '.format(
+    mailer['Subject'] = '[{}] [{}] '.format(
+        config['communication']['subject'],
         version('dissectBCL')
     ) + subject
     mailer['From'] = config['communication']['fromAddress']
@@ -580,21 +581,30 @@ def organiseLogs(flowcell, sampleSheet):
                 mvFile
             )
             shutil.move(fileIn, fileOut)
+        # Write out ssdf.
         outssdf = os.path.join(_logDir, 'sampleSheetdf.tsv')
         sampleSheet.ssDic[outLane]['sampleSheet'].to_csv(outssdf, sep='\t')
 
-        # Write out the yaml files.
-        yaml = ruamel.yaml.YAML()
-        yaml.indent(mapping=2, sequence=4, offset=2)
         # write out outLaneInfo.yaml
+        dic0 = sampleSheet.ssDic[outLane]
+        del dic0['sampleSheet']
+        yaml0 = ruamel.yaml.YAML()
+        yaml0.indent(mapping=2, sequence=4, offset=2)
         outLaneInfo = os.path.join(_logDir, 'outLaneInfo.yaml')
-        dic = sampleSheet.ssDic[outLane]
-        del dic['sampleSheet']
         with open(outLaneInfo, 'w') as f:
-            ruamel.yaml.dump(dic, f)
+            yaml0.dump(dic0, f)
+
+        # write out config.ini
+        dic1 = flowcell.asdict()
+        flowcellConfig = os.path.join(_logDir, 'config.ini')
+        with open(flowcellConfig, 'w') as f:
+            dic1['config'].write(f)
+        # write out flowcellInfo.yaml
+        del dic1['config']
+        yaml1 = ruamel.yaml.YAML()
+        yaml1.indent(mapping=2, sequence=4, offset=2)
         flowcellInfo = os.path.join(_logDir, 'flowcellInfo.yaml')
-        dic = flowcell.asdict()
         with open(flowcellInfo, 'w') as f:
-            yaml1.dump(dic1, f)
+            yaml1.dump(dic1, f)
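organiseLogs now leaves three artifacts per output lane: outLaneInfo.yaml, config.ini and flowcellInfo.yaml. A short sketch of reading them back, assuming a placeholder log directory and that the 'config' entry written above is a configparser object (as dic1['config'].write(f) implies).

import configparser
import os
import ruamel.yaml

_logDir = '/path/to/outLane/Logs'  # placeholder

yaml = ruamel.yaml.YAML()
with open(os.path.join(_logDir, 'outLaneInfo.yaml')) as f:
    outLaneInfo = yaml.load(f)
with open(os.path.join(_logDir, 'flowcellInfo.yaml')) as f:
    flowcellInfo = yaml.load(f)

runConfig = configparser.ConfigParser()
runConfig.read(os.path.join(_logDir, 'config.ini'))  # the flowcell's config dump
print(list(outLaneInfo), list(flowcellInfo), runConfig.sections())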
diff --git a/src/dissectBCL/misc.py b/src/dissectBCL/misc.py
index 33ea039..6881a91 100644
--- a/src/dissectBCL/misc.py
+++ b/src/dissectBCL/misc.py
@@ -156,7 +156,7 @@ def hamming(s1, s2):
     # We have some basket cases (multimodal)
     # Where barcode is nan (type as float)
     # Ignore these for now.
-    if type(s1) == float or type(s2) == float:
+    if isinstance(s1, float) or isinstance(s2, float):
         return 0
     if s1 is None or s2 is None:
         return 0
diff --git a/src/tools/emailProjectFinished.py b/src/tools/emailProjectFinished.py
index 5fd9670..0d9fd68 100755
--- a/src/tools/emailProjectFinished.py
+++ b/src/tools/emailProjectFinished.py
@@ -3,57 +3,27 @@
 import sys
 import smtplib
 import os
+import requests
 from dissectBCL.misc import getConf
 from email.mime.text import MIMEText
 import glob
 
 
-def fetchFirstNameAndEmail(lastName, config):
-    # search in dictionary file defined in config for lastName
-    try:
-        fn = config['parkour']['userList']
-    except KeyError:
-        print("Error: fetchFirstNameAndEmail\n\
-            No dictionary defined. \
-            Specify --toEmail and --toName explicitly!")
-        sys.exit(1)
-
-    if not os.path.exists(fn):
-        print("{} does not exist!".format(fn))
-        sys.exit(1)
-
-    f = open(fn)
-    d = dict()
-    for line in f:
-        cols = line.rstrip().split("\t")
-
-        # only accept format: firstName, lastName, email
-        if (len(cols) < 3):
-            continue
-
-        # ignore all other lastNames
-        if cols[1] != lastName:
-            continue
-
-        # check if lastName occurs more than once in list
-        if cols[1] in d:
-            print("Error: fetchFirstNameAndEmail\n\
-                Name {} exists more than once. \
-                Specify --toEmail and --toName explicitly!".format(cols[1]))
-            print('now: ', cols[1], cols[0], cols[2])
-            print('previous: ', cols[1], d[cols[1]])
-            sys.exit(1)
-
-        # add to dictionary
-        d[cols[1]] = [cols[0], cols[2]]
-    f.close()
-
-    if lastName not in d:
-        print("Error: fetchFirstNameAndEmail\n\
-            No Information for lastName={}. {} needs update".format(lastName, fn))
-        sys.exit(1)
-
-    return d[lastName]
+def getContactDetails(projectID, config):
+    """
+    Retrieve user data from a given sequencing request
+    """
+    res = requests.get(
+        config["parkour"]["URL"]
+        + "/api/requests/"
+        + projectID
+        + "/get_contact_details",
+        auth=(config["parkour"]["user"], config["parkour"]["password"]),
+        verify=config["parkour"]["cert"],
+    )
+    if res.status_code != 200:
+        raise RuntimeError(f"API error: {res.json()}")
+    return res.json()
 
 
 def getProjectIDs(projects, config):
@@ -62,7 +32,8 @@ def getProjectIDs(projects, config):
         # Sanity check
         assert (p.startswith("Project_"))
         IDs.append(p.split("_")[1])
-        PI = p.split("_")[-1].lower()
+        # compound surnames use minus, we use 1st only.
+        PI = p.split("_")[-1].split("-")[0].lower()
         # Get the actual sequencing_data dir
         # Assume if multiple projects are given, they all in the same flowcell.
         flowcell = getFlowCell()
@@ -164,10 +135,10 @@ def main():
         if not os.path.exists(p):
             sys.exit("Project folder {} not found.".format(p))
 
-    # get lastName (user) from project name
-    lastName = args.project[0].split("_")[2]
+    # get user from project name, lastName = args.project[0].split("_")[2]
     if not args.toEmail or not args.toName:
-        firstName, email = fetchFirstNameAndEmail(lastName, config)
+        my_dict = getContactDetails(args.project[0].split("_")[1], config)
+        firstName, email = my_dict["first_name"], my_dict["email"]
     else:
        firstName, email = args.toName, args.toEmail
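A usage sketch for getContactDetails, meant to be read next to the definition in the patch above: all identifiers are made up, a hand-built mapping stands in for the parsed dissectBCL.ini, and the JSON answer is assumed to carry at least first_name and email, as used in main().

# getContactDetails as defined in the patched emailProjectFinished.py above
config = {
    "parkour": {
        "URL": "https://parkour.domain.tld",
        "user": "parkourUser",
        "password": "parkourPw",
        "cert": "/path/to/cert.pem",
    }
}
projectID = "Project_1234_Doe".split("_")[1]   # -> "1234" (made-up project)
contact = getContactDetails(projectID, config)
firstName, email = contact["first_name"], contact["email"]
print(firstName, email)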
diff --git a/src/tools/prep_contaminome.py b/src/tools/prep_contaminome.py
index 914d86b..1accbc5 100644
--- a/src/tools/prep_contaminome.py
+++ b/src/tools/prep_contaminome.py
@@ -12,14 +12,16 @@
     'human': ['NC_012920.1'],  # human mito
     'mouse': ['NC_005089.1'],  # mouse mito
     'fly': ['NC_024511.2'],  # fly mito
-    'aedes-aegypti': ['NC_035159.1']  # aedes mito
+    'aedes-aegypti': ['NC_035159.1'],  # aedes mito
+    'zebrafish': ['NC_002333.2']  # zebrafish mito
 }
 
 rrna_mask = [
     ('human', 'humanrrna'),
     ('mouse', 'mouserrna'),
     ('fly', 'flyrrna'),
-    ('aedes-aegypti', 'aedesaegyptirrna')
+    ('aedes-aegypti', 'aedesaegyptirrna'),
+    ('zebrafish', 'zebrafishrrna')
 ]
 
 taxmap = {
@@ -68,11 +70,13 @@
     'humanrrna': [96061111, 9, 'species'],
     'mouserrna': [100901111, 10, 'species'],
     'aedesaegyptirrna': [71591111, 13, 'species'],
+    'zebrafishrrna': [79551111, 13, 'species'],
     'flyrrna': [72271111, 11, 'species'],
     'humanmito': [96062222, 9, 'species'],
     'mousemito': [100902222, 10, 'species'],
     'flymito': [72272222, 11, 'species'],
-    'aedesaegyptimito': [71592222, 13, 'species']
+    'aedesaegyptimito': [71592222, 13, 'species'],
+    'zebrafishmito': [79552222, 13, 'species']
 }
diff --git a/src/wd40/release.py b/src/wd40/release.py
index ebf30e5..9873a38 100644
--- a/src/wd40/release.py
+++ b/src/wd40/release.py
@@ -1,4 +1,6 @@
 import os
+import requests
+from subprocess import check_output
 import sys
 import glob
 from pathlib import Path
@@ -6,22 +8,12 @@
 
 
 def fetchLatestSeqDir(pref, PI, postfix):
-    globStr = os.path.join(
-        pref,
-        PI,
-        postfix + '*'
-    )
+    globStr = os.path.join(pref, PI, postfix + "*")
     if len(glob.glob(globStr)) == 1:
         return glob.glob(globStr)[0]
     else:
         maxFolder = 0
-        for seqDir in glob.glob(
-            os.path.join(
-                pref,
-                PI,
-                postfix + '*'
-            )
-        ):
+        for seqDir in glob.glob(os.path.join(pref, PI, postfix + "*")):
             try:
                 seqInt = int(seqDir[-1])
             except ValueError:
@@ -29,17 +21,13 @@
                 continue
             if seqInt > maxFolder:
                 maxFolder = seqInt
-        return (os.path.join(
-            pref,
-            PI,
-            postfix + str(maxFolder)
-        ))
+        return os.path.join(pref, PI, postfix + str(maxFolder))
 
 
 def fetchFolders(flowcellPath, piList, prefix, postfix):
     institute_PIs = piList
     flowcellPath = os.path.abspath(flowcellPath)
-    FID = flowcellPath.split('/')[-1]
+    FID = flowcellPath.split("/")[-1]
     projDic = {}
     try:
         int(FID[:6])
@@ -48,33 +36,26 @@
         sys.exit(
             "First 6 digits of flowcellpath don't convert to an int. Exiting."
         )
-    for projF in glob.glob(
-        os.path.join(
-            flowcellPath,
-            'Project_*'
-        )
-    ):
-        proj = projF.split('/')[-1]
+    for projF in glob.glob(os.path.join(flowcellPath, "Project_*")):
+        proj = projF.split("/")[-1]
         PI = proj.split("_")[-1].lower()
-        if PI == 'cabezas-wallscheid':
-            PI = 'cabezas'
+        if PI == "cabezas-wallscheid":
+            PI = "cabezas"
         if PI in institute_PIs:
             seqFolder = fetchLatestSeqDir(prefix, PI, postfix)
-            if os.path.exists(
-                os.path.join(seqFolder, FID)
-            ):
+            if os.path.exists(os.path.join(seqFolder, FID)):
                 projDic[proj] = [
-                    PI + 'grp',
+                    PI + "grp",
                     [
                         os.path.join(seqFolder, FID),
                         os.path.join(seqFolder, FID, proj),
-                        os.path.join(seqFolder, FID, 'FASTQC_' + proj),
+                        os.path.join(seqFolder, FID, "FASTQC_" + proj),
                         os.path.join(
                             seqFolder,
                             FID,
-                            'Analysis_' + proj.replace('Project_', '')
-                        )
-                    ]
+                            "Analysis_" + proj.replace("Project_", ""),
+                        ),
+                    ],
                 ]
             else:
                 print(
@@ -107,12 +88,8 @@
         succes_fqc = release_rights(fastqcF, grp)
     if os.path.exists(analysisF):
         succes_analysis = release_rights(analysisF, grp)
-        return (
-            [succes_project, succes_fqc, succes_analysis]
-        )
-    return (
-        [succes_project, succes_fqc]
-    )
+        return [succes_project, succes_fqc, succes_analysis]
+    return [succes_project, succes_fqc]
 
 
 def release_rights(F, grp):
@@ -122,10 +99,7 @@
     for r, dirs, files in os.walk(F):
         for d in dirs:
             try:
-                os.chmod(
-                    os.path.join(r, d),
-                    0o750
-                )
+                os.chmod(os.path.join(r, d), 0o750)
                 changed += 1
             except PermissionError:
                 print("Permission error for {}".format(d))
@@ -136,10 +110,7 @@
             if grp != Path(fil).group():
                 grouperror = True
             try:
-                os.chmod(
-                    fil,
-                    0o750
-                )
+                os.chmod(fil, 0o750)
                 changed += 1
             except PermissionError:
                 print("Permission error for {}".format(f))
@@ -151,37 +122,75 @@
             F
         )
     )
-    return (successRate)
+    return successRate
 
 
-def rel(flowcellPath, piList, prefix, postfix):
-    projDic = fetchFolders(
-        flowcellPath,
-        piList,
-        prefix,
-        postfix
-    )
+def rel(
+    flowcellPath,
+    piList,
+    prefix,
+    postfix,
+    parkourURL,
+    parkourAuth,
+    parkourCert,
+    fexBool,
+    fromAddress,
+):
+    projDic = fetchFolders(flowcellPath, piList, prefix, postfix)
     print("Print number of changed/(changed+unchanged)!")
     for proj in projDic:
-        '''
+        """
         every projDic[proj] is a nested list of:
         [grp, [flowcell, project, fastqc]]
-        '''
+        """
         successes = release_folder(projDic[proj][0], projDic[proj][1])
         if len(successes) == 2:
             print(
                 "[green]Project[/green] {},{} proj,{} fqc".format(
-                    proj,
-                    successes[0],
-                    successes[1]
+                    proj, successes[0], successes[1]
                 )
             )
         else:
             print(
                 "[green]Project[/green] {},{} proj,{} fqc,{} analysis".format(
-                    proj,
-                    successes[0],
-                    successes[1],
-                    successes[2]
+                    proj, successes[0], successes[1], successes[2]
                 )
             )
+        projectPath = projDic[proj][1][1].split("/")[-1]
+        PI = (
+            projectPath.split("_")[-1]
+            .lower()
+            .replace("cabezas-wallscheid", "cabezas")
+        )
+        d = None
+        if PI in piList:
+            d = {
+                "data": projDic[proj][1][1],
+                "metadata": projDic[proj][1][1] + "/multiqc_report.html",
+            }
+        elif fexBool:
+            fexList = (
+                check_output(["fexsend", "-l", fromAddress])
+                .decode("utf-8")
+                .replace("\n", " ")
+                .split(" ")
+            )
+            tar_lane, tar_proj = projDic[proj][1][1].split("/")[-2:]
+            tarBall = tar_lane + "_" + tar_proj + ".tar"
+            if tarBall in fexList:
+                d = {"data": tarBall, "metadata": None}
+            else:
+                print("fexLink: ", tarBall, " not found!")
+        if d:
+            print(
+                "Adding filepaths to Parkour2:",
+                requests.post(
+                    parkourURL
+                    + "/api/requests/"
+                    + proj.split("_")[1]
+                    + "/put_filepaths/",
+                    auth=parkourAuth,
+                    data=d,
+                    verify=parkourCert,
+                ),
+            )  # print the returned answer from the API
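For clarity, the payload rel() now registers via put_filepaths, reduced to a dry-run sketch with placeholder paths and IDs; nothing is posted here, and the in-house versus fexsend branch is only mimicked.

FID = "230101_A00000_0001_AXYZXYZXY"  # made-up flowcell folder name
proj = "Project_1234_Doe"             # made-up project folder name
projPath = "/data/doe/sequencing_data/" + FID + "/" + proj
inHouse = True  # False -> the project left the institute via fexsend

if inHouse:
    d = {"data": projPath, "metadata": projPath + "/multiqc_report.html"}
else:
    d = {"data": FID + "_" + proj + ".tar", "metadata": None}

endpoint = (
    "https://parkour.domain.tld/api/requests/"
    + proj.split("_")[1]
    + "/put_filepaths/"
)
print(endpoint, d)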
diff --git a/src/wd40/wd40.py b/src/wd40/wd40.py
index 017cfaa..6789cfc 100644
--- a/src/wd40/wd40.py
+++ b/src/wd40/wd40.py
@@ -73,6 +73,14 @@ def cli(ctx, configpath, debug):
     ctx.obj['postfixDir'] = cnf['Internals']['seqDir']
     ctx.obj['fastqDir'] = cnf['Dirs']['outputDir']
     ctx.obj['solDir'] = cnf['Dirs']['baseDir']
+    ctx.obj['parkourURL'] = cnf['parkour']['URL']
+    ctx.obj['parkourAuth'] = (
+        cnf['parkour']['user'],
+        cnf['parkour']['password']
+    )
+    ctx.obj['parkourCert'] = cnf['parkour']['cert']
+    ctx.obj['fexBool'] = cnf['Internals'].getboolean('fex')
+    ctx.obj['fromAddress'] = cnf['communication']['fromAddress']
 
 
 @cli.command()
@@ -88,7 +96,12 @@ def rel(ctx, flowcell):
         flowcell,
         ctx.obj['piList'],
         ctx.obj['prefixDir'],
-        ctx.obj['postfixDir']
+        ctx.obj['postfixDir'],
+        ctx.obj['parkourURL'],
+        ctx.obj['parkourAuth'],
+        ctx.obj['parkourCert'],
+        ctx.obj['fexBool'],
+        ctx.obj['fromAddress']
     )
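With these changes, wd40 pulls everything it needs for the release step from the shared ini. A quick sketch, with a placeholder path, of the keys the CLI block above reads before 'wd40 rel' can push file paths to Parkour2.

import configparser

cnf = configparser.ConfigParser()
cnf.read("/path/to/dissectBCL.ini")  # placeholder path

parkourURL = cnf["parkour"]["URL"]
parkourAuth = (cnf["parkour"]["user"], cnf["parkour"]["password"])
parkourCert = cnf["parkour"]["cert"]
fexBool = cnf["Internals"].getboolean("fex")
fromAddress = cnf["communication"]["fromAddress"]
print(parkourURL, parkourAuth[0], parkourCert, fexBool, fromAddress)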