Keeping dependencies up-to-date, clearing deprecations, bugfixes, and…

… better parkour integration (#183)
maxplanck-ie · Mar 27, 2024 · 9f11e76 · 9f11e76
2 parents 9475bbe + ea1f607
commit 9f11e76
Show file tree

Hide file tree

Showing 12 changed files with 190 additions and 148 deletions.
diff --git a/ChangeLog b/ChangeLog
@@ -1,10 +1,29 @@
 CHANGES
 =======
 
+* split up zebrafish contamination into mito - rrna - zebrafish
+* or it wasn't found on fexList // let's just omit info, code is the doc (?)
+* autoreformatted for flake8 rule
+* fixes
+* works in my server :P
+* FutureWarning: Calling int on a single element Series is deprecated and will raise a TypeError in the future
+* flake8 fix
+* URL should be complete
+* use 1 config param (URL) instead of 3
+* flake8 E302
+* get\_contact\_details API endpoint // deprecated userList text file
+* Lanesplit samples (#173)
+* make lanesplit check on sampleIDs rather than samplenames, and lane aware
+* Mycoplasma implement, email update (#172)
+* Contam emails (#171)
+* ChangeLog
 * actual seq\_data dir in email
 * mycoplasma include in prep
 * include mycoplasma hyorhinis
+* docs updates (#170)
+* auto version for docs, fix readthedocs yaml (#169)
 * auto version for docs, fix readthedocs yaml
+* Docs (#168)
 * update changelog
 * include authors
 * make sure doc pytest includes reqs from the doc folder

diff --git a/contaminome.yml b/contaminome.yml
@@ -162,6 +162,11 @@ rrna:
    vulgarname: aedesaegyptirrna
    accession: NC_035159.1
    taxid: 71591111
+  Zebrafish rRNA:
+    URL: https://raw.githubusercontent.com/WardDeb/customcontamination/main/fna/zebrarRNA.fna.gz
+    vulgarname: zebrafishrrna
+    accession: NR_145818.1
+    taxid: 79551111
 mito:
   Homo sapiens mitochondrion:
    URL: https://raw.githubusercontent.com/WardDeb/customcontamination/main/fna/humanmito.fna.gz
@@ -182,4 +187,9 @@ mito:
      URL: https://raw.githubusercontent.com/WardDeb/customcontamination/main/fna/aedesaegyptimito.fna.gz
      vulgarname: aedesaegyptimito
      accession: NC_035159.1
-     taxid: 71592222
+     taxid: 71592222
+  Zebrafish mitochondrion:
+     URL: https://raw.githubusercontent.com/WardDeb/customcontamination/main/fna/zebramito.fna.gz
+     vulgarname: zebrafishmito
+     accession: NC_002333.2
+     taxid: 79552222
diff --git a/dissectBCL.ini b/dissectBCL.ini
@@ -13,12 +13,10 @@ seqDir=seqfolderstr
 fex=False
 
 [parkour]
-pullURL=parkour.pull.url/api
-pushURL=parkour.push.url/api
 user=parkourUser
 password=parkourPw
 cert=/path/to/cert.pem
-userList=filename_with_parkour_users
+URL=parkour.domain.tld
 
 [software]
 bclconvert=/path/to/bclconvert
@@ -30,6 +28,7 @@ mpiImg=/path/to/multiqc_headerimg.jpg
 krakenExpl="<font size="2"> Kraken is used to classify the reads and to detect contamination. <br> For this we use a *custom* database, with a simplified taxonomical hierarchy (that no longer resembles any true taxonomical classification. <br> In brief, by default we screen for: <li><b>eukaryotes</b> (human, mouse, fly, mosquito, lamprey, medaka, c-elegans, yeast, zebrafish and the moss-piglet)</li> <li><b>prokaryotes</b> (Ecoli, pseudomonas, mycoplasma and haemophilus influenza)</li> <li><b>viruses</b> (sars-cov2, influenza A,B & C, norwalk virus, rhinoviruses, drosophila C virus, phiX and lambda phage )</li> <li><b>custom databases</b> (ERCC spikes, univec core DB)</li> Note that for human, mouse, fly and mosquito we scan for mitochondrial and ribosomal contamination separately). <br> Only the top (most abundant) five hits and unclassified hits are shown, all other hits are grouped under an 'other' tag.</font>"   
 
 [communication]
+subject=dissectBCL
 deepSeq=[email protected]
 bioinfoCore=[email protected]
 fromAddress=[email protected]

diff --git a/docs/config.rst b/docs/config.rst
@@ -63,14 +63,10 @@ parkour
 The *parkour block* contains all necessary information to communicate with `parkour <https://github.com/maxplanck-ie/parkour2>`_.
 Note that this block contains sensitive information.
 
-#. pullURL: the URL to pull flowcell information from. Is parkoururl/api/analysis_list/analysis_list
-#. pushURL: the URL to push flowcell statistics to. Is parkoururl/api/run_statistics/upload
 #. user: the username for API requests
 #. password: the password for API requests
 #. cert: the pem certificate for API requests
-#. userList: a headerless tsv file containing firstname lastname emailaddress lines.
-
-Note that the userList is used implicitly for the email command to notify end users.
+#. URL: the URL to Parkour2, e.g. `https://parkour.yourdomain.tld`.
 
 .. _software:
 
@@ -128,7 +124,7 @@ example
     user=parkourUser
     password=parkourPw
     cert=/path/to/cert.pem
-    userList=filename_with_parkour_users
+    URL=https://parkour.domain.tld
 
     [software]
     bclconvert=/path/to/bclconvert

diff --git a/src/dissectBCL/classes.py b/src/dissectBCL/classes.py
@@ -179,11 +179,18 @@ def decideSplit(self):
         laneSplitStatus = True
         # Do we need lane splitting or not ?
         # If there is at least one sample in more than 1 lane, we cannot split:
-        if sum(self.fullSS['Sample_Name'].value_counts() > 1) > 0:
-            logging.info(
-                "No lane splitting: >= 1 sample in multiple lanes."
-            )
-            laneSplitStatus = False
+        samples = list(self.fullSS['Sample_ID'].unique())
+        for _s in samples:
+            if len(
+                list(self.fullSS[
+                    self.fullSS['Sample_ID'] == _s
+                ]['Lane'].unique()
+                )
+            ) > 1:
+                logging.info(
+                    "No lane splitting: >= 1 sample in multiple lanes."
+                )
+                laneSplitStatus = False
         # If one project is split over multiple lanes, we also don't split:
         projects = list(self.fullSS['Sample_Project'].unique())
         for project in projects:

diff --git a/src/dissectBCL/drHouse.py b/src/dissectBCL/drHouse.py
@@ -142,7 +142,11 @@ def initClass(
     muxDF = pd.read_csv(muxPath)
     totalReads = int(muxDF['# Reads'].sum())
     if len(muxDF[muxDF['SampleID'] == 'Undetermined']) == 1:
-        undReads = int(muxDF[muxDF['SampleID'] == 'Undetermined']['# Reads'])
+        undReads = int(
+            muxDF[
+                muxDF['SampleID'] == 'Undetermined'
+            ]['# Reads'].iloc[0]
+        )
     else:
         undDic = dict(
             muxDF[

diff --git a/src/dissectBCL/fakeNews.py b/src/dissectBCL/fakeNews.py
@@ -48,7 +48,7 @@ def pullParkour(flowcellID, config):
     )
     d = {'flowcell_id': FID}
     res = requests.get(
-        config['parkour']['pullURL'],
+        config['parkour']['URL'] + '/api/analysis_list/analysis_list/',
         auth=(
             config['parkour']['user'],
             config['parkour']['password']
@@ -185,7 +185,7 @@ def pushParkour(flowcellID, sampleSheet, config, flowcellBase):
     d['matrix'] = json.dumps(list(laneDict.values()))
     logging.info("Pushing FID with dic {} {}".format(FID, d))
     pushParkStat = requests.post(
-        config.get("parkour", "pushURL"),
+        config.get("parkour", "URL") + '/api/run_statistics/upload/',
         auth=(
             config.get("parkour", "user"),
             config.get("parkour", "password")
@@ -360,7 +360,8 @@ def multiQC_yaml(config, flowcell, ssDic, project, laneFolder):
 
 def mailHome(subject, _html, config, toCore=False):
     mailer = MIMEMultipart('alternative')
-    mailer['Subject'] = '[dissectBCL] [{}] '.format(
+    mailer['Subject'] = '[{}] [{}] '.format(
+        config['communication']['subject'],
         version('dissectBCL')
     ) + subject
     mailer['From'] = config['communication']['fromAddress']
@@ -580,21 +581,30 @@ def organiseLogs(flowcell, sampleSheet):
                     mvFile
                 )
                 shutil.move(fileIn, fileOut)
+
         # Write out ssdf.
         outssdf = os.path.join(_logDir, 'sampleSheetdf.tsv')
         sampleSheet.ssDic[outLane]['sampleSheet'].to_csv(outssdf, sep='\t')
-        # Write out the yaml files.
-        yaml = ruamel.yaml.YAML()
-        yaml.indent(mapping=2, sequence=4, offset=2)
 
         # write out outLaneInfo.yaml
+        dic0 = sampleSheet.ssDic[outLane]
+        del dic0['sampleSheet']
+        yaml0 = ruamel.yaml.YAML()
+        yaml0.indent(mapping=2, sequence=4, offset=2)
         outLaneInfo = os.path.join(_logDir, 'outLaneInfo.yaml')
-        dic = sampleSheet.ssDic[outLane]
-        del dic['sampleSheet']
         with open(outLaneInfo, 'w') as f:
-            ruamel.yaml.dump(dic, f)
+            yaml0.dump(dic0, f)
+
+        # write out config.ini
+        dic1 = flowcell.asdict()
+        flowcellConfig = os.path.join(_logDir, 'config.ini')
+        with open(flowcellConfig, 'w') as f:
+            dic1['config'].write(f)
+
         # write out flowcellInfo.yaml
+        del dic1['config']
+        yaml1 = ruamel.yaml.YAML()
+        yaml1.indent(mapping=2, sequence=4, offset=2)
         flowcellInfo = os.path.join(_logDir, 'flowcellInfo.yaml')
-        dic = flowcell.asdict()
         with open(flowcellInfo, 'w') as f:
-            ruamel.yaml.dump(dic, f)
+            yaml1.dump(dic1, f)
diff --git a/src/dissectBCL/misc.py b/src/dissectBCL/misc.py
@@ -156,7 +156,7 @@ def hamming(s1, s2):
     # We have some basket cases (multimodal)
     # Where barcode is nan (type as float)
     # Ignore these for now.
-    if type(s1) == float or type(s2) == float:
+    if isinstance(s1, float) or isinstance(s2, float):
         return 0
     if s1 is None or s2 is None:
         return 0

diff --git a/src/tools/emailProjectFinished.py b/src/tools/emailProjectFinished.py
@@ -3,57 +3,27 @@
 import sys
 import smtplib
 import os
+import requests
 from dissectBCL.misc import getConf
 from email.mime.text import MIMEText
 import glob
 
 
-def fetchFirstNameAndEmail(lastName, config):
-    # search in dictionary file defined in config for lastName
-    try:
-        fn = config['parkour']['userList']
-    except KeyError:
-        print("Error: fetchFirstNameAndEmail\n\
-        No dictionary defined. \
-        Specify --toEmail and --toName explicitly!")
-        sys.exit(1)
-
-    if not os.path.exists(fn):
-        print("{} does not exist!".format(fn))
-        sys.exit(1)
-
-    f = open(fn)
-    d = dict()
-    for line in f:
-        cols = line.rstrip().split("\t")
-
-        # only accept format: firstName, lastName, email
-        if (len(cols) < 3):
-            continue
-
-        # ignore all other lastNames
-        if cols[1] != lastName:
-            continue
-
-        # check if lastName occurs more than once in list
-        if cols[1] in d:
-            print("Error: fetchFirstNameAndEmail\n\
-            Name {} exists more than once. \
-            Specify --toEmail and --toName explicitly!".format(cols[1]))
-            print('now:      ', cols[1], cols[0], cols[2])
-            print('previous: ', cols[1], d[cols[1]])
-            sys.exit(1)
-
-        # add to dictionary
-        d[cols[1]] = [cols[0], cols[2]]
-    f.close()
-
-    if lastName not in d:
-        print("Error: fetchFirstNameAndEmail\n\
-    No Information for lastName={}. {} needs update".format(lastName, fn))
-        sys.exit(1)
-
-    return d[lastName]
+def getContactDetails(projectID, config):
+    """
+    Retrieve user data from a given sequencing request
+    """
+    res = requests.get(
+        config["parkour"]["URL"]
+        + "/api/requests/"
+        + projectID
+        + "/get_contact_details",
+        auth=(config["parkour"]["user"], config["parkour"]["password"]),
+        verify=config["parkour"]["cert"],
+    )
+    if res.status_code != 200:
+        raise RuntimeError(f"API error: {res.json()}")
+    return res.json()
 
 
 def getProjectIDs(projects, config):
@@ -62,7 +32,8 @@ def getProjectIDs(projects, config):
         # Sanity check
         assert (p.startswith("Project_"))
         IDs.append(p.split("_")[1])
-        PI = p.split("_")[-1].lower()
+        # compound surnames use minus, we use 1st only.
+        PI = p.split("_")[-1].split("-")[0].lower()
     # Get the actual sequencing_data dir
     # Assume if multiple projects are given, they all in the same flowcell.
     flowcell = getFlowCell()
@@ -164,10 +135,10 @@ def main():
         if not os.path.exists(p):
             sys.exit("Project folder {} not found.".format(p))
 
-    # get lastName (user) from project name
-    lastName = args.project[0].split("_")[2]
+    # get user from project name, lastName = args.project[0].split("_")[2]
     if not args.toEmail or not args.toName:
-        firstName, email = fetchFirstNameAndEmail(lastName, config)
+        my_dict = getContactDetails(args.project[0].split("_")[1], config)
+        firstName, email = my_dict["first_name"], my_dict["email"]
     else:
         firstName, email = args.toName, args.toEmail
 

diff --git a/src/tools/prep_contaminome.py b/src/tools/prep_contaminome.py
@@ -12,14 +12,16 @@
     'human': ['NC_012920.1'],  # human mito
     'mouse': ['NC_005089.1'],  # mouse mito
     'fly': ['NC_024511.2'],  # fly mito
-    'aedes-aegypti': ['NC_035159.1']  # aedes mito
+    'aedes-aegypti': ['NC_035159.1'],  # aedes mito
+    'zebrafish': ['NC_002333.2']  # zebrafish mito
 }
 
 rrna_mask = [
     ('human', 'humanrrna'),
     ('mouse', 'mouserrna'),
     ('fly', 'flyrrna'),
-    ('aedes-aegypti', 'aedesaegyptirrna')
+    ('aedes-aegypti', 'aedesaegyptirrna'),
+    ('zebrafish', 'zebrafishrrna')
 ]
 
 taxmap = {
@@ -68,11 +70,13 @@
     'humanrrna': [96061111, 9, 'species'],
     'mouserrna': [100901111, 10, 'species'],
     'aedesaegyptirrna': [71591111, 13, 'species'],
+    'zebrafishrrna': [79551111, 13, 'species'],
     'flyrrna': [72271111, 11, 'species'],
     'humanmito': [96062222, 9, 'species'],
     'mousemito': [100902222, 10, 'species'],
     'flymito': [72272222, 11, 'species'],
-    'aedesaegyptimito': [71592222, 13, 'species']
+    'aedesaegyptimito': [71592222, 13, 'species'],
+    'zebrafishmito': [79552222, 13, 'species']
     }