DATAOPS-779: Fix undetermined percentage handler (#117)

* Add nan phix values support to %Undetermined handler
* Add .vscode to gitignore and bump version
* Add new miseq testdata with index reads
* Use MiSeq dataset with index in interop parser tests
* Remove MiSeqDemo
* Restructure mean phix calculation to avoid numpy warning
* Add pull requests as GHA trigger
* Make interop dict for Receiver in interop test
Molmed · Sep 12, 2024 · 5f91e1b
1 parent dfba84e
commit 5f91e1b
Show file tree

Hide file tree

Showing 46 changed files with 30,230 additions and 151 deletions.
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
@@ -1,6 +1,6 @@
 name: Run Unit Tests
 
-on: [push]
+on: [push, pull_request]
 
 jobs:
   build:

diff --git a/.gitignore b/.gitignore
@@ -8,3 +8,4 @@ dist/
 .python-env/
 .coverage
 .cache
+.vscode/
diff --git a/checkQC/__init__.py b/checkQC/__init__.py
@@ -1,2 +1,2 @@
 
-__version__ = "4.0.3"
+__version__ = "4.0.5-rc1"
diff --git a/checkQC/handlers/undetermined_percentage_handler.py b/checkQC/handlers/undetermined_percentage_handler.py
@@ -1,5 +1,6 @@
 
 from collections import defaultdict
+import numpy as np
 
 from checkQC.handlers.qc_handler import QCHandler, QCErrorFatal, QCErrorWarning
 from checkQC.parsers.stats_json_parser import StatsJsonParser
@@ -33,13 +34,12 @@ def collect(self, signal):
             self.phix_aligned[value["lane"]][value["read"]] = value["percent_phix"]
 
     def _compute_mean_percentage_phix_aligned_for_lanes(self):
-        lane_and_mean_percentage_phix_aliged = {}
+        lane_and_mean_percentage_phix_aligned = {}
         for lane, reads in self.phix_aligned.items():
-            mean = 0
-            for read, value in reads.items():
-                mean += value / len(reads)
-            lane_and_mean_percentage_phix_aliged[lane] = mean
-        return lane_and_mean_percentage_phix_aliged
+            reads_list = list(reads.values())
+            mean_phix = 0 if all(np.isnan(reads_list)) else np.nanmean(reads_list)
+            lane_and_mean_percentage_phix_aligned[lane] = mean_phix
+        return lane_and_mean_percentage_phix_aligned
 
     def check_qc(self):
 
@@ -74,13 +74,13 @@ def create_data_dict(value):
 
                 if self.error() != self.UNKNOWN and percentage_undetermined > compute_threshold(self.error()):
                     yield QCErrorFatal("The percentage of undetermined indexes was"
-                                       " to high on lane {}, it was: {:.2f}%".format(lane_nbr,
+                                       " too high on lane {}, it was: {:.2f}%".format(lane_nbr,
                                                                                      percentage_undetermined),
                                        ordering=lane_nbr,
                                        data=create_data_dict(self.error()))
                 elif self.warning() != self.UNKNOWN and percentage_undetermined > compute_threshold(self.warning()):
                     yield QCErrorWarning("The percentage of undetermined indexes was "
-                                         "to high on lane {}, it was: {:.2f}%".format(lane_nbr,
+                                         "too high on lane {}, it was: {:.2f}%".format(lane_nbr,
                                                                                       percentage_undetermined),
                                          ordering=lane_nbr,
                                          data=create_data_dict(self.warning()))

diff --git a/checkQC/parsers/interop_parser.py b/checkQC/parsers/interop_parser.py
@@ -148,11 +148,6 @@ def run(self):
         lanes = summary.lane_count()
 
         for lane in range(lanes):
-            # The interop library uses zero based indexing, 
-            #however most people uses read 1/2
-            # to denote the different reads, 
-            #this enumeration is used to transform from
-            # zero based indexing to this form. /JD 2017-10-27
             for read_nbr in range(summary.size()):
                 read = summary.at(read_nbr).at(lane)
                 error_rate = read.error_rate().mean()

diff --git a/tests/handlers/test_undetermined_percentage_handler.py b/tests/handlers/test_undetermined_percentage_handler.py
@@ -1,4 +1,5 @@
 import unittest
+import numpy as np
 
 from checkQC.handlers.undetermined_percentage_handler import UndeterminedPercentageHandler
 
@@ -17,9 +18,9 @@ def setUp(self):
 
         percentage_phix_key = "percent_phix"
         percentage_phix_value_lane_1_read_1 = {"lane": 1, "read": 1, "percent_phix": 1}
-        percentage_phix_value_lane_1_read_2 = {"lane": 1, "read": 2, "percent_phix": 1}
-        percentage_phix_value_lane_2_read_1 = {"lane": 2, "read": 1, "percent_phix": 1}
-        percentage_phix_value_lane_2_read_2 = {"lane": 2, "read": 2, "percent_phix": 1}
+        percentage_phix_value_lane_1_read_2 = {"lane": 1, "read": 2, "percent_phix": np.nan}
+        percentage_phix_value_lane_2_read_1 = {"lane": 2, "read": 1, "percent_phix": np.nan}
+        percentage_phix_value_lane_2_read_2 = {"lane": 2, "read": 2, "percent_phix": np.nan}
         undetermined_handler.collect((percentage_phix_key, percentage_phix_value_lane_1_read_1))
         undetermined_handler.collect((percentage_phix_key, percentage_phix_value_lane_1_read_2))
         undetermined_handler.collect((percentage_phix_key, percentage_phix_value_lane_2_read_1))
@@ -37,7 +38,7 @@ def test_all_is_fine(self):
         self.assertEqual(errors_and_warnings, [])
 
     def test_warning(self):
-        qc_config = {'name': 'UndeterminedPercentageHandler', 'error': 2, 'warning': 1}
+        qc_config = {'name': 'UndeterminedPercentageHandler', 'error': 3, 'warning': 1}
         self.set_qc_config(qc_config)
         errors_and_warnings = list(self.undetermined_handler.check_qc())
         self.assertEqual(len(errors_and_warnings), 2)

diff --git a/tests/parsers/test_interop_parser.py b/tests/parsers/test_interop_parser.py
@@ -15,93 +15,105 @@ class TestInteropParser(unittest.TestCase):
 
     class Receiver(object):
         def __init__(self):
-            self.error_rate_values = []
-            self.percent_q30_values = []
-            self.percent_q30_per_cycle = []
+            self.metrics = {'error_rate': [],
+                            'percent_q30': [],
+                            'percent_q30_per_cycle': [],
+                            'percent_phix': [],
+                            }
             self.subscriber = self.subscribe()
             next(self.subscriber)
 
         def subscribe(self):
             while True:
                 interop_stat = yield
                 key = list(interop_stat)[0]
-                if key == "error_rate":
-                    self.error_rate_values.append(interop_stat)
-                if key == "percent_q30":
-                    self.percent_q30_values.append(interop_stat)
-                if key == "percent_q30_per_cycle":
-                    self.percent_q30_per_cycle.append(interop_stat)
+                self.metrics[key].append(interop_stat)
 
         def send(self, value):
             self.subscriber.send(value)
 
-    runfolder = os.path.join(os.path.dirname(__file__), "..", 
+    runfolder = os.path.join(os.path.dirname(__file__), "..",
                              "resources",
-                             "MiSeqDemo")
-    interop_parser = InteropParser(runfolder=runfolder, 
+                             "230825_M04034_0043_000000000-L6NVV")
+    interop_parser = InteropParser(runfolder=runfolder,
                                    parser_configurations=None)
     subscriber = Receiver()
     interop_parser.add_subscribers(subscriber)
     interop_parser.run()
 
     def test_read_error_rate(self):
-        self.assertListEqual(self.subscriber.error_rate_values,
-                             [('error_rate', 
-                                {'lane': 1, 
-                                 'read': 1, 
-                                 'error_rate': 1.5317546129226685}),
-                              ('error_rate',
-                                {'lane': 1,
-                                 'read': 2,
-                                 'error_rate': 1.9201501607894897})])
-
+        error_rates = [x[1]['error_rate'] for x in self.subscriber.metrics['error_rate']]
+        self.assertEqual(error_rates[0], 0.587182343006134)
+        self.assertTrue(np.isnan(error_rates[1]))
+        self.assertTrue(np.isnan(error_rates[2]))
+        self.assertEqual(error_rates[3], 0.8676796555519104)
+
+    def test_percent_phix(self):
+        phix = [x[1]['percent_phix'] for x in self.subscriber.metrics['percent_phix']]
+        self.assertEqual(phix[0], 15.352058410644531)
+        self.assertTrue(np.isnan(phix[1]))
+        self.assertTrue(np.isnan(phix[2]))
+        self.assertEqual(phix[3], 14.5081205368042)
 
     def test_percent_q30(self):
-        self.assertListEqual(self.subscriber.percent_q30_values,
-                             [('percent_q30', 
-                               {'lane': 1, 
-                                'read': 1, 
-                                'percent_q30': 93.42070007324219, 
+        self.assertListEqual(self.subscriber.metrics['percent_q30'],
+                             [('percent_q30',
+                               {'lane': 1,
+                                'read': 1,
+                                'percent_q30': 95.3010025024414,
                                 'is_index_read': False}),
-                              ('percent_q30', 
-                               {'lane': 1, 
-                                'read': 2, 
-                                'percent_q30': 84.4270248413086, 
+                              ('percent_q30',
+                               {'lane': 1,
+                                'read': 2,
+                                'percent_q30': 82.97042846679688,
+                                'is_index_read': True}),
+                              ('percent_q30',
+                               {'lane': 1,
+                                'read': 3,
+                                'percent_q30': 97.44789123535156,
+                                'is_index_read': True}),
+                              ('percent_q30',
+                               {'lane': 1,
+                                'read': 4,
+                                'percent_q30': 90.55824279785156,
                                 'is_index_read': False})])
-        
-    def test_percent_q30_per_cycle(self):
-        percent_q30_per_cycle = self.subscriber.percent_q30_per_cycle
+
+    def test_percent_q30_per_cycle_subscriber_output(self):
+        percent_q30_per_cycle = self.subscriber.metrics['percent_q30_per_cycle']
         self.assertEqual(percent_q30_per_cycle[0][1]['read'], 1)
         self.assertAlmostEqual(
             percent_q30_per_cycle[0][1]['percent_q30_per_cycle'][10],
-            98.41526794433594
+            96.68322,
+            places=5,
         )
 
         self.assertEqual(percent_q30_per_cycle[1][1]['read'], 2)
+        self.assertTrue(percent_q30_per_cycle[1][1]['is_index_read'])
         self.assertAlmostEqual(
-            percent_q30_per_cycle[1][1]['percent_q30_per_cycle'][10],
-            95.20341491699219
+            percent_q30_per_cycle[1][1]['percent_q30_per_cycle'][1],
+            80.69179,
+            places=5,
         )
-    
+
     def test_get_percent_q30_per_cycle(self):
         q_metrics = imaging(self.runfolder,
               valid_to_load=['Q'])
-        
+
         percent_q30_per_cycle = InteropParser.get_percent_q30_per_cycle(
                 q_metrics=q_metrics,
-                lane_nr=0, 
+                lane_nr=0,
                 read_nr=0,
                 is_index_read=False,
         )
 
         expected_out = {
-                6: 98.76343,
-                48: 97.841576,
-                90: 96.81421,
-                132: 95.90264,
-                174: 94.69448,
-                216: 91.90525,
-                258: 87.162094,
+                6: 97.17214,
+                18: 97.1332,
+                25: 97.38965,
+                50: 96.62786,
+                75: 96.30572,
+                100: 94.63465,
+                136: 92.64536,
         }
 
         #Select cycles from the expected_out-dict.
Original file line number	Diff line number	Diff line change
@@ -1,2 +1,2 @@
 
-__version__ = "4.0.3"
+__version__ = "4.0.5-rc1"