biocore · amnona · Dec 23, 2024 · Dec 24, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,8 @@
 # calour changelog
 
+## Version 2024.12.23
+Bug Fixes:
+* fix problem arising from inexact numpy matrix multiplication that resulted in more stringent dsFDR than requested when using the default 'meandiff' method
 ## Version 2024.8.31
 New features:
 * add shuffler optional parameter and improved data validation for analysis.correlation()

diff --git a/calour/__init__.py b/calour/__init__.py
@@ -19,7 +19,7 @@
 
 
 __credits__ = "https://github.com/biocore/calour/graphs/contributors"
-__version__ = "2024.8.31"
+__version__ = "2024.12.23"
 
 __all__ = ['read', 'read_amplicon', 'read_ms', 'read_qiime2',
            'Experiment', 'AmpliconExperiment', 'MS1Experiment','mRNAExperiment',

diff --git a/calour/dsfdr.py b/calour/dsfdr.py
@@ -199,30 +199,51 @@ def dsfdr(data, labels, transform_type='rankdata', method='meandiff',
         raise ValueError('transform type %s not supported' % transform_type)
 
     numbact = np.shape(data)[0]
-
+    numsamples = np.shape(data)[1]
     labels = labels.copy()
 
-    numbact = np.shape(data)[0]
-    labels = labels.copy()
+    if len(labels) != numsamples:
+        raise ValueError('number of labels (%d) and data samples (%d) do not match' % (len(labels), numsamples))
 
-    logger.debug('start permutation')
+    logger.debug('starting permutations')
     if method == 'meandiff':
         # fast matrix multiplication based calculation
-        method = meandiff
-        tstat = method(data, labels)
-        t = np.abs(tstat)
-        numsamples = np.shape(data)[1]
-        p = np.zeros([numsamples, numperm])
-        k1 = 1 / np.sum(labels == 0)
-        k2 = 1 / np.sum(labels == 1)
-        for cperm in range(numperm):
-            labels = shuffler(labels)
-            p[labels == 0, cperm] = k1
-        p2 = np.ones(p.shape) * k2
-        p2[p > 0] = 0
-        mean1 = np.dot(data, p)
-        mean2 = np.dot(data, p2)
-        u = np.abs(mean1 - mean2)
+
+        # the first column is the original data
+        m0 = np.zeros([numsamples, numperm+1])
+
+        k0 = 1 / np.sum(labels == 0)
+        k1 = 1 / np.sum(labels == 1)
+        for cperm in range(numperm+1):
+            # keep the first column as the original data
+            if cperm > 0:
+                rlabels = shuffler(labels)
+            else:
+                rlabels = labels
+            m0[rlabels == 0, cperm] = k0
+        m1 = np.ones(m0.shape) * k1
+        m1[m0 > 0] = 0
+        mean0 = np.dot(data, m0)
+        mean1 = np.dot(data, m1)
+        u = mean1 - mean0
+
+        tstat = u[:, 0]
+
+        # fix floating point error from martix multiplication for all entries
+        # problem is that mean for permutations that should have same mean, is very close but not identical (due to numpy matrix multiplication)
+        # important for permutation values since we use ranking and need to ignore close values
+        # error is of the order of 1E-9
+        # so we normalize by the max value in each row and then round to 9 digits
+        max_vals = np.max(np.abs(u), axis=1)
+        if np.any(max_vals == 0):
+            raise ValueError('data contains features with 0 values in all samples')
+        u = u / max_vals[:,np.newaxis]
+        u = np.round(u, 9)
+
+        u = np.abs(u)
+        t = u[:, 0]
+        u = u[:, 1:]
+
     elif method == 'mannwhitney' or method == \
                    'kruwallis' or method == 'stdmeandiff':
         if method == 'mannwhitney':
@@ -337,6 +358,7 @@ def dsfdr(data, labels, transform_type='rankdata', method='meandiff',
 
     # calculate FDR
     if fdr_method == 'dsfdr':
+        logger.debug('calculating dsFDR')
         # sort unique p-values for original test statistics biggest to smallest
         pvals_unique = np.unique(pvals)
         sortp = pvals_unique[np.argsort(-pvals_unique)]
@@ -358,6 +380,7 @@ def dsfdr(data, labels, transform_type='rankdata', method='meandiff',
 
         if not foundit:
             # no good threshold was found
+            logger.debug('no good threshold was found')
             reject = np.repeat([False], numbact)
             return reject, tstat, pvals, qvals
 

diff --git a/calour/tests/test_analysis.py b/calour/tests/test_analysis.py
@@ -9,6 +9,7 @@
 import unittest
 
 import numpy as np
+import pandas.testing as pdt
 
 import calour as ca
 from calour.analysis import diff_abundance, diff_abundance_paired
@@ -46,6 +47,12 @@ def test_diff_abundance(self):
         self.assertEqual(len(dd.feature_metadata), 7)
         for cid in expected_ids:
             self.assertIn(self.test1.feature_metadata.index[cid], dd.feature_metadata.index)
+
+    def test_diff_abundance_fast_vs_slow(self):
+        dd_fast = diff_abundance(self.test1, 'group', val1='1', val2='2', random_seed=2017, method='meandiff')
+        dd_slow = diff_abundance(self.test1, 'group', val1='1', val2='2', random_seed=2017, method=ca.dsfdr.meandiff)
+        # assert whether the XX.feature_metadata DataFrames as equal
+        pdt.assert_frame_equal(dd_fast.feature_metadata, dd_slow.feature_metadata,check_like=True,check_exact=False, atol=1e-2)
 
     def test_diff_abundance_alpha0(self):
         # Test when we should get 0 features (setting FDR level to 0)