Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dsfdr numpy fix #312

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# calour changelog

## Version 2024.12.23
Bug Fixes:
* fix problem arising from inexact numpy matrix multiplication that resulted in more stringent dsFDR than requested when using the default 'meandiff' method
## Version 2024.8.31
New features:
* add shuffler optional parameter and improved data validation for analysis.correlation()
Expand Down
2 changes: 1 addition & 1 deletion calour/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@


__credits__ = "https://github.com/biocore/calour/graphs/contributors"
__version__ = "2024.8.31"
__version__ = "2024.12.23"

__all__ = ['read', 'read_amplicon', 'read_ms', 'read_qiime2',
'Experiment', 'AmpliconExperiment', 'MS1Experiment','mRNAExperiment',
Expand Down
61 changes: 42 additions & 19 deletions calour/dsfdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,30 +199,51 @@ def dsfdr(data, labels, transform_type='rankdata', method='meandiff',
raise ValueError('transform type %s not supported' % transform_type)

numbact = np.shape(data)[0]

numsamples = np.shape(data)[1]
labels = labels.copy()

numbact = np.shape(data)[0]
labels = labels.copy()
if len(labels) != numsamples:
raise ValueError('number of labels (%d) and data samples (%d) do not match' % (len(labels), numsamples))

logger.debug('start permutation')
logger.debug('starting permutations')
if method == 'meandiff':
# fast matrix multiplication based calculation
method = meandiff
tstat = method(data, labels)
t = np.abs(tstat)
numsamples = np.shape(data)[1]
p = np.zeros([numsamples, numperm])
k1 = 1 / np.sum(labels == 0)
k2 = 1 / np.sum(labels == 1)
for cperm in range(numperm):
labels = shuffler(labels)
p[labels == 0, cperm] = k1
p2 = np.ones(p.shape) * k2
p2[p > 0] = 0
mean1 = np.dot(data, p)
mean2 = np.dot(data, p2)
u = np.abs(mean1 - mean2)

# the first column is the original data
m0 = np.zeros([numsamples, numperm+1])

k0 = 1 / np.sum(labels == 0)
k1 = 1 / np.sum(labels == 1)
for cperm in range(numperm+1):
# keep the first column as the original data
if cperm > 0:
rlabels = shuffler(labels)
else:
rlabels = labels
m0[rlabels == 0, cperm] = k0
m1 = np.ones(m0.shape) * k1
m1[m0 > 0] = 0
mean0 = np.dot(data, m0)
mean1 = np.dot(data, m1)
u = mean1 - mean0

tstat = u[:, 0]

# fix floating point error from martix multiplication for all entries
# problem is that mean for permutations that should have same mean, is very close but not identical (due to numpy matrix multiplication)
# important for permutation values since we use ranking and need to ignore close values
# error is of the order of 1E-9
# so we normalize by the max value in each row and then round to 9 digits
max_vals = np.max(np.abs(u), axis=1)
if np.any(max_vals == 0):
raise ValueError('data contains features with 0 values in all samples')
u = u / max_vals[:,np.newaxis]
u = np.round(u, 9)

u = np.abs(u)
t = u[:, 0]
u = u[:, 1:]

elif method == 'mannwhitney' or method == \
'kruwallis' or method == 'stdmeandiff':
if method == 'mannwhitney':
Expand Down Expand Up @@ -337,6 +358,7 @@ def dsfdr(data, labels, transform_type='rankdata', method='meandiff',

# calculate FDR
if fdr_method == 'dsfdr':
logger.debug('calculating dsFDR')
# sort unique p-values for original test statistics biggest to smallest
pvals_unique = np.unique(pvals)
sortp = pvals_unique[np.argsort(-pvals_unique)]
Expand All @@ -358,6 +380,7 @@ def dsfdr(data, labels, transform_type='rankdata', method='meandiff',

if not foundit:
# no good threshold was found
logger.debug('no good threshold was found')
reject = np.repeat([False], numbact)
return reject, tstat, pvals, qvals

Expand Down
7 changes: 7 additions & 0 deletions calour/tests/test_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import unittest

import numpy as np
import pandas.testing as pdt

import calour as ca
from calour.analysis import diff_abundance, diff_abundance_paired
Expand Down Expand Up @@ -46,6 +47,12 @@ def test_diff_abundance(self):
self.assertEqual(len(dd.feature_metadata), 7)
for cid in expected_ids:
self.assertIn(self.test1.feature_metadata.index[cid], dd.feature_metadata.index)

def test_diff_abundance_fast_vs_slow(self):
dd_fast = diff_abundance(self.test1, 'group', val1='1', val2='2', random_seed=2017, method='meandiff')
dd_slow = diff_abundance(self.test1, 'group', val1='1', val2='2', random_seed=2017, method=ca.dsfdr.meandiff)
# assert whether the XX.feature_metadata DataFrames as equal
pdt.assert_frame_equal(dd_fast.feature_metadata, dd_slow.feature_metadata,check_like=True,check_exact=False, atol=1e-2)

def test_diff_abundance_alpha0(self):
# Test when we should get 0 features (setting FDR level to 0)
Expand Down