From ed9ef03a91ae82b8790edabd7734005d08b9a22a Mon Sep 17 00:00:00 2001 From: weiju Date: Mon, 11 Mar 2024 12:42:20 -0700 Subject: [PATCH] added a number of tests for preprocessing step --- miner/miner.py | 14 +++---- test/preprocess_test.py | 90 ++++++++++++++++++++--------------------- 2 files changed, 49 insertions(+), 55 deletions(-) diff --git a/miner/miner.py b/miner/miner.py index f71e360..e28adea 100644 --- a/miner/miner.py +++ b/miner/miner.py @@ -162,20 +162,16 @@ def save_response_content(response, destination): # Functions used for pre-processing data # ============================================================================= -def removeNullRows(df): - +def remove_null_rows(df): minimum = np.percentile(df,0) if minimum == 0: filteredDf = df.loc[df.sum(axis=1)>0,:] else: filteredDf = df - return filteredDf def convertToEnsembl(df,conversionTable,input_format=None): - from collections import Counter - # Index Conversion table on ENSG notation conversionTableEnsg = conversionTable.copy() conversionTableEnsg.index = conversionTableEnsg.iloc[:,0] @@ -554,7 +550,7 @@ def zscore(expressionData): print("completed z-transformation.") return transform -def correctBatchEffects(df, do_preprocess_tpm): +def correct_batch_effects(df, do_preprocess_tpm): zscoredExpression = zscore(df) means = [] @@ -573,8 +569,8 @@ def correctBatchEffects(df, do_preprocess_tpm): def preprocess(filename, mapfile, convert_ids=True, do_preprocess_tpm=True): rawExpression = readFileToDf(filename) - rawExpressionZeroFiltered = removeNullRows(rawExpression) - zscoredExpression = correctBatchEffects(rawExpressionZeroFiltered, do_preprocess_tpm) + rawExpressionZeroFiltered = remove_null_rows(rawExpression) + zscoredExpression = correct_batch_effects(rawExpressionZeroFiltered, do_preprocess_tpm) if convert_ids: expressionData, conversionTable = identifierConversion(zscoredExpression, mapfile) return expressionData, conversionTable @@ -582,7 +578,7 @@ def preprocess(filename, mapfile, convert_ids=True, do_preprocess_tpm=True): return zscoredExpression # ============================================================================= -# Functions used for clustering +# Functions used for clustering # ============================================================================= def pearson_array(array,vector): diff --git a/test/preprocess_test.py b/test/preprocess_test.py index 9c667ab..4ad88b9 100755 --- a/test/preprocess_test.py +++ b/test/preprocess_test.py @@ -3,65 +3,63 @@ import os import pytest -#import pandas as pd +import pandas as pd from miner import miner -""" -class PreprocessTest(unittest.TestCase): +def test_remove_null_rows_min_0_remove_ok(): + df = pd.DataFrame([[0, 1, 2], [1, 2, 3], [0, 0, 0], [4, 5, 6]]) + df2 = miner.remove_null_rows(df) + assert 3 == df2.shape[0], "wrong number of rows" - def test_remove_null_rows_min_0_remove_ok(self): - df = pd.DataFrame([[0, 1, 2], [1, 2, 3], [0, 0, 0], [4, 5, 6]]) - df2 = miner.remove_null_rows(df) - self.assertEqual(3, df2.shape[0], "wrong number of rows") +def test_remove_null_rows_min_0_unchanged(): + df = pd.DataFrame([[0, 1, 2], [1, 2, 3], [1, 0, 1], [4, 5, 6]]) + df2 = miner.remove_null_rows(df) + assert 4 == df2.shape[0], "wrong number of rows" - def test_remove_null_rows_min_0_unchanged(self): - df = pd.DataFrame([[0, 1, 2], [1, 2, 3], [1, 0, 1], [4, 5, 6]]) - df2 = miner.remove_null_rows(df) - self.assertEqual(4, df2.shape[0], "wrong number of rows") +def test_remove_null_rows_min_negative_unchanged(): + df = pd.DataFrame([[0, 1, -2], [1, 2, 3], [0, 0, 0], [4, 5, 6]]) + df2 = miner.remove_null_rows(df) + assert 4 == df2.shape[0], "wrong number of rows" - def test_remove_null_rows_min_negative_unchanged(self): - df = pd.DataFrame([[0, 1, -2], [1, 2, 3], [0, 0, 0], [4, 5, 6]]) - df2 = miner.remove_null_rows(df) - self.assertEqual(4, df2.shape[0], "wrong number of rows") +EPS = 0.001 +def test_correct_batch_effects_tpm(): + # large means to trigger the TPM function + df = pd.DataFrame([[4, 1, 2], [1, 2, 3], [4, 5, 6]]) + df2 = miner.correct_batch_effects(df, False) + assert (3, 3) == df2.shape + assert abs(df2.values[0, 0] - 1.0910894511799618) < EPS + assert abs(df2.values[1, 0] - (-1.0)) < EPS + assert abs(df2.values[2, 0] - (-1.0)) < EPS - def test_correct_batch_effects_tpm(self): - # large means to trigger the TPM function - df = pd.DataFrame([[4, 1, 2], [1, 2, 3], [4, 5, 6]]) - df2 = miner.correct_batch_effects(df, False) - self.assertEquals((3, 3), df2.shape) - self.assertAlmostEquals(df2.values[0, 0], 1.0910894511799618) - self.assertAlmostEquals(df2.values[1, 0], -1.0) - self.assertAlmostEquals(df2.values[2, 0], -1.0) + assert abs(df2.values[0, 1] - (-0.8728715609439697)) < EPS + assert abs(df2.values[1, 1] - 0.0) < EPS + assert abs(df2.values[2, 1] - 0.0) < EPS - self.assertAlmostEquals(df2.values[0, 1], -0.8728715609439697) - self.assertAlmostEquals(df2.values[1, 1], 0.0) - self.assertAlmostEquals(df2.values[2, 1], 0.0) + assert abs(df2.values[0, 2] - (-0.2182178902359925)) < EPS + assert abs(df2.values[1, 2] - 1.0) < EPS + assert abs(df2.values[2, 2] - 1.0) < EPS - self.assertAlmostEquals(df2.values[0, 2], -0.2182178902359925) - self.assertAlmostEquals(df2.values[1, 2], 1.0) - self.assertAlmostEquals(df2.values[2, 2], 1.0) - def test_correct_batch_effects_no_tpm(self): - # small means standard deviation - df = pd.DataFrame([[0.1, 0.1, 0.1], [0.1, 0.1, 0.1], [0.1, 0.1, 0.1]]) - df2 = miner.correct_batch_effects(df, False) - self.assertEquals((3, 3), df2.shape) - for i in range(3): - for j in range(3): - self.assertAlmostEquals(df2.values[i, j], -0.8164965809277261) +def test_correct_batch_effects_no_tpm(): + # small means standard deviation + df = pd.DataFrame([[0.1, 0.1, 0.1], [0.1, 0.1, 0.1], [0.1, 0.1, 0.1]]) + df2 = miner.correct_batch_effects(df, False) + assert (3, 3) == df2.shape + for i in range(3): + for j in range(3): + assert abs(df2.values[i, j] - (-0.8164965809277261)) < EPS - def test_preprocess_main_simple(self): - exp, conv_table = miner.preprocess('testdata/exp_data-001.csv', 'testdata/conv_table-001.tsv') - self.assertEquals((10, 3), exp.shape) - for i in range(3): - for j in range(3): - self.assertAlmostEquals(exp.values[i, j], -0.8164965809277261) -""" -def test_dummy(): - pass +""" +def test_preprocess_main_simple(): + exp, conv_table = miner.preprocess('testdata/exp_data-001.csv', 'testdata/conv_table-001.tsv') + assert (10, 3) == exp.shape + for i in range(3): + for j in range(3): + assert abs(exp.values[i, j] - (-0.8164965809277261)) < EPS +""" def test_has_testdir(): assert os.path.exists('miner_mindata')