Fleshed out sanity checks for classify/annotate functions.

This mostly involves checking that they behave as expected when given subsets of features, since feature manipulation is most of what the Python code does. We also fix the random seed in the sanity checks so that they cannot fail stochastically.
SingleR-inc · Dec 15, 2024 · a54e0e6 · a54e0e6
1 parent 311846f
commit a54e0e6
Show file tree

Hide file tree

Showing 4 changed files with 130 additions and 2 deletions.
diff --git a/tests/test_annotate_integrated.py b/tests/test_annotate_integrated.py
@@ -1,8 +1,9 @@
 import singler
 import numpy
+import biocutils
 
 
-def test_annotate_integrated():
+def test_annotate_integrated_basic():
     all_features = [str(i) for i in range(10000)]
 
     ref1 = numpy.random.rand(8000, 10)
@@ -28,3 +29,54 @@ def test_annotate_integrated():
     assert set(single_results[0].column("best")) == set(labels1)
     assert set(single_results[1].column("best")) == set(labels2)
     assert set(integrated_results.column("best_reference")) == set([0, 1])
+
+
+def test_annotate_integrated_sanity():
+    numpy.random.seed(6969) # ensure we don't get surprised by different results.
+
+    ref1 = numpy.random.rand(1000, 10)  # reference 0: 1000 rows x 10 columns
+    ref2 = numpy.random.rand(1000, 20)  # reference 1: 1000 rows x 20 columns
+    all_features = ["GENE_" + str(i) for i in range(ref1.shape[0])]  # one feature name per row
+
+    ref1[0:100,1:5] = 0  # rows 0-99 zeroed in columns 1-4 (all labelled "A" below)
+    ref1[200:300,6:10] = 0  # rows 200-299 zeroed in columns 6-9 (all labelled "C" below)
+    ref2[100:200,1:10] = 0  # rows 100-199 zeroed in columns 1-9 (all labelled "B" below)
+    ref2[200:300,11:20] = 0  # rows 200-299 zeroed in columns 11-19 (all labelled "C" below)
+
+    labels1 = ["A"] * 5 + ["C"] * 5  # labels for the 10 columns of ref1
+    labels2 = ["B"] * 10 + ["C"] * 10  # labels for the 20 columns of ref2
+
+    test = numpy.random.rand(1000, 20)
+    test[0:100,0:20:2] = 0  # even columns: zero in the same rows as the "A" columns of ref1
+    test[100:200,1:20:2] = 0  # odd columns: zero in the same rows as the "B" columns of ref2
+
+    single_results, integrated_results = singler.annotate_integrated(
+        test,
+        test_features=all_features,
+        ref_data=[ref1, ref2],
+        ref_labels=[labels1, labels2],
+        ref_features=[all_features, all_features],
+    )
+    assert integrated_results.column("best_label") == ["A", "B"] * 10  # even test columns -> "A", odd -> "B"
+    assert list(integrated_results.column("best_reference")) == [0, 1] * 10  # "A" is only in reference 0, "B" only in reference 1
+
+    # To mix it up a little, we're going to be taking every 2nd row (feature)
+    # of ref1 and every 3rd row (feature) of ref2, just to make sure that the
+    # slicing works as expected.
+    rkeep1 = list(range(0, ref1.shape[0], 2))
+    rkeep2 = list(range(0, ref2.shape[0], 3))
+    single_results2, integrated_results2 = singler.annotate_integrated(
+        test,
+        test_features=all_features,
+        ref_data=[
+            ref1[rkeep1,:],
+            ref2[rkeep2,:]
+        ],
+        ref_features=[
+            biocutils.subset_sequence(all_features, rkeep1),  # feature names subset with the same indices as the rows
+            biocutils.subset_sequence(all_features, rkeep2)
+        ],
+        ref_labels=[labels1, labels2],
+    )
+    assert list(integrated_results2.column("best_reference")) == [0, 1] * 10  # same expectation as with the full references
+    assert integrated_results2.column("best_label") == ["A", "B"] * 10
diff --git a/tests/test_annotate_single.py b/tests/test_annotate_single.py
@@ -1,8 +1,11 @@
 import singler
 import numpy
+import biocutils
 
 
 def test_annotate_single_sanity():
+    numpy.random.seed(123456) # ensure we don't get surprised by different results.
+
     ref = numpy.random.rand(10000, 10) + 1
     ref[:2000, :2] = 0
     ref[2000:4000, 2:4] = 0
@@ -30,6 +33,22 @@ def test_annotate_single_sanity():
     assert output.shape[0] == 5
     assert output.column("best") == ["B", "D", "A", "E", "C"]
 
+    # To mix it up a little, we're going to be taking every 2nd row (feature)
+    # of the ref and every 3rd row (feature) of the test, just to make sure
+    # that the slicing works as expected.
+    rkeep = list(range(0, ref.shape[0], 2))
+    tkeep = list(range(0, ref.shape[0], 3))
+    output2 = singler.annotate_single(
+        test[tkeep,:],
+        test_features=biocutils.subset_sequence(all_features, tkeep),
+        ref_data=ref[rkeep,:],
+        ref_features=biocutils.subset_sequence(all_features, rkeep),
+        ref_labels=labels,
+    )
+
+    assert output2.shape[0] == 5
+    assert output2.column("best") == ["B", "D", "A", "E", "C"]
+
 
 def test_annotate_single_intersect():
     ref = numpy.random.rand(10000, 10)

diff --git a/tests/test_classify_integrated.py b/tests/test_classify_integrated.py
@@ -2,7 +2,7 @@
 import numpy
 
 
-def test_classify_integrated():
+def test_classify_integrated_basic():
     all_features = [str(i) for i in range(10000)]
     test_features = [all_features[i] for i in range(0, 10000, 2)]
     test_set = set(test_features)
@@ -48,3 +48,53 @@ def test_classify_integrated():
             assert results1.column("best")[i] in labels1_set
         else:
             assert results2.column("best")[i] in labels2_set
+
+    # Same results in parallel.
+    presults = singler.classify_integrated(
+        test,
+        results=[results1, results2],
+        integrated_prebuilt=integrated,
+        num_threads = 2
+    )
+
+    assert presults.column("best_label") == results.column("best_label")
+    assert (presults.column("best_reference") == results.column("best_reference")).all()
+    assert (presults.column("delta") == results.column("delta")).all()
+
+
+def test_classify_integrated_sanity():
+    numpy.random.seed(42) # ensure we don't get surprised by different results.
+
+    ref1 = numpy.random.rand(1000, 10)  # reference 0: 1000 rows x 10 columns
+    ref2 = numpy.random.rand(1000, 20)  # reference 1: 1000 rows x 20 columns
+    all_features = ["GENE_" + str(i) for i in range(ref1.shape[0])]  # one feature name per row
+
+    ref1[0:100,1:5] = 0  # rows 0-99 zeroed in columns 1-4 (all labelled "A" below)
+    ref1[200:300,6:10] = 0  # rows 200-299 zeroed in columns 6-9 (all labelled "C" below)
+    ref2[100:200,1:10] = 0  # rows 100-199 zeroed in columns 1-9 (all labelled "B" below)
+    ref2[200:300,11:20] = 0  # rows 200-299 zeroed in columns 11-19 (all labelled "C" below)
+
+    lab1 = ["A"] * 5 + ["C"] * 5  # labels for the 10 columns of ref1
+    lab2 = ["B"] * 10 + ["C"] * 10  # labels for the 20 columns of ref2
+
+    test = numpy.random.rand(1000, 20)
+    test[0:100,0:20:2] = 0  # even columns: zero in the same rows as the "A" columns of ref1
+    test[100:200,1:20:2] = 0  # odd columns: zero in the same rows as the "B" columns of ref2
+
+    train1 = singler.train_single(ref1, lab1, all_features)  # train and classify against each reference separately
+    pred1 = singler.classify_single(test, train1)
+    train2 = singler.train_single(ref2, lab2, all_features)
+    pred2 = singler.classify_single(test, train2)
+
+    integrated = singler.train_integrated(
+        all_features,
+        ref_prebuilt=[train1, train2],  # list order defines reference indices 0 and 1 asserted below
+    )
+    results = singler.classify_integrated(
+        test,
+        results=[pred1, pred2],  # per-reference predictions, in the same order as ref_prebuilt
+        integrated_prebuilt=integrated,
+    )
+
+    assert results.column("best_label") == ["A", "B"] * 10  # even test columns -> "A", odd -> "B"
+    assert list(results.column("best_reference")) == [0, 1] * 10  # "A" is only in reference 0, "B" only in reference 1
diff --git a/tests/test_classify_single.py b/tests/test_classify_single.py
@@ -18,8 +18,15 @@ def test_classify_single_simple():
     for x in output.column("best"):
         assert x in all_names
 
+    # Same results in parallel.
+    poutput = singler.classify_single(test, built, num_threads = 2)
+    assert output.column("best") == poutput.column("best")
+    assert (output.column("delta") == poutput.column("delta")).all()
+
 
 def test_classify_single_sanity():
+    numpy.random.seed(69)
+
     ref = numpy.random.rand(10000, 10) + 1
     ref[:2000, :2] = 0
     ref[2000:4000, 2:4] = 0