Skip to content

Commit

Permalink
Fleshed out sanity checks for classify/annotate functions.
Browse files Browse the repository at this point in the history
This mostly involves checking that they behave as expected for feature subsets,
given that feature manipulation is most of what the Python code does. We also
set the seed for sanity checks to ensure that they won't stochastically fail.
  • Loading branch information
LTLA committed Dec 15, 2024
1 parent 311846f commit a54e0e6
Show file tree
Hide file tree
Showing 4 changed files with 130 additions and 2 deletions.
54 changes: 53 additions & 1 deletion tests/test_annotate_integrated.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import singler
import numpy
import biocutils


def test_annotate_integrated():
def test_annotate_integrated_basic():
all_features = [str(i) for i in range(10000)]

ref1 = numpy.random.rand(8000, 10)
Expand All @@ -28,3 +29,54 @@ def test_annotate_integrated():
assert set(single_results[0].column("best")) == set(labels1)
assert set(single_results[1].column("best")) == set(labels2)
assert set(integrated_results.column("best_reference")) == set([0, 1])


def test_annotate_integrated_sanity():
    """Sanity-check ``annotate_integrated`` on data with a planted signal.

    Two random references and a random test matrix share the same 1000
    features.  Blocks of rows are zeroed in each matrix so that the test
    columns are expected to alternate between label "A" (reference 0) and
    label "B" (reference 1).  The same expectation is then re-checked after
    keeping only a subset of the rows of each reference, to exercise the
    feature-matching logic.
    """
    # Fix the seed so the assertions below cannot fail stochastically.
    numpy.random.seed(6969)

    # NOTE: the order of the rand() calls matters for reproducibility.
    num_genes = 1000
    ref1 = numpy.random.rand(num_genes, 10)
    ref2 = numpy.random.rand(num_genes, 20)
    all_features = [f"GENE_{i}" for i in range(num_genes)]

    # Column layout of the references: first half / second half per label.
    labels1 = ["A"] * 5 + ["C"] * 5
    labels2 = ["B"] * 10 + ["C"] * 10

    # Plant label-specific zero blocks in the references.
    # NOTE(review): the column slices start one past the first column of
    # each label group (e.g. 1:5 rather than 0:5) -- possibly inherited from
    # 1-based indexing; confirm whether that is intended.
    ref1[:100, 1:5] = 0
    ref1[200:300, 6:10] = 0
    ref2[100:200, 1:10] = 0
    ref2[200:300, 11:20] = 0

    # Even test columns get zeros in rows 0-99, odd columns in rows 100-199.
    test = numpy.random.rand(num_genes, 20)
    test[:100, ::2] = 0
    test[100:200, 1::2] = 0

    expected_labels = ["A", "B"] * 10
    expected_refs = [0, 1] * 10

    single_results, integrated_results = singler.annotate_integrated(
        test,
        test_features=all_features,
        ref_data=[ref1, ref2],
        ref_labels=[labels1, labels2],
        ref_features=[all_features, all_features],
    )
    assert integrated_results.column("best_label") == expected_labels
    assert list(integrated_results.column("best_reference")) == expected_refs

    # Now keep every 2nd row of ref1 and every 3rd row of ref2, so that the
    # references no longer share the test's feature set exactly; the calls
    # should still line everything up and give the same answer.
    rkeep1 = list(range(0, ref1.shape[0], 2))
    rkeep2 = list(range(0, ref2.shape[0], 3))
    sub_ref_data = [ref1[rkeep1, :], ref2[rkeep2, :]]
    sub_ref_features = [
        biocutils.subset_sequence(all_features, rkeep1),
        biocutils.subset_sequence(all_features, rkeep2),
    ]

    single_results2, integrated_results2 = singler.annotate_integrated(
        test,
        test_features=all_features,
        ref_data=sub_ref_data,
        ref_features=sub_ref_features,
        ref_labels=[labels1, labels2],
    )
    assert list(integrated_results2.column("best_reference")) == expected_refs
    assert integrated_results2.column("best_label") == expected_labels
19 changes: 19 additions & 0 deletions tests/test_annotate_single.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import singler
import numpy
import biocutils


def test_annotate_single_sanity():
numpy.random.seed(123456) # ensure we don't get surprised by different results.

ref = numpy.random.rand(10000, 10) + 1
ref[:2000, :2] = 0
ref[2000:4000, 2:4] = 0
Expand Down Expand Up @@ -30,6 +33,22 @@ def test_annotate_single_sanity():
assert output.shape[0] == 5
assert output.column("best") == ["B", "D", "A", "E", "C"]

# To mix it up a little, we're going to be taking every 2nd element of the
# ref and every 3rd element of the test, just to make sure that the slicing
# works as expected.
rkeep = list(range(0, ref.shape[0], 2))
tkeep = list(range(0, ref.shape[0], 3))
output2 = singler.annotate_single(
test[tkeep,:],
test_features=biocutils.subset_sequence(all_features, tkeep),
ref_data=ref[rkeep,:],
ref_features=biocutils.subset_sequence(all_features, rkeep),
ref_labels=labels,
)

assert output2.shape[0] == 5
assert output2.column("best") == ["B", "D", "A", "E", "C"]


def test_annotate_single_intersect():
ref = numpy.random.rand(10000, 10)
Expand Down
52 changes: 51 additions & 1 deletion tests/test_classify_integrated.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import numpy


def test_classify_integrated():
def test_classify_integrated_basic():
all_features = [str(i) for i in range(10000)]
test_features = [all_features[i] for i in range(0, 10000, 2)]
test_set = set(test_features)
Expand Down Expand Up @@ -48,3 +48,53 @@ def test_classify_integrated():
assert results1.column("best")[i] in labels1_set
else:
assert results2.column("best")[i] in labels2_set

# Same results in parallel.
presults = singler.classify_integrated(
test,
results=[results1, results2],
integrated_prebuilt=integrated,
num_threads = 2
)

assert presults.column("best_label") == results.column("best_label")
assert (presults.column("best_reference") == results.column("best_reference")).all()
assert (presults.column("delta") == results.column("delta")).all()


def test_classify_integrated_sanity():
    """Sanity-check ``classify_integrated`` on data with a planted signal.

    Two random references and a random test matrix share the same 1000
    features, with blocks of rows zeroed in each.  Each reference is trained
    and used for classification on its own, then the per-reference results
    are combined; the test columns are expected to alternate between label
    "A" (reference 0) and label "B" (reference 1).
    """
    # Fix the seed so the assertions below cannot fail stochastically.
    numpy.random.seed(42)

    # NOTE: the order of the rand() calls matters for reproducibility.
    num_genes = 1000
    ref1 = numpy.random.rand(num_genes, 10)
    ref2 = numpy.random.rand(num_genes, 20)
    all_features = [f"GENE_{i}" for i in range(num_genes)]

    # Column layout of the references: first half / second half per label.
    lab1 = ["A"] * 5 + ["C"] * 5
    lab2 = ["B"] * 10 + ["C"] * 10

    # Plant label-specific zero blocks in the references.
    # NOTE(review): the column slices start one past the first column of
    # each label group (e.g. 1:5 rather than 0:5) -- possibly inherited from
    # 1-based indexing; confirm whether that is intended.
    ref1[:100, 1:5] = 0
    ref1[200:300, 6:10] = 0
    ref2[100:200, 1:10] = 0
    ref2[200:300, 11:20] = 0

    # Even test columns get zeros in rows 0-99, odd columns in rows 100-199.
    test = numpy.random.rand(num_genes, 20)
    test[:100, ::2] = 0
    test[100:200, 1::2] = 0

    # Per-reference training and classification.
    train1 = singler.train_single(ref1, lab1, all_features)
    pred1 = singler.classify_single(test, train1)
    train2 = singler.train_single(ref2, lab2, all_features)
    pred2 = singler.classify_single(test, train2)

    # Combine the two references and pick the best one for each test column.
    integrated = singler.train_integrated(
        all_features,
        ref_prebuilt=[train1, train2],
    )
    results = singler.classify_integrated(
        test,
        results=[pred1, pred2],
        integrated_prebuilt=integrated,
    )

    expected_labels = ["A", "B"] * 10
    expected_refs = [0, 1] * 10
    assert results.column("best_label") == expected_labels
    assert list(results.column("best_reference")) == expected_refs
7 changes: 7 additions & 0 deletions tests/test_classify_single.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,15 @@ def test_classify_single_simple():
for x in output.column("best"):
assert x in all_names

# Same results in parallel.
poutput = singler.classify_single(test, built, num_threads = 2)
assert output.column("best") == poutput.column("best")
assert (output.column("delta") == poutput.column("delta")).all()


def test_classify_single_sanity():
numpy.random.seed(69)

ref = numpy.random.rand(10000, 10) + 1
ref[:2000, :2] = 0
ref[2000:4000, 2:4] = 0
Expand Down

0 comments on commit a54e0e6

Please sign in to comment.