From 6c8d32ed60a72234173334121ce5f4f45554de2f Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 14 Aug 2024 13:23:35 -0400 Subject: [PATCH] Use an appropriate NumPy dtype based on the number of unique sequences (#118) Add tests to check the `dtype` assignment. --- src/genomicranges/GenomicRanges.py | 12 +++++++--- tests/test_gr_init_seqnames.py | 37 ++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 3 deletions(-) create mode 100644 tests/test_gr_init_seqnames.py diff --git a/src/genomicranges/GenomicRanges.py b/src/genomicranges/GenomicRanges.py index 5023a59..9247436 100644 --- a/src/genomicranges/GenomicRanges.py +++ b/src/genomicranges/GenomicRanges.py @@ -220,9 +220,15 @@ def _sanitize_seqnames(self, seqnames, seqinfo): self._build_reverse_seqindex(seqinfo) if not isinstance(seqnames, np.ndarray): - seqnames = np.asarray( - [self._reverse_seqindex[x] for x in seqnames], dtype=np.int8 - ) + seqnames = np.asarray([self._reverse_seqindex[x] for x in seqnames]) + + num_uniq = len(np.unique(seqnames)) + if num_uniq < 2**8: + seqnames = seqnames.astype(np.int8) + elif num_uniq < 2**16: + seqnames = seqnames.astype(np.int16) + elif num_uniq < 2**32: + seqnames = seqnames.astype(np.int32) return seqnames diff --git a/tests/test_gr_init_seqnames.py b/tests/test_gr_init_seqnames.py new file mode 100644 index 0000000..18a8086 --- /dev/null +++ b/tests/test_gr_init_seqnames.py @@ -0,0 +1,37 @@ +import pytest +from genomicranges import GenomicRanges +from iranges import IRanges +from biocframe import BiocFrame +from random import random +import pandas as pd +import numpy as np + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +def test_create_gr(): + gr = GenomicRanges( + seqnames=["chr1"] * 10, + ranges=IRanges(start=range(100, 110), width=range(110, 120)), + ) + + assert gr is not None + assert gr._seqnames.dtype == np.int8 + + gr16 = GenomicRanges( + seqnames=[f"chr{i}" for i in range(500)], + ranges=IRanges(start=range(0, 500), width=range(10, 510)), + ) + + assert gr16 is not None + assert gr16._seqnames.dtype == np.int16 + + gr32 = GenomicRanges( + seqnames=[f"chr{i}" for i in range(2**16 + 1)], + ranges=IRanges(start=range(0, 2**16 + 1), width=range(10, 2**16 + 11)), + ) + + assert gr32 is not None + assert gr32._seqnames.dtype == np.int32