From c7a1058977a1fa599cbfd602476af8d8373fa055 Mon Sep 17 00:00:00 2001 From: LTLA Date: Tue, 7 Nov 2023 13:15:34 -0800 Subject: [PATCH] More systematic handling of missing values across functions. In particular, NumPy's masked constant is now considered to be missing, along with the usual None constant. --- src/biocutils/Factor.py | 8 +++++--- src/biocutils/intersect.py | 24 ++++++++++++++---------- src/biocutils/is_missing_scalar.py | 1 + src/biocutils/map_to_index.py | 12 +++++++++--- src/biocutils/union.py | 18 +++++++++++------- 5 files changed, 40 insertions(+), 23 deletions(-) diff --git a/src/biocutils/Factor.py b/src/biocutils/Factor.py index d75c252..3c93759 100644 --- a/src/biocutils/Factor.py +++ b/src/biocutils/Factor.py @@ -32,9 +32,11 @@ def __init__(self, codes: Sequence[int], levels: Sequence[str], ordered: bool = Args: codes: - Sequence of codes. Each value should be a non-negative integer - that refers to an entry ``levels``. Negative or None entries - are assumed to refer to missing values. + Sequence of codes. Each valid code should be a non-negative + integer that refers to an entry ``levels``. Codes may be + negative or correspond to a missing scalar (as defined by + :py:meth:`~biocutils.is_missing_scalar.is_missing_scalar`), + in which case they are assumed to represent missing values. levels: List of levels containing unique strings. diff --git a/src/biocutils/intersect.py b/src/biocutils/intersect.py index aeb99e6..5de6c6e 100644 --- a/src/biocutils/intersect.py +++ b/src/biocutils/intersect.py @@ -1,22 +1,26 @@ from typing import Sequence from .map_to_index import DUPLICATE_METHOD +from .is_missing_scalar import is_missing_scalar def intersect(*x: Sequence, duplicate_method: DUPLICATE_METHOD = "first") -> list: - """Identify the intersection of values in multiple sequences, while preserving the order of values in the first - sequence. 
+ """ + Identify the intersection of values in multiple sequences, while preserving + the order of values in the first sequence. Args: - x (Sequence): - Zero, one or more sequences of interest. + x: + Zero, one or more sequences of interest containing hashable values. + We ignore missing values as defined by + :py:meth:`~biocutils.is_missing_scalar.is_missing_scalar`. - duplicate_method (DUPLICATE_METHOD): + duplicate_method: Whether to keep the first or last occurrence of duplicated values when preserving order in the first sequence. Returns: - list: Intersection of values across all ``x``. None values are ignored. + Intersection of values across all ``x``. """ nargs = len(x) if nargs == 0: @@ -29,7 +33,7 @@ def intersect(*x: Sequence, duplicate_method: DUPLICATE_METHOD = "first") -> lis output = [] def handler(f): - if f is not None and f not in present: + if not is_missing_scalar(f) and f not in present: output.append(f) present.add(f) @@ -50,12 +54,12 @@ def handler(f): # single sequence. occurrences = {} for f in first: - if f is not None and f not in occurrences: + if not is_missing_scalar(f) and f not in occurrences: occurrences[f] = [1, 0] for i in range(1, nargs): for f in x[i]: - if f is not None and f in occurrences: + if not is_missing_scalar(f) and f in occurrences: state = occurrences[f] if state[1] < i: state[0] += 1 @@ -65,7 +69,7 @@ def handler(f): output = [] def handler(f): - if f is not None and f in occurrences: + if not is_missing_scalar(f) and f in occurrences: state = occurrences[f] if state[0] == nargs and state[1] >= 0: output.append(f) diff --git a/src/biocutils/is_missing_scalar.py b/src/biocutils/is_missing_scalar.py index ea68aec..8392fdf 100644 --- a/src/biocutils/is_missing_scalar.py +++ b/src/biocutils/is_missing_scalar.py @@ -6,6 +6,7 @@ def is_missing_scalar(x) -> bool: Args: x: Any scalar value. + Returns: Whether ``x`` is None or a NumPy masked constant. 
""" diff --git a/src/biocutils/map_to_index.py b/src/biocutils/map_to_index.py index 3355d71..2611019 100644 --- a/src/biocutils/map_to_index.py +++ b/src/biocutils/map_to_index.py @@ -1,13 +1,19 @@ from typing import Literal, Sequence +from .is_missing_scalar import is_missing_scalar + + DUPLICATE_METHOD = Literal["first", "last"] def map_to_index(x: Sequence, duplicate_method: DUPLICATE_METHOD = "first") -> dict: - """Create a dictionary to map the values of a sequence to its positional indices. + """ + Create a dictionary to map values of a sequence to positional indices. Args: - x (Sequence): Sequence of hashable values. + x: + Sequence of hashable values. We ignore missing values defined by + :py:meth:`~biocutils.is_missing_scalar.is_missing_scalar`. duplicate_method (DUPLICATE_METHOD): Whether to consider the first or last occurrence of a duplicated value in ``x``. @@ -19,7 +25,7 @@ def map_to_index(x: Sequence, duplicate_method: DUPLICATE_METHOD = "first") -> d mapping = {} for i, val in enumerate(x): - if val is not None: + if not is_missing_scalar(val): if not first_tie or val not in mapping: mapping[val] = i diff --git a/src/biocutils/union.py b/src/biocutils/union.py index 5687582..af47778 100644 --- a/src/biocutils/union.py +++ b/src/biocutils/union.py @@ -1,24 +1,28 @@ from typing import Sequence from .map_to_index import DUPLICATE_METHOD +from .is_missing_scalar import is_missing_scalar def union(*x: Sequence, duplicate_method: DUPLICATE_METHOD = "first") -> list: - """Identify the union of values in multiple sequences, while preserving the order of the first (or last) occurence - of each value. + """ + Identify the union of values in multiple sequences, while preserving the + order of the first (or last) occurrence of each value. Args: - x (Sequence): - Zero, one or more sequences of interest. + x: + Zero, one or more sequences of interest containing hashable values. 
+ We ignore missing values as defined by + :py:meth:`~biocutils.is_missing_scalar.is_missing_scalar`. - duplicate_method (DUPLICATE_METHOD): + duplicate_method: Whether to take the first or last occurrence of each value in the ordering of the output. If first, the first occurrence in the earliest sequence of ``x`` is reported; if last, the last occurrence in the latest sequence of ``x`` is reported. Returns: - list: Union of values across all ``x``. None values are ignored. + Union of values across all ``x``. """ nargs = len(x) @@ -29,7 +33,7 @@ def union(*x: Sequence, duplicate_method: DUPLICATE_METHOD = "first") -> list: present = set() def handler(f): - if f is not None and f not in present: + if not is_missing_scalar(f) and f not in present: output.append(f) present.add(f)