Skip to content

Commit

Permalink
More systematic handling of missing values across functions.
Browse files Browse the repository at this point in the history
In particular, NumPy's masked constant is now considered to be missing,
along with the usual None constant.
  • Loading branch information
LTLA committed Nov 7, 2023
1 parent 47aba98 commit c7a1058
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 23 deletions.
8 changes: 5 additions & 3 deletions src/biocutils/Factor.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,11 @@ def __init__(self, codes: Sequence[int], levels: Sequence[str], ordered: bool =
Args:
codes:
Sequence of codes. Each value should be a non-negative integer
that refers to an entry in ``levels``. Negative or None entries
are assumed to refer to missing values.
Sequence of codes. Each valid code should be a non-negative
integer that refers to an entry in ``levels``. Codes may be
negative or correspond to a missing scalar (as defined by
:py:meth:`~biocutils.is_missing_scalar.is_missing_scalar`),
in which case they are assumed to represent missing values.
levels:
List of levels containing unique strings.
Expand Down
24 changes: 14 additions & 10 deletions src/biocutils/intersect.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,26 @@
from typing import Sequence

from .map_to_index import DUPLICATE_METHOD
from .is_missing_scalar import is_missing_scalar


def intersect(*x: Sequence, duplicate_method: DUPLICATE_METHOD = "first") -> list:
"""Identify the intersection of values in multiple sequences, while preserving the order of values in the first
sequence.
"""
Identify the intersection of values in multiple sequences, while preserving
the order of values in the first sequence.
Args:
x (Sequence):
Zero, one or more sequences of interest.
x:
Zero, one or more sequences of interest containing hashable values.
We ignore missing values as defined by
:py:meth:`~biocutils.is_missing_scalar.is_missing_scalar`.
duplicate_method (DUPLICATE_METHOD):
duplicate_method:
Whether to keep the first or last occurrence of duplicated values
when preserving order in the first sequence.
Returns:
list: Intersection of values across all ``x``. None values are ignored.
Intersection of values across all ``x``.
"""
nargs = len(x)
if nargs == 0:
Expand All @@ -29,7 +33,7 @@ def intersect(*x: Sequence, duplicate_method: DUPLICATE_METHOD = "first") -> lis
output = []

def handler(f):
if f is not None and f not in present:
if not is_missing_scalar(f) and f not in present:
output.append(f)
present.add(f)

Expand All @@ -50,12 +54,12 @@ def handler(f):
# single sequence.
occurrences = {}
for f in first:
if f is not None and f not in occurrences:
if not is_missing_scalar(f) and f not in occurrences:
occurrences[f] = [1, 0]

for i in range(1, nargs):
for f in x[i]:
if f is not None and f in occurrences:
if not is_missing_scalar(f) and f in occurrences:
state = occurrences[f]
if state[1] < i:
state[0] += 1
Expand All @@ -65,7 +69,7 @@ def handler(f):
output = []

def handler(f):
if f is not None and f in occurrences:
if not is_missing_scalar(f) and f in occurrences:
state = occurrences[f]
if state[0] == nargs and state[1] >= 0:
output.append(f)
Expand Down
1 change: 1 addition & 0 deletions src/biocutils/is_missing_scalar.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ def is_missing_scalar(x) -> bool:
Args:
x:
Any scalar value.
Returns:
Whether ``x`` is None or a NumPy masked constant.
"""
Expand Down
12 changes: 9 additions & 3 deletions src/biocutils/map_to_index.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
from typing import Literal, Sequence

from .is_missing_scalar import is_missing_scalar


DUPLICATE_METHOD = Literal["first", "last"]


def map_to_index(x: Sequence, duplicate_method: DUPLICATE_METHOD = "first") -> dict:
"""Create a dictionary to map the values of a sequence to its positional indices.
"""
Create a dictionary to map values of a sequence to positional indices.
Args:
x (Sequence): Sequence of hashable values.
x:
Sequence of hashable values. We ignore missing values defined by
:py:meth:`~biocutils.is_missing_scalar.is_missing_scalar`.
duplicate_method (DUPLICATE_METHOD): Whether to consider the first or
last occurrence of a duplicated value in ``x``.
Expand All @@ -19,7 +25,7 @@ def map_to_index(x: Sequence, duplicate_method: DUPLICATE_METHOD = "first") -> d

mapping = {}
for i, val in enumerate(x):
if val is not None:
if not is_missing_scalar(val):
if not first_tie or val not in mapping:
mapping[val] = i

Expand Down
18 changes: 11 additions & 7 deletions src/biocutils/union.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,28 @@
from typing import Sequence

from .map_to_index import DUPLICATE_METHOD
from .is_missing_scalar import is_missing_scalar


def union(*x: Sequence, duplicate_method: DUPLICATE_METHOD = "first") -> list:
"""Identify the union of values in multiple sequences, while preserving the order of the first (or last) occurence
of each value.
"""
Identify the union of values in multiple sequences, while preserving the
order of the first (or last) occurrence of each value.
Args:
x (Sequence):
Zero, one or more sequences of interest.
x:
Zero, one or more sequences of interest containing hashable values.
We ignore missing values as defined by
:py:meth:`~biocutils.is_missing_scalar.is_missing_scalar`.
duplicate_method (DUPLICATE_METHOD):
duplicate_method:
Whether to take the first or last occurrence of each value in the
ordering of the output. If first, the first occurrence in the
earliest sequence of ``x`` is reported; if last, the last
occurrence in the latest sequence of ``x`` is reported.
Returns:
list: Union of values across all ``x``. None values are ignored.
Union of values across all ``x``.
"""

nargs = len(x)
Expand All @@ -29,7 +33,7 @@ def union(*x: Sequence, duplicate_method: DUPLICATE_METHOD = "first") -> list:
present = set()

def handler(f):
if f is not None and f not in present:
if not is_missing_scalar(f) and f not in present:
output.append(f)
present.add(f)

Expand Down

0 comments on commit c7a1058

Please sign in to comment.