Skip to content

Commit

Permalink
Add support for numpy2
Browse files Browse the repository at this point in the history
  • Loading branch information
kounelisagis committed May 16, 2024
1 parent 19bc046 commit 75a9969
Show file tree
Hide file tree
Showing 11 changed files with 76 additions and 46 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@ repos:
hooks:
- id: black
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: v0.0.284
rev: v0.4.4
hooks:
- id: ruff
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -72,5 +72,8 @@ extend-select = ["I001"]
extend-exclude = ["doc"]
fix = true

[tool.ruff.lint]
# select = ["NPY201"]  # Left disabled: with `fix = true`, enabling this rule makes ruff rewrite every deprecated NumPy 1.x name to its NumPy 2 replacement.

[tool.ruff.per-file-ignores]
"tiledb/__init__.py" = ["F401"]
12 changes: 2 additions & 10 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import sys
from ctypes import CDLL, POINTER, Structure, byref, c_char_p, c_int, c_void_p

from pkg_resources import resource_filename
import numpy as np
from pybind11.setup_helpers import Pybind11Extension
from setuptools import Extension, find_packages, setup

Expand Down Expand Up @@ -478,16 +478,8 @@ class build_ext(cython_build_ext):
"""

def build_extensions(self):
"""
Lazily append numpy's include directory to Extension includes.
This is done here rather than at module scope because setup.py
may be run before numpy has been installed, in which case
importing numpy and calling `numpy.get_include()` will fail.
"""
numpy_incl = resource_filename("numpy", "core/include")
for ext in self.extensions:
ext.include_dirs.append(numpy_incl)
ext.include_dirs.append(np.get_include())

find_or_install_libtiledb(self)

Expand Down
1 change: 1 addition & 0 deletions tiledb/common.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ from libc.stdio cimport FILE, stdout
from libc.stdlib cimport calloc, free, malloc
from libc.string cimport memcpy
from libcpp.vector cimport vector
from libcpp cimport bool as bool_t


cdef extern from "Python.h":
Expand Down
8 changes: 7 additions & 1 deletion tiledb/highlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,13 @@ def is_ndarray_like(arr):
elif shape and dtype:
if np.issubdtype(np.bytes_, dtype):
dtype = np.dtype("S")
elif np.issubdtype(dtype, np.unicode_):
elif (
np.lib.NumpyVersion(np.__version__) >= "2.0.0b1"
and np.issubdtype(dtype, np.str_)
) or (
np.lib.NumpyVersion(np.__version__) < "2.0.0b1"
and np.issubdtype(dtype, np.unicode_)
):
dtype = np.dtype("U")

ndim = len(shape)
Expand Down
48 changes: 30 additions & 18 deletions tiledb/libtiledb.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@ import io
import warnings
import collections.abc
from collections import OrderedDict
from json import dumps as json_dumps
from json import loads as json_loads
from json import dumps as json_dumps, loads as json_loads

from ._generated_version import version_tuple as tiledbpy_version
from .array_schema import ArraySchema
Expand All @@ -28,6 +27,7 @@ from .vfs import VFS

# https://docs.scipy.org/doc/numpy/reference/c-api.array.html#c.import_array
np.import_array()
np.set_printoptions(legacy='1.21') # use unified numpy printing

###############################################################################
# Utility/setup #
Expand All @@ -36,10 +36,9 @@ np.import_array()
# Integer types supported by Python / System
_inttypes = (int, np.integer)

# Numpy initialization code (critical)
# https://docs.scipy.org/doc/numpy/reference/c-api.array.html#c.import_array
np.import_array()

# Report whether the NumPy imported at runtime is version 2.0.0b1 or newer.
cdef bool_t has_numpy2():
    installed = np.lib.NumpyVersion(np.__version__)
    return installed >= '2.0.0b1'

cdef tiledb_ctx_t* safe_ctx_ptr(object ctx):
if ctx is None:
Expand Down Expand Up @@ -145,9 +144,11 @@ cdef _write_array(
if attr.isvar:
try:
if attr.isnullable:
if(np.issubdtype(attr.dtype, np.unicode_)
or np.issubdtype(attr.dtype, np.string_)
or np.issubdtype(attr.dtype, np.bytes_)):
if (
np.issubdtype(attr.dtype, np.bytes_) or
has_numpy2() and np.issubdtype(attr.dtype, np.str_) or
not has_numpy2() and (np.issubdtype(attr.dtype, np.unicode_) or np.issubdtype(attr.dtype, np.string_))
):
attr_val = np.array(["" if v is None else v for v in values[i]])
else:
attr_val = np.nan_to_num(values[i])
Expand Down Expand Up @@ -601,7 +602,13 @@ def index_domain_subarray(array: Array, dom, idx: tuple):
dim = dom.dim(r)
dim_dtype = dim.dtype

if array.mode == 'r' and (np.issubdtype(dim_dtype, np.unicode_) or np.issubdtype(dim_dtype, np.bytes_)):
if array.mode == 'r' and (
(has_numpy2() and np.issubdtype(dim_dtype, np.str_))
or
(not has_numpy2() and np.issubdtype(dim_dtype, np.unicode_))
or
np.issubdtype(dim_dtype, np.bytes_)
):
# NED can only be retrieved in read mode
ned = array.nonempty_domain()
(dim_lb, dim_ub) = ned[r] if ned else (None, None)
Expand All @@ -612,7 +619,11 @@ def index_domain_subarray(array: Array, dom, idx: tuple):
if not isinstance(dim_slice, slice):
raise IndexError("invalid index type: {!r}".format(type(dim_slice)))

# numpy2 doesn't allow addition between int and np.int64
start, stop, step = dim_slice.start, dim_slice.stop, dim_slice.step
start = np.int64(start) if isinstance(start, int) else start
stop = np.int64(stop) if isinstance(stop, int) else stop
step = np.int64(step) if isinstance(step, int) else step

if np.issubdtype(dim_dtype, np.str_) or np.issubdtype(dim_dtype, np.bytes_):
if start is None or stop is None:
Expand Down Expand Up @@ -1489,7 +1500,8 @@ cdef class Array(object):

cdef _ndarray_is_varlen(self, np.ndarray array):
return (np.issubdtype(array.dtype, np.bytes_) or
np.issubdtype(array.dtype, np.unicode_) or
(has_numpy2() and np.issubdtype(array.dtype, np.str_)) or
(not has_numpy2() and np.issubdtype(array.dtype, np.unicode_)) or
array.dtype == object)

@property
Expand Down Expand Up @@ -2503,8 +2515,8 @@ cdef class DenseArrayImpl(Array):
dtype=np.uint8
)
else:
if (np.issubdtype(attr.dtype, np.string_) and not
(np.issubdtype(attr_val.dtype, np.string_) or attr_val.dtype == np.dtype('O'))):
if (np.issubdtype(attr.dtype, np.bytes_ if has_numpy2() else np.string_) and not
(np.issubdtype(attr_val.dtype, np.bytes_ if has_numpy2() else np.string_) or attr_val.dtype == np.dtype('O'))):
raise ValueError("Cannot write a string value to non-string "
"typed attribute '{}'!".format(name))

Expand All @@ -2518,7 +2530,7 @@ cdef class DenseArrayImpl(Array):
dtype=np.uint8
)

if np.issubdtype(attr.dtype, np.string_):
if np.issubdtype(attr.dtype, np.bytes_ if has_numpy2() else np.string_):
attr_val = np.array(
["" if v is None else v for v in attr_val])
else:
Expand Down Expand Up @@ -2552,8 +2564,8 @@ cdef class DenseArrayImpl(Array):
if attr.isnullable and name not in nullmaps:
nullmaps[name] = np.array([int(v is None) for v in val], dtype=np.uint8)
else:
if (np.issubdtype(attr.dtype, np.string_) and not
(np.issubdtype(val.dtype, np.string_) or val.dtype == np.dtype('O'))):
if (np.issubdtype(attr.dtype, np.bytes_ if has_numpy2() else np.string_) and not
(np.issubdtype(val.dtype, np.bytes_ if has_numpy2() else np.string_) or val.dtype == np.dtype('O'))):
raise ValueError("Cannot write a string value to non-string "
"typed attribute '{}'!".format(name))

Expand Down Expand Up @@ -3040,8 +3052,8 @@ def _setitem_impl_sparse(self: Array, selection, val, dict nullmaps):
nullmaps[name] = np.array(
[int(v is not None) for v in attr_val], dtype=np.uint8)
else:
if (np.issubdtype(attr.dtype, np.string_)
and not (np.issubdtype(attr_val.dtype, np.string_)
if (np.issubdtype(attr.dtype, np.bytes_ if has_numpy2() else np.string_)
and not (np.issubdtype(attr_val.dtype, np.bytes_ if has_numpy2() else np.string_)
or attr_val.dtype == np.dtype('O'))):
raise ValueError("Cannot write a string value to non-string "
"typed attribute '{}'!".format(name))
Expand All @@ -3053,7 +3065,7 @@ def _setitem_impl_sparse(self: Array, selection, val, dict nullmaps):
nullmaps[name] = np.array(
[int(v is not None) for v in attr_val], dtype=np.uint8)

if np.issubdtype(attr.dtype, np.string_):
if np.issubdtype(attr.dtype, np.bytes_ if has_numpy2() else np.string_):
attr_val = np.array(["" if v is None else v for v in attr_val])
else:
attr_val = np.nan_to_num(attr_val)
Expand Down
8 changes: 7 additions & 1 deletion tiledb/multirange_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,13 @@ def __init__(
# Until list attributes are supported in core, error with a clear message.
if use_arrow and any(
(attr.isvar or len(attr.dtype) > 1)
and attr.dtype not in (np.unicode_, np.bytes_)
and attr.dtype
not in (
np.str_
if np.lib.NumpyVersion(np.__version__) >= "2.0.0b1"
else np.unicode_,
np.bytes_,
)
for attr in map(array.attr, query.attrs or ())
):
raise TileDBError(
Expand Down
4 changes: 4 additions & 0 deletions tiledb/tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@
)


def has_numpy2():
    """Report whether the installed NumPy is version 2.0.0b1 or newer."""
    installed = np.lib.NumpyVersion(np.__version__)
    return installed >= "2.0.0b1"


def has_pandas():
    """Report whether the pandas package can be located for import."""
    pandas_spec = importlib.util.find_spec("pandas")
    return pandas_spec is not None

Expand Down
4 changes: 2 additions & 2 deletions tiledb/tests/test_attribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import tiledb

from .common import DiskTestCase, assert_captured, has_pandas
from .common import DiskTestCase, assert_captured, has_numpy2, has_pandas


class AttributeTest(DiskTestCase):
Expand All @@ -16,7 +16,7 @@ def test_minimal_attribute(self):
self.assertEqual(attr, attr)
self.assertTrue(attr.isanon)
self.assertEqual(attr.name, "")
self.assertEqual(attr.dtype, np.float_)
self.assertEqual(attr.dtype, np.float64 if has_numpy2() else np.float_)
self.assertFalse(attr.isvar)
self.assertFalse(attr.isnullable)

Expand Down
25 changes: 14 additions & 11 deletions tiledb/tests/test_libtiledb.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
assert_subarrays_equal,
assert_unordered_equal,
fx_sparse_cell_order, # noqa: F401
has_numpy2,
has_pandas,
has_pyarrow,
rand_ascii,
Expand Down Expand Up @@ -1230,10 +1231,12 @@ def test_reopen_dense_array(self, use_timestamps):

def test_data_begins_with_null_chars(self):
path = self.path("test_data_begins_with_null_chars")
data = np.array(["", "", "", "a", "", "", "", "", "", "b"], dtype=np.unicode_)

data = np.array(
["", "", "", "a", "", "", "", "", "", "b"],
dtype=np.str_ if has_numpy2() else np.unicode_,
)
dom = tiledb.Domain(tiledb.Dim(domain=(1, len(data)), tile=len(data)))
att = tiledb.Attr(dtype=np.unicode_, var=True)
att = tiledb.Attr(dtype=np.str_ if has_numpy2() else np.unicode_)
schema = tiledb.ArraySchema(dom, (att,))
tiledb.Array.create(path, schema)

Expand Down Expand Up @@ -1325,12 +1328,12 @@ def test_varlen_write_unicode(self):
"",
"hhhhhhhhhh",
],
dtype=np.unicode_,
dtype=np.str_ if has_numpy2() else np.unicode_,
)

# basic write
dom = tiledb.Domain(tiledb.Dim(domain=(1, len(A)), tile=len(A)))
att = tiledb.Attr(dtype=np.unicode_, var=True)
att = tiledb.Attr(dtype=np.str_ if has_numpy2() else np.unicode_, var=True)

schema = tiledb.ArraySchema(dom, (att,))

Expand Down Expand Up @@ -1487,7 +1490,7 @@ def test_varlen_write_fixedunicode(self):

# basic write
dom = tiledb.Domain(tiledb.Dim(domain=(1, len(A)), tile=len(A)))
att = tiledb.Attr(dtype=np.unicode_)
att = tiledb.Attr(dtype=np.str_ if has_numpy2() else np.unicode_)

schema = tiledb.ArraySchema(dom, (att,))

Expand Down Expand Up @@ -1991,7 +1994,7 @@ def test_sparse_bytes(self, fx_sparse_cell_order):

def test_sparse_unicode(self, fx_sparse_cell_order):
dom = tiledb.Domain(tiledb.Dim("x", domain=(1, 10000), tile=100, dtype=int))
att = tiledb.Attr("", var=True, dtype=np.unicode_)
att = tiledb.Attr("", var=True, dtype=np.str_ if has_numpy2() else np.unicode_)
schema = tiledb.ArraySchema(
domain=dom, attrs=(att,), sparse=True, cell_order=fx_sparse_cell_order
)
Expand Down Expand Up @@ -3514,11 +3517,11 @@ def test_incomplete_dense_varlen(self, non_overlapping_ranges):
ncells = 10
path = self.path("incomplete_dense_varlen")
str_data = [rand_utf8(random.randint(0, n)) for n in range(ncells)]
data = np.array(str_data, dtype=np.unicode_)
data = np.array(str_data, dtype=np.str_)

# basic write
dom = tiledb.Domain(tiledb.Dim(domain=(1, len(data)), tile=len(data)))
att = tiledb.Attr(dtype=np.unicode_, var=True)
att = tiledb.Attr(dtype=np.str_, var=True)

schema = tiledb.ArraySchema(dom, (att,))

Expand Down Expand Up @@ -3556,12 +3559,12 @@ def test_incomplete_sparse_varlen(self, allows_duplicates, non_overlapping_range

path = self.path("incomplete_sparse_varlen")
str_data = [rand_utf8(random.randint(0, n)) for n in range(ncells)]
data = np.array(str_data, dtype=np.unicode_)
data = np.array(str_data, dtype=np.str_ if has_numpy2() else np.unicode_)
coords = np.arange(ncells)

# basic write
dom = tiledb.Domain(tiledb.Dim(domain=(0, len(data) + 100), tile=len(data)))
att = tiledb.Attr(dtype=np.unicode_, var=True)
att = tiledb.Attr(dtype=np.str_ if has_numpy2() else np.unicode_, var=True)

schema = tiledb.ArraySchema(
dom, (att,), sparse=True, allows_duplicates=allows_duplicates
Expand Down
7 changes: 5 additions & 2 deletions tiledb/tests/test_pandas_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from .common import (
DiskTestCase,
# has_numpy2,
dtype_max,
dtype_min,
rand_ascii,
Expand Down Expand Up @@ -1324,8 +1325,10 @@ def test_incomplete_df(self, allows_duplicates, non_overlapping_ranges):
data[validity_idx] = None

# TODO - not supported
# str_data = np.array([rand_utf8(random.randint(0, n)) for n in range(ncells)],
# dtype=np.unicode_)
# str_data = np.array(
# [rand_utf8(random.randint(0, n)) for n in range(ncells)],
# dtype=np.str_ if has_numpy2() else np.unicode_,
# )
# str_data[validity_idx] = None

df = pd.DataFrame({"int64": pd.Series(data, dtype=pd.Int64Dtype())})
Expand Down

0 comments on commit 75a9969

Please sign in to comment.