Add support for numpy2

TileDB-Inc · May 16, 2024 · 75a9969 · 75a9969
1 parent 19bc046
commit 75a9969
Show file tree

Hide file tree

Showing 11 changed files with 76 additions and 46 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -4,6 +4,6 @@ repos:
     hooks:
     - id: black
   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: v0.0.284
+    rev: v0.4.4
     hooks:
     - id: ruff
diff --git a/pyproject.toml b/pyproject.toml
@@ -72,5 +72,8 @@ extend-select = ["I001"]
 extend-exclude = ["doc"]
 fix = true
 
+[tool.ruff.lint]
+# select = ["NPY201"]  # Left disabled: with `fix = true`, enabling it makes ruff rewrite every deprecated NumPy 1.x name to its NumPy 2 replacement
+
 [tool.ruff.per-file-ignores]
 "tiledb/__init__.py" = ["F401"]
diff --git a/setup.py b/setup.py
@@ -6,7 +6,7 @@
 import sys
 from ctypes import CDLL, POINTER, Structure, byref, c_char_p, c_int, c_void_p
 
-from pkg_resources import resource_filename
+import numpy as np
 from pybind11.setup_helpers import Pybind11Extension
 from setuptools import Extension, find_packages, setup
 
@@ -478,16 +478,8 @@ class build_ext(cython_build_ext):
             """
 
             def build_extensions(self):
-                """
-                Lazily append numpy's include directory to Extension includes.
-
-                This is done here rather than at module scope because setup.py
-                may be run before numpy has been installed, in which case
-                importing numpy and calling `numpy.get_include()` will fail.
-                """
-                numpy_incl = resource_filename("numpy", "core/include")
                 for ext in self.extensions:
-                    ext.include_dirs.append(numpy_incl)
+                    ext.include_dirs.append(np.get_include())
 
                 find_or_install_libtiledb(self)
 

diff --git a/tiledb/common.pxi b/tiledb/common.pxi
@@ -25,6 +25,7 @@ from libc.stdio cimport FILE, stdout
 from libc.stdlib cimport calloc, free, malloc
 from libc.string cimport memcpy
 from libcpp.vector cimport vector
+from libcpp cimport bool as bool_t
 
 
 cdef extern from "Python.h":

diff --git a/tiledb/highlevel.py b/tiledb/highlevel.py
@@ -224,7 +224,13 @@ def is_ndarray_like(arr):
     elif shape and dtype:
         if np.issubdtype(np.bytes_, dtype):
             dtype = np.dtype("S")
-        elif np.issubdtype(dtype, np.unicode_):
+        elif (
+            np.lib.NumpyVersion(np.__version__) >= "2.0.0b1"
+            and np.issubdtype(dtype, np.str_)
+        ) or (
+            np.lib.NumpyVersion(np.__version__) < "2.0.0b1"
+            and np.issubdtype(dtype, np.unicode_)
+        ):
             dtype = np.dtype("U")
 
         ndim = len(shape)

diff --git a/tiledb/libtiledb.pyx b/tiledb/libtiledb.pyx
@@ -12,8 +12,7 @@ import io
 import warnings
 import collections.abc
 from collections import OrderedDict
-from json import dumps as json_dumps
-from json import loads as json_loads
+from json import dumps as json_dumps, loads as json_loads
 
 from ._generated_version import version_tuple as tiledbpy_version
 from .array_schema import ArraySchema
@@ -28,6 +27,7 @@ from .vfs import VFS
 
 # https://docs.scipy.org/doc/numpy/reference/c-api.array.html#c.import_array
 np.import_array()
+np.set_printoptions(legacy='1.21')  # use unified numpy printing
 
 ###############################################################################
 #    Utility/setup                                                            #
@@ -36,10 +36,9 @@ np.import_array()
 # Integer types supported by Python / System
 _inttypes = (int, np.integer)
 
-# Numpy initialization code (critical)
-# https://docs.scipy.org/doc/numpy/reference/c-api.array.html#c.import_array
-np.import_array()
 
+cdef bool_t has_numpy2():
+    return np.lib.NumpyVersion(np.__version__) >= '2.0.0b1'
 
 cdef tiledb_ctx_t* safe_ctx_ptr(object ctx):
     if ctx is None:
@@ -145,9 +144,11 @@ cdef _write_array(
         if attr.isvar:
             try:
                 if attr.isnullable:
-                    if(np.issubdtype(attr.dtype, np.unicode_) 
-                        or np.issubdtype(attr.dtype, np.string_) 
-                        or np.issubdtype(attr.dtype, np.bytes_)):
+                    if (
+                        np.issubdtype(attr.dtype, np.bytes_) or
+                        has_numpy2() and np.issubdtype(attr.dtype, np.str_) or
+                        not has_numpy2() and (np.issubdtype(attr.dtype, np.unicode_) or np.issubdtype(attr.dtype, np.string_))
+                    ):
                         attr_val = np.array(["" if v is None else v for v in values[i]])
                     else:
                         attr_val = np.nan_to_num(values[i])
@@ -601,7 +602,13 @@ def index_domain_subarray(array: Array, dom, idx: tuple):
         dim = dom.dim(r)
         dim_dtype = dim.dtype
 
-        if array.mode == 'r' and (np.issubdtype(dim_dtype, np.unicode_) or np.issubdtype(dim_dtype, np.bytes_)):
+        if array.mode == 'r' and (       
+            (has_numpy2() and np.issubdtype(dim_dtype, np.str_))
+            or
+            (not has_numpy2() and np.issubdtype(dim_dtype, np.unicode_))
+            or
+            np.issubdtype(dim_dtype, np.bytes_)
+       ):
             # NED can only be retrieved in read mode
             ned = array.nonempty_domain()
             (dim_lb, dim_ub) = ned[r] if ned else (None, None)
@@ -612,7 +619,11 @@ def index_domain_subarray(array: Array, dom, idx: tuple):
         if not isinstance(dim_slice, slice):
             raise IndexError("invalid index type: {!r}".format(type(dim_slice)))
 
+        # numpy2 doesn't allow addition between int and np.int64
         start, stop, step = dim_slice.start, dim_slice.stop, dim_slice.step
+        start = np.int64(start) if isinstance(start, int) else start
+        stop = np.int64(stop) if isinstance(stop, int) else stop
+        step = np.int64(step) if isinstance(step, int) else step
 
         if np.issubdtype(dim_dtype, np.str_) or np.issubdtype(dim_dtype, np.bytes_):
             if start is None or stop is None:
@@ -1489,7 +1500,8 @@ cdef class Array(object):
 
     cdef _ndarray_is_varlen(self, np.ndarray array):
         return  (np.issubdtype(array.dtype, np.bytes_) or
-                 np.issubdtype(array.dtype, np.unicode_) or
+                 (has_numpy2() and np.issubdtype(array.dtype, np.str_)) or
+                 (not has_numpy2() and np.issubdtype(array.dtype, np.unicode_)) or
                  array.dtype == object)
 
     @property
@@ -2503,8 +2515,8 @@ cdef class DenseArrayImpl(Array):
                                 dtype=np.uint8
                             )
                     else:
-                        if (np.issubdtype(attr.dtype, np.string_) and not
-                            (np.issubdtype(attr_val.dtype, np.string_) or attr_val.dtype == np.dtype('O'))):
+                        if (np.issubdtype(attr.dtype, np.bytes_ if has_numpy2() else np.string_) and not
+                            (np.issubdtype(attr_val.dtype, np.bytes_ if has_numpy2() else np.string_) or attr_val.dtype == np.dtype('O'))):
                             raise ValueError("Cannot write a string value to non-string "
                                             "typed attribute '{}'!".format(name))
 
@@ -2518,7 +2530,7 @@ cdef class DenseArrayImpl(Array):
                                     dtype=np.uint8
                                 )
 
-                            if np.issubdtype(attr.dtype, np.string_):
+                            if np.issubdtype(attr.dtype, np.bytes_ if has_numpy2() else np.string_):
                                 attr_val = np.array(
                                     ["" if v is None else v for v in attr_val])
                             else:
@@ -2552,8 +2564,8 @@ cdef class DenseArrayImpl(Array):
                     if attr.isnullable and name not in nullmaps:
                         nullmaps[name] = np.array([int(v is None) for v in val], dtype=np.uint8)
                 else:
-                    if (np.issubdtype(attr.dtype, np.string_) and not
-                        (np.issubdtype(val.dtype, np.string_) or val.dtype == np.dtype('O'))):
+                    if (np.issubdtype(attr.dtype, np.bytes_ if has_numpy2() else np.string_) and not
+                        (np.issubdtype(val.dtype, np.bytes_ if has_numpy2() else np.string_) or val.dtype == np.dtype('O'))):
                         raise ValueError("Cannot write a string value to non-string "
                                         "typed attribute '{}'!".format(name))
 
@@ -3040,8 +3052,8 @@ def _setitem_impl_sparse(self: Array, selection, val, dict nullmaps):
                     nullmaps[name] = np.array(
                         [int(v is not None) for v in attr_val], dtype=np.uint8)
             else:
-                if (np.issubdtype(attr.dtype, np.string_) 
-                    and not (np.issubdtype(attr_val.dtype, np.string_) 
+                if (np.issubdtype(attr.dtype, np.bytes_ if has_numpy2() else np.string_) 
+                    and not (np.issubdtype(attr_val.dtype, np.bytes_ if has_numpy2() else np.string_) 
                     or attr_val.dtype == np.dtype('O'))):
                     raise ValueError("Cannot write a string value to non-string "
                                         "typed attribute '{}'!".format(name))
@@ -3053,7 +3065,7 @@ def _setitem_impl_sparse(self: Array, selection, val, dict nullmaps):
                         nullmaps[name] = np.array(
                             [int(v is not None) for v in attr_val], dtype=np.uint8)
 
-                    if np.issubdtype(attr.dtype, np.string_):
+                    if np.issubdtype(attr.dtype, np.bytes_ if has_numpy2() else np.string_):
                         attr_val = np.array(["" if v is None else v for v in attr_val])
                     else:
                         attr_val = np.nan_to_num(attr_val)

diff --git a/tiledb/multirange_indexing.py b/tiledb/multirange_indexing.py
@@ -422,7 +422,13 @@ def __init__(
         # Until list attributes are supported in core, error with a clear message.
         if use_arrow and any(
             (attr.isvar or len(attr.dtype) > 1)
-            and attr.dtype not in (np.unicode_, np.bytes_)
+            and attr.dtype
+            not in (
+                np.str_
+                if np.lib.NumpyVersion(np.__version__) >= "2.0.0b1"
+                else np.unicode_,
+                np.bytes_,
+            )
             for attr in map(array.attr, query.attrs or ())
         ):
             raise TileDBError(

diff --git a/tiledb/tests/common.py b/tiledb/tests/common.py
@@ -23,6 +23,10 @@
 )
 
 
+def has_numpy2():
+    return np.lib.NumpyVersion(np.__version__) >= "2.0.0b1"
+
+
 def has_pandas():
     return importlib.util.find_spec("pandas") is not None
 

diff --git a/tiledb/tests/test_attribute.py b/tiledb/tests/test_attribute.py
@@ -7,7 +7,7 @@
 
 import tiledb
 
-from .common import DiskTestCase, assert_captured, has_pandas
+from .common import DiskTestCase, assert_captured, has_numpy2, has_pandas
 
 
 class AttributeTest(DiskTestCase):
@@ -16,7 +16,7 @@ def test_minimal_attribute(self):
         self.assertEqual(attr, attr)
         self.assertTrue(attr.isanon)
         self.assertEqual(attr.name, "")
-        self.assertEqual(attr.dtype, np.float_)
+        self.assertEqual(attr.dtype, np.float64 if has_numpy2() else np.float_)
         self.assertFalse(attr.isvar)
         self.assertFalse(attr.isnullable)
 

diff --git a/tiledb/tests/test_libtiledb.py b/tiledb/tests/test_libtiledb.py
@@ -26,6 +26,7 @@
     assert_subarrays_equal,
     assert_unordered_equal,
     fx_sparse_cell_order,  # noqa: F401
+    has_numpy2,
     has_pandas,
     has_pyarrow,
     rand_ascii,
@@ -1230,10 +1231,12 @@ def test_reopen_dense_array(self, use_timestamps):
 
     def test_data_begins_with_null_chars(self):
         path = self.path("test_data_begins_with_null_chars")
-        data = np.array(["", "", "", "a", "", "", "", "", "", "b"], dtype=np.unicode_)
-
+        data = np.array(
+            ["", "", "", "a", "", "", "", "", "", "b"],
+            dtype=np.str_ if has_numpy2() else np.unicode_,
+        )
         dom = tiledb.Domain(tiledb.Dim(domain=(1, len(data)), tile=len(data)))
-        att = tiledb.Attr(dtype=np.unicode_, var=True)
+        att = tiledb.Attr(dtype=np.str_ if has_numpy2() else np.unicode_)
         schema = tiledb.ArraySchema(dom, (att,))
         tiledb.Array.create(path, schema)
 
@@ -1325,12 +1328,12 @@ def test_varlen_write_unicode(self):
                 "",
                 "hhhhhhhhhh",
             ],
-            dtype=np.unicode_,
+            dtype=np.str_ if has_numpy2() else np.unicode_,
         )
 
         # basic write
         dom = tiledb.Domain(tiledb.Dim(domain=(1, len(A)), tile=len(A)))
-        att = tiledb.Attr(dtype=np.unicode_, var=True)
+        att = tiledb.Attr(dtype=np.str_ if has_numpy2() else np.unicode_, var=True)
 
         schema = tiledb.ArraySchema(dom, (att,))
 
@@ -1487,7 +1490,7 @@ def test_varlen_write_fixedunicode(self):
 
         # basic write
         dom = tiledb.Domain(tiledb.Dim(domain=(1, len(A)), tile=len(A)))
-        att = tiledb.Attr(dtype=np.unicode_)
+        att = tiledb.Attr(dtype=np.str_ if has_numpy2() else np.unicode_)
 
         schema = tiledb.ArraySchema(dom, (att,))
 
@@ -1991,7 +1994,7 @@ def test_sparse_bytes(self, fx_sparse_cell_order):
 
     def test_sparse_unicode(self, fx_sparse_cell_order):
         dom = tiledb.Domain(tiledb.Dim("x", domain=(1, 10000), tile=100, dtype=int))
-        att = tiledb.Attr("", var=True, dtype=np.unicode_)
+        att = tiledb.Attr("", var=True, dtype=np.str_ if has_numpy2() else np.unicode_)
         schema = tiledb.ArraySchema(
             domain=dom, attrs=(att,), sparse=True, cell_order=fx_sparse_cell_order
         )
@@ -3514,11 +3517,11 @@ def test_incomplete_dense_varlen(self, non_overlapping_ranges):
         ncells = 10
         path = self.path("incomplete_dense_varlen")
         str_data = [rand_utf8(random.randint(0, n)) for n in range(ncells)]
-        data = np.array(str_data, dtype=np.unicode_)
+        data = np.array(str_data, dtype=np.str_)
 
         # basic write
         dom = tiledb.Domain(tiledb.Dim(domain=(1, len(data)), tile=len(data)))
-        att = tiledb.Attr(dtype=np.unicode_, var=True)
+        att = tiledb.Attr(dtype=np.str_, var=True)
 
         schema = tiledb.ArraySchema(dom, (att,))
 
@@ -3556,12 +3559,12 @@ def test_incomplete_sparse_varlen(self, allows_duplicates, non_overlapping_range
 
         path = self.path("incomplete_sparse_varlen")
         str_data = [rand_utf8(random.randint(0, n)) for n in range(ncells)]
-        data = np.array(str_data, dtype=np.unicode_)
+        data = np.array(str_data, dtype=np.str_ if has_numpy2() else np.unicode_)
         coords = np.arange(ncells)
 
         # basic write
         dom = tiledb.Domain(tiledb.Dim(domain=(0, len(data) + 100), tile=len(data)))
-        att = tiledb.Attr(dtype=np.unicode_, var=True)
+        att = tiledb.Attr(dtype=np.str_ if has_numpy2() else np.unicode_, var=True)
 
         schema = tiledb.ArraySchema(
             dom, (att,), sparse=True, allows_duplicates=allows_duplicates

diff --git a/tiledb/tests/test_pandas_dataframe.py b/tiledb/tests/test_pandas_dataframe.py
@@ -16,6 +16,7 @@
 
 from .common import (
     DiskTestCase,
+    # has_numpy2,
     dtype_max,
     dtype_min,
     rand_ascii,
@@ -1324,8 +1325,10 @@ def test_incomplete_df(self, allows_duplicates, non_overlapping_ranges):
         data[validity_idx] = None
 
         # TODO - not supported
-        # str_data = np.array([rand_utf8(random.randint(0, n)) for n in range(ncells)],
-        #                dtype=np.unicode_)
+        # str_data = np.array(
+        #     [rand_utf8(random.randint(0, n)) for n in range(ncells)],
+        #     dtype=np.str_ if has_numpy2() else np.unicode_,
+        # )
         # str_data[validity_idx] = None
 
         df = pd.DataFrame({"int64": pd.Series(data, dtype=pd.Int64Dtype())})