diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5239d37dc3..f3fd8e17ca 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -4,6 +4,6 @@ repos:
     hooks:
       - id: black
   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: v0.0.284
+    rev: v0.4.4
    hooks:
      - id: ruff
diff --git a/pyproject.toml b/pyproject.toml
index e3ec53a8f2..5b95ea188f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -72,5 +72,8 @@ extend-select = ["I001"]
 extend-exclude = ["doc"]
 fix = true
 
+[tool.ruff.lint]
+# select = ["NPY201"]  Enabling this would auto-fix deprecated NumPy 1.x aliases to their NumPy 2 equivalents
+
 [tool.ruff.per-file-ignores]
 "tiledb/__init__.py" = ["F401"]
diff --git a/setup.py b/setup.py
index 99a119e4c5..2b0f6c5858 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
 import sys
 from ctypes import CDLL, POINTER, Structure, byref, c_char_p, c_int, c_void_p
 
-from pkg_resources import resource_filename
+import numpy as np
 from pybind11.setup_helpers import Pybind11Extension
 from setuptools import Extension, find_packages, setup
 
@@ -478,16 +478,8 @@ class build_ext(cython_build_ext):
     """
 
     def build_extensions(self):
-        """
-        Lazily append numpy's include directory to Extension includes.
-
-        This is done here rather than at module scope because setup.py
-        may be run before numpy has been installed, in which case
-        importing numpy and calling `numpy.get_include()` will fail.
-        """
-        numpy_incl = resource_filename("numpy", "core/include")
         for ext in self.extensions:
-            ext.include_dirs.append(numpy_incl)
+            ext.include_dirs.append(np.get_include())
 
         find_or_install_libtiledb(self)
 
diff --git a/tiledb/common.pxi b/tiledb/common.pxi
index 85137e4325..715a29c906 100644
--- a/tiledb/common.pxi
+++ b/tiledb/common.pxi
@@ -25,6 +25,7 @@ from libc.stdio cimport FILE, stdout
 from libc.stdlib cimport calloc, free, malloc
 from libc.string cimport memcpy
 from libcpp.vector cimport vector
+from libcpp cimport bool as bool_t
 
 
 cdef extern from "Python.h":
diff --git a/tiledb/highlevel.py b/tiledb/highlevel.py
index 6bc4ab1cea..d0e743d177 100644
--- a/tiledb/highlevel.py
+++ b/tiledb/highlevel.py
@@ -224,7 +224,13 @@ def is_ndarray_like(arr):
     elif shape and dtype:
         if np.issubdtype(np.bytes_, dtype):
             dtype = np.dtype("S")
-        elif np.issubdtype(dtype, np.unicode_):
+        elif (
+            np.lib.NumpyVersion(np.__version__) >= "2.0.0b1"
+            and np.issubdtype(dtype, np.str_)
+        ) or (
+            np.lib.NumpyVersion(np.__version__) < "2.0.0b1"
+            and np.issubdtype(dtype, np.unicode_)
+        ):
             dtype = np.dtype("U")
 
         ndim = len(shape)
diff --git a/tiledb/libtiledb.pyx b/tiledb/libtiledb.pyx
index 585dc6cf5b..45c10d07d5 100644
--- a/tiledb/libtiledb.pyx
+++ b/tiledb/libtiledb.pyx
@@ -12,8 +12,7 @@ import io
 import warnings
 import collections.abc
 from collections import OrderedDict
-from json import dumps as json_dumps
-from json import loads as json_loads
+from json import dumps as json_dumps, loads as json_loads
 
 from ._generated_version import version_tuple as tiledbpy_version
 from .array_schema import ArraySchema
@@ -28,6 +27,7 @@ from .vfs import VFS
 # https://docs.scipy.org/doc/numpy/reference/c-api.array.html#c.import_array
 np.import_array()
+np.set_printoptions(legacy='1.21')  # use numpy 1.21-style printing so output is uniform across numpy versions
 
 
 ###############################################################################
 #     Utility/setup                                                           #
@@ -36,10 +36,9 @@ np.import_array()
 # Integer types supported by Python / System
 _inttypes = (int, np.integer)
 
-# Numpy initialization code (critical)
-# https://docs.scipy.org/doc/numpy/reference/c-api.array.html#c.import_array
-np.import_array()
+cdef bool_t has_numpy2():
+    return np.lib.NumpyVersion(np.__version__) >= '2.0.0b1'
 
 
 cdef tiledb_ctx_t* safe_ctx_ptr(object ctx):
     if ctx is None:
@@ -145,9 +144,11 @@ cdef _write_array(
         if attr.isvar:
             try:
                 if attr.isnullable:
-                    if(np.issubdtype(attr.dtype, np.unicode_)
-                       or np.issubdtype(attr.dtype, np.string_)
-                       or np.issubdtype(attr.dtype, np.bytes_)):
+                    if (
+                        np.issubdtype(attr.dtype, np.bytes_) or
+                        has_numpy2() and np.issubdtype(attr.dtype, np.str_) or
+                        not has_numpy2() and (np.issubdtype(attr.dtype, np.unicode_) or np.issubdtype(attr.dtype, np.string_))
+                    ):
                         attr_val = np.array(["" if v is None else v for v in values[i]])
                     else:
                         attr_val = np.nan_to_num(values[i])
@@ -601,7 +602,13 @@ def index_domain_subarray(array: Array, dom, idx: tuple):
         dim = dom.dim(r)
         dim_dtype = dim.dtype
 
-        if array.mode == 'r' and (np.issubdtype(dim_dtype, np.unicode_) or np.issubdtype(dim_dtype, np.bytes_)):
+        if array.mode == 'r' and (
+            (has_numpy2() and np.issubdtype(dim_dtype, np.str_))
+            or
+            (not has_numpy2() and np.issubdtype(dim_dtype, np.unicode_))
+            or
+            np.issubdtype(dim_dtype, np.bytes_)
+        ):
             # NED can only be retrieved in read mode
             ned = array.nonempty_domain()
             (dim_lb, dim_ub) = ned[r] if ned else (None, None)
@@ -612,7 +619,11 @@ def index_domain_subarray(array: Array, dom, idx: tuple):
         if not isinstance(dim_slice, slice):
             raise IndexError("invalid index type: {!r}".format(type(dim_slice)))
 
+        # numpy2 doesn't allow addition between int and np.int64
         start, stop, step = dim_slice.start, dim_slice.stop, dim_slice.step
+        start = np.int64(start) if isinstance(start, int) else start
+        stop = np.int64(stop) if isinstance(stop, int) else stop
+        step = np.int64(step) if isinstance(step, int) else step
 
         if np.issubdtype(dim_dtype, np.str_) or np.issubdtype(dim_dtype, np.bytes_):
             if start is None or stop is None:
@@ -1489,7 +1500,8 @@ cdef class Array(object):
 
     cdef _ndarray_is_varlen(self, np.ndarray array):
         return (np.issubdtype(array.dtype, np.bytes_) or
-                np.issubdtype(array.dtype, np.unicode_) or
+                (has_numpy2() and np.issubdtype(array.dtype, np.str_)) or
+                (not has_numpy2() and np.issubdtype(array.dtype, np.unicode_)) or
                 array.dtype == object)
 
     @property
@@ -2503,8 +2515,8 @@ cdef class DenseArrayImpl(Array):
                             dtype=np.uint8
                         )
                     else:
-                        if (np.issubdtype(attr.dtype, np.string_) and not
-                            (np.issubdtype(attr_val.dtype, np.string_) or attr_val.dtype == np.dtype('O'))):
+                        if (np.issubdtype(attr.dtype, np.bytes_ if has_numpy2() else np.string_) and not
+                            (np.issubdtype(attr_val.dtype, np.bytes_ if has_numpy2() else np.string_) or attr_val.dtype == np.dtype('O'))):
                             raise ValueError("Cannot write a string value to non-string "
                                              "typed attribute '{}'!".format(name))
 
@@ -2518,7 +2530,7 @@ cdef class DenseArrayImpl(Array):
                                 dtype=np.uint8
                             )
 
-                        if np.issubdtype(attr.dtype, np.string_):
+                        if np.issubdtype(attr.dtype, np.bytes_ if has_numpy2() else np.string_):
                             attr_val = np.array(
                                 ["" if v is None else v for v in attr_val])
                         else:
@@ -2552,8 +2564,8 @@ cdef class DenseArrayImpl(Array):
             if attr.isnullable and name not in nullmaps:
                 nullmaps[name] = np.array([int(v is None) for v in val], dtype=np.uint8)
             else:
-                if (np.issubdtype(attr.dtype, np.string_) and not
-                    (np.issubdtype(val.dtype, np.string_) or val.dtype == np.dtype('O'))):
+                if (np.issubdtype(attr.dtype, np.bytes_ if has_numpy2() else np.string_) and not
+                    (np.issubdtype(val.dtype, np.bytes_ if has_numpy2() else np.string_) or val.dtype == np.dtype('O'))):
                     raise ValueError("Cannot write a string value to non-string "
                                      "typed attribute '{}'!".format(name))
 
@@ -3040,8 +3052,8 @@ def _setitem_impl_sparse(self: Array, selection, val, dict nullmaps):
                 nullmaps[name] = np.array(
                     [int(v is not None) for v in attr_val], dtype=np.uint8)
             else:
-                if (np.issubdtype(attr.dtype, np.string_)
-                    and not (np.issubdtype(attr_val.dtype, np.string_)
+                if (np.issubdtype(attr.dtype, np.bytes_ if has_numpy2() else np.string_)
+                    and not (np.issubdtype(attr_val.dtype, np.bytes_ if has_numpy2() else np.string_)
                              or attr_val.dtype == np.dtype('O'))):
                     raise ValueError("Cannot write a string value to non-string "
                                      "typed attribute '{}'!".format(name))
@@ -3053,7 +3065,7 @@ def _setitem_impl_sparse(self: Array, selection, val, dict nullmaps):
                 nullmaps[name] = np.array(
                     [int(v is not None) for v in attr_val], dtype=np.uint8)
 
-            if np.issubdtype(attr.dtype, np.string_):
+            if np.issubdtype(attr.dtype, np.bytes_ if has_numpy2() else np.string_):
                 attr_val = np.array(["" if v is None else v for v in attr_val])
             else:
                 attr_val = np.nan_to_num(attr_val)
diff --git a/tiledb/multirange_indexing.py b/tiledb/multirange_indexing.py
index 010836f18f..d6ee46d6fd 100644
--- a/tiledb/multirange_indexing.py
+++ b/tiledb/multirange_indexing.py
@@ -422,7 +422,13 @@ def __init__(
         # Until list attributes are supported in core, error with a clear message.
         if use_arrow and any(
             (attr.isvar or len(attr.dtype) > 1)
-            and attr.dtype not in (np.unicode_, np.bytes_)
+            and attr.dtype
+            not in (
+                np.str_
+                if np.lib.NumpyVersion(np.__version__) >= "2.0.0b1"
+                else np.unicode_,
+                np.bytes_,
+            )
             for attr in map(array.attr, query.attrs or ())
         ):
             raise TileDBError(
diff --git a/tiledb/tests/common.py b/tiledb/tests/common.py
index 305944331d..354a725f28 100644
--- a/tiledb/tests/common.py
+++ b/tiledb/tests/common.py
@@ -23,6 +23,10 @@
 )
 
 
+def has_numpy2():
+    return np.lib.NumpyVersion(np.__version__) >= "2.0.0b1"
+
+
 def has_pandas():
     return importlib.util.find_spec("pandas") is not None
 
diff --git a/tiledb/tests/test_attribute.py b/tiledb/tests/test_attribute.py
index 20e4083c35..abc00f54c1 100644
--- a/tiledb/tests/test_attribute.py
+++ b/tiledb/tests/test_attribute.py
@@ -7,7 +7,7 @@
 
 import tiledb
 
-from .common import DiskTestCase, assert_captured, has_pandas
+from .common import DiskTestCase, assert_captured, has_numpy2, has_pandas
 
 
 class AttributeTest(DiskTestCase):
@@ -16,7 +16,7 @@ def test_minimal_attribute(self):
         self.assertEqual(attr, attr)
         self.assertTrue(attr.isanon)
         self.assertEqual(attr.name, "")
-        self.assertEqual(attr.dtype, np.float_)
+        self.assertEqual(attr.dtype, np.float64 if has_numpy2() else np.float_)
         self.assertFalse(attr.isvar)
         self.assertFalse(attr.isnullable)
 
diff --git a/tiledb/tests/test_libtiledb.py b/tiledb/tests/test_libtiledb.py
index cd13ad9cde..4c00a887fd 100644
--- a/tiledb/tests/test_libtiledb.py
+++ b/tiledb/tests/test_libtiledb.py
@@ -26,6 +26,7 @@
     assert_subarrays_equal,
     assert_unordered_equal,
     fx_sparse_cell_order,  # noqa: F401
+    has_numpy2,
     has_pandas,
     has_pyarrow,
     rand_ascii,
@@ -1230,10 +1231,12 @@ def test_reopen_dense_array(self, use_timestamps):
     def test_data_begins_with_null_chars(self):
         path = self.path("test_data_begins_with_null_chars")
 
-        data = np.array(["", "", "", "a", "", "", "", "", "", "b"], dtype=np.unicode_)
-
+        data = np.array(
+            ["", "", "", "a", "", "", "", "", "", "b"],
+            dtype=np.str_ if has_numpy2() else np.unicode_,
+        )
         dom = tiledb.Domain(tiledb.Dim(domain=(1, len(data)), tile=len(data)))
-        att = tiledb.Attr(dtype=np.unicode_, var=True)
+        att = tiledb.Attr(dtype=np.str_ if has_numpy2() else np.unicode_)
         schema = tiledb.ArraySchema(dom, (att,))
 
         tiledb.Array.create(path, schema)
@@ -1325,12 +1328,12 @@ def test_varlen_write_unicode(self):
                 "",
                 "hhhhhhhhhh",
             ],
-            dtype=np.unicode_,
+            dtype=np.str_ if has_numpy2() else np.unicode_,
         )
 
         # basic write
         dom = tiledb.Domain(tiledb.Dim(domain=(1, len(A)), tile=len(A)))
-        att = tiledb.Attr(dtype=np.unicode_, var=True)
+        att = tiledb.Attr(dtype=np.str_ if has_numpy2() else np.unicode_, var=True)
 
         schema = tiledb.ArraySchema(dom, (att,))
 
@@ -1487,7 +1490,7 @@ def test_varlen_write_fixedunicode(self):
 
         # basic write
         dom = tiledb.Domain(tiledb.Dim(domain=(1, len(A)), tile=len(A)))
-        att = tiledb.Attr(dtype=np.unicode_)
+        att = tiledb.Attr(dtype=np.str_ if has_numpy2() else np.unicode_)
 
         schema = tiledb.ArraySchema(dom, (att,))
 
@@ -1991,7 +1994,7 @@ def test_sparse_bytes(self, fx_sparse_cell_order):
 
     def test_sparse_unicode(self, fx_sparse_cell_order):
         dom = tiledb.Domain(tiledb.Dim("x", domain=(1, 10000), tile=100, dtype=int))
-        att = tiledb.Attr("", var=True, dtype=np.unicode_)
+        att = tiledb.Attr("", var=True, dtype=np.str_ if has_numpy2() else np.unicode_)
         schema = tiledb.ArraySchema(
             domain=dom, attrs=(att,), sparse=True, cell_order=fx_sparse_cell_order
         )
@@ -3514,11 +3517,11 @@ def test_incomplete_dense_varlen(self, non_overlapping_ranges):
         ncells = 10
         path = self.path("incomplete_dense_varlen")
         str_data = [rand_utf8(random.randint(0, n)) for n in range(ncells)]
-        data = np.array(str_data, dtype=np.unicode_)
+        data = np.array(str_data, dtype=np.str_)
 
         # basic write
         dom = tiledb.Domain(tiledb.Dim(domain=(1, len(data)), tile=len(data)))
-        att = tiledb.Attr(dtype=np.unicode_, var=True)
+        att = tiledb.Attr(dtype=np.str_, var=True)
 
         schema = tiledb.ArraySchema(dom, (att,))
 
@@ -3556,12 +3559,12 @@ def test_incomplete_sparse_varlen(self, allows_duplicates, non_overlapping_range
 
         path = self.path("incomplete_sparse_varlen")
 
         str_data = [rand_utf8(random.randint(0, n)) for n in range(ncells)]
-        data = np.array(str_data, dtype=np.unicode_)
+        data = np.array(str_data, dtype=np.str_ if has_numpy2() else np.unicode_)
         coords = np.arange(ncells)
 
         # basic write
         dom = tiledb.Domain(tiledb.Dim(domain=(0, len(data) + 100), tile=len(data)))
-        att = tiledb.Attr(dtype=np.unicode_, var=True)
+        att = tiledb.Attr(dtype=np.str_ if has_numpy2() else np.unicode_, var=True)
         schema = tiledb.ArraySchema(
             dom, (att,), sparse=True, allows_duplicates=allows_duplicates
diff --git a/tiledb/tests/test_pandas_dataframe.py b/tiledb/tests/test_pandas_dataframe.py
index 51649e7bfc..4e913a6729 100644
--- a/tiledb/tests/test_pandas_dataframe.py
+++ b/tiledb/tests/test_pandas_dataframe.py
@@ -16,6 +16,7 @@
 
 from .common import (
     DiskTestCase,
+    # has_numpy2,
     dtype_max,
     dtype_min,
     rand_ascii,
@@ -1324,8 +1325,10 @@ def test_incomplete_df(self, allows_duplicates, non_overlapping_ranges):
         data[validity_idx] = None
 
         # TODO - not supported
-        # str_data = np.array([rand_utf8(random.randint(0, n)) for n in range(ncells)],
-        #                     dtype=np.unicode_)
+        # str_data = np.array(
+        #     [rand_utf8(random.randint(0, n)) for n in range(ncells)],
+        #     dtype=np.str_ if has_numpy2() else np.unicode_,
+        # )
        # str_data[validity_idx] = None
 
         df = pd.DataFrame({"int64": pd.Series(data, dtype=pd.Int64Dtype())})