Skip to content

Commit

Permalink
Merge branch 'branch-24.06' into java-sanitizer-test-fix
Browse files Browse the repository at this point in the history
  • Loading branch information
jlowe authored May 15, 2024
2 parents 9e438df + fa9d028 commit e2723fd
Show file tree
Hide file tree
Showing 9 changed files with 79 additions and 50 deletions.
21 changes: 14 additions & 7 deletions cpp/cmake/thirdparty/get_arrow.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,20 @@ include_guard(GLOBAL)
# pyarrow.
function(find_libarrow_in_python_wheel PYARROW_VERSION)
string(REPLACE "." ";" PYARROW_VER_COMPONENTS "${PYARROW_VERSION}")
list(GET PYARROW_VER_COMPONENTS 0 PYARROW_SO_VER)
# The soname for Arrow libraries is constructed using the major version plus "00". Note that,
# although it may seem like it due to Arrow almost exclusively releasing new major versions (i.e.
# `${MINOR_VERSION}${PATCH_VERSION}` is almost always equivalent to "00"),
# the soname is not generated by concatenating the major, minor, and patch versions into a single
# version number soname, just `${MAJOR_VERSION}00`
set(PYARROW_LIB "libarrow.so.${PYARROW_SO_VER}00")
list(GET PYARROW_VER_COMPONENTS 0 PYARROW_MAJOR_VER)
list(GET PYARROW_VER_COMPONENTS 1 PYARROW_MINOR_VER)

# Ensure that the major and minor versions are two digits long
string(LENGTH ${PYARROW_MAJOR_VER} PYARROW_MAJOR_LENGTH)
string(LENGTH ${PYARROW_MINOR_VER} PYARROW_MINOR_LENGTH)
if(${PYARROW_MAJOR_LENGTH} EQUAL 1)
set(PYARROW_MAJOR_VER "0${PYARROW_MAJOR_VER}")
endif()
if(${PYARROW_MINOR_LENGTH} EQUAL 1)
set(PYARROW_MINOR_VER "0${PYARROW_MINOR_VER}")
endif()

set(PYARROW_LIB "libarrow.so.${PYARROW_MAJOR_VER}${PYARROW_MINOR_VER}")

string(
APPEND
Expand Down
4 changes: 2 additions & 2 deletions dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -352,8 +352,8 @@ dependencies:
common:
- output_types: [conda, requirements, pyproject]
packages:
# Allow runtime version to float up to minor version
- pyarrow>=16.0.0,<17.0.0a0
# Allow runtime version to float up to patch version
- pyarrow>=16.0.0,<16.1.0a0
cuda_version:
specific:
- output_types: conda
Expand Down
6 changes: 2 additions & 4 deletions docs/cudf/source/cudf_pandas/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,8 @@ automatically **falling back to pandas** for other operations.
| Nothing changes, not even your `import` statements, when going from CPU to GPU. | Combines the full flexibility of Pandas with blazing fast performance of cuDF |
+---------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------+

Starting with the version 23.10.01 release, ``cudf.pandas`` is
available in Open Beta as part of the ``cudf`` package. See `RAPIDS
Quick Start <https://rapids.ai/#quick-start>`_ to get up and running
with ``cudf``.
``cudf.pandas`` is now Generally Available (GA) as part of the ``cudf`` package. See `RAPIDS
Quick Start <https://rapids.ai/#quick-start>`_ to get up-and-running with ``cudf``.

.. toctree::
:maxdepth: 1
Expand Down
22 changes: 20 additions & 2 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1411,6 +1411,13 @@ def column_empty_like(
return column_empty(row_count, dtype, masked)


def _has_any_nan(arbitrary):
return any(
((isinstance(x, float) or isinstance(x, np.floating)) and np.isnan(x))
for x in np.asarray(arbitrary)
)


def column_empty_like_same_mask(
column: ColumnBase, dtype: Dtype
) -> ColumnBase:
Expand Down Expand Up @@ -1948,9 +1955,20 @@ def as_column(
raise TypeError(
f"Cannot convert a {inferred_dtype} of object type"
)
elif nan_as_null is False and (
pd.isna(arbitrary).any()
elif inferred_dtype == "boolean":
if cudf.get_option("mode.pandas_compatible"):
if dtype != np.dtype("bool") or pd.isna(arbitrary).any():
raise MixedTypeError(
f"Cannot have mixed values with {inferred_dtype}"
)
elif nan_as_null is False and _has_any_nan(arbitrary):
raise MixedTypeError(
f"Cannot have mixed values with {inferred_dtype}"
)
elif (
nan_as_null is False
and inferred_dtype not in ("decimal", "empty")
and _has_any_nan(arbitrary)
):
# Decimal can hold float("nan")
# All np.nan is not restricted by type
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,12 +405,12 @@ def _setitem_tuple_arg(self, key, value):
value = as_column(value, length=length)

new_col = cudf.Series(value, index=idx)
if not self._frame.empty:
if len(self._frame.index) != 0:
new_col = new_col._align_to_index(
self._frame.index, how="right"
)

if self._frame.empty:
if len(self._frame.index) == 0:
self._frame.index = (
idx if idx is not None else cudf.RangeIndex(len(new_col))
)
Expand Down
38 changes: 11 additions & 27 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4008,44 +4008,28 @@ def test_diff(dtype, period, data_empty):

@pytest.mark.parametrize("df", _dataframe_na_data())
@pytest.mark.parametrize("nan_as_null", [True, False, None])
def test_dataframe_isnull_isna(df, nan_as_null):
if nan_as_null is False and (
df.select_dtypes(object).isna().any().any()
and not df.select_dtypes(object).isna().all().all()
):
with pytest.raises(MixedTypeError):
cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)
else:
gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)
@pytest.mark.parametrize("api_call", ["isnull", "isna", "notna", "notnull"])
def test_dataframe_isnull_isna_and_reverse(df, nan_as_null, api_call):
def detect_nan(x):
# Check if the input is a float and if it is nan
return x.apply(lambda v: isinstance(v, float) and np.isnan(v))

assert_eq(df.isnull(), gdf.isnull())
assert_eq(df.isna(), gdf.isna())

# Test individual columns
for col in df:
assert_eq(df[col].isnull(), gdf[col].isnull())
assert_eq(df[col].isna(), gdf[col].isna())


@pytest.mark.parametrize("df", _dataframe_na_data())
@pytest.mark.parametrize("nan_as_null", [True, False, None])
def test_dataframe_notna_notnull(df, nan_as_null):
nan_contains = df.select_dtypes(object).apply(detect_nan)
if nan_as_null is False and (
df.select_dtypes(object).isna().any().any()
and not df.select_dtypes(object).isna().all().all()
nan_contains.any().any() and not nan_contains.all().all()
):
with pytest.raises(MixedTypeError):
cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)
else:
gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)

assert_eq(df.notnull(), gdf.notnull())
assert_eq(df.notna(), gdf.notna())
assert_eq(getattr(df, api_call)(), getattr(gdf, api_call)())

# Test individual columns
for col in df:
assert_eq(df[col].notnull(), gdf[col].notnull())
assert_eq(df[col].notna(), gdf[col].notna())
assert_eq(
getattr(df[col], api_call)(), getattr(gdf[col], api_call)()
)


def test_ndim():
Expand Down
9 changes: 9 additions & 0 deletions python/cudf/cudf/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2255,3 +2255,12 @@ def test_scalar_loc_row_categoricalindex():
result = df.loc["a"]
expected = df.to_pandas().loc["a"]
assert_eq(result, expected)


def test_loc_setitem_empty_dataframe():
    """``.loc`` assignment of a new column on a column-less frame matches pandas."""
    row_labels = ["index_1", "index_2", "index_3"]
    # A frame with an index but no columns; the cudf copy is taken
    # before either frame is mutated.
    pdf = pd.DataFrame(index=row_labels)
    gdf = cudf.from_pandas(pdf)

    target_rows = ["index_1"]
    fill_value = "A"
    pdf.loc[target_rows, "new_col"] = fill_value
    gdf.loc[target_rows, "new_col"] = fill_value

    assert_eq(pdf, gdf)
23 changes: 18 additions & 5 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,8 +774,9 @@ def test_round_nan_as_null_false(series, decimal):
@pytest.mark.parametrize("ps", _series_na_data())
@pytest.mark.parametrize("nan_as_null", [True, False, None])
def test_series_isnull_isna(ps, nan_as_null):
nan_contains = ps.apply(lambda x: isinstance(x, float) and np.isnan(x))
if nan_as_null is False and (
ps.isna().any() and not ps.isna().all() and ps.dtype == object
nan_contains.any() and not nan_contains.all() and ps.dtype == object
):
with pytest.raises(MixedTypeError):
cudf.Series.from_pandas(ps, nan_as_null=nan_as_null)
Expand All @@ -789,8 +790,9 @@ def test_series_isnull_isna(ps, nan_as_null):
@pytest.mark.parametrize("ps", _series_na_data())
@pytest.mark.parametrize("nan_as_null", [True, False, None])
def test_series_notnull_notna(ps, nan_as_null):
nan_contains = ps.apply(lambda x: isinstance(x, float) and np.isnan(x))
if nan_as_null is False and (
ps.isna().any() and not ps.isna().all() and ps.dtype == object
nan_contains.any() and not nan_contains.all() and ps.dtype == object
):
with pytest.raises(MixedTypeError):
cudf.Series.from_pandas(ps, nan_as_null=nan_as_null)
Expand Down Expand Up @@ -2356,12 +2358,23 @@ def test_multi_dim_series_error():

def test_bool_series_mixed_dtype_error():
ps = pd.Series([True, False, None])
all_bool_ps = pd.Series([True, False, True], dtype="object")
# ps now has `object` dtype, which
# isn't supported by `cudf`.
with cudf.option_context("mode.pandas_compatible", True):
with pytest.raises(TypeError):
cudf.Series(ps)
with pytest.raises(TypeError):
cudf.from_pandas(ps)
with pytest.raises(TypeError):
cudf.Series(ps, dtype=bool)
expected = cudf.Series(all_bool_ps, dtype=bool)
assert_eq(expected, all_bool_ps.astype(bool))
nan_bools_mix = pd.Series([True, False, True, np.nan], dtype="object")
gs = cudf.Series(nan_bools_mix, nan_as_null=True)
assert_eq(gs.to_pandas(nullable=True), nan_bools_mix.astype("boolean"))
with pytest.raises(TypeError):
cudf.Series(ps, nan_as_null=False)
with pytest.raises(TypeError):
cudf.from_pandas(ps, nan_as_null=False)
cudf.Series(nan_bools_mix, nan_as_null=False)


@pytest.mark.parametrize(
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ dependencies = [
"packaging",
"pandas>=2.0,<2.2.3dev0",
"ptxcompiler",
"pyarrow>=16.0.0,<17.0.0a0",
"pyarrow>=16.0.0,<16.1.0a0",
"rich",
"rmm==24.6.*",
"typing_extensions>=4.0.0",
Expand Down

0 comments on commit e2723fd

Please sign in to comment.