Merge branch 'branch-24.06' into java-sanitizer-test-fix

rapidsai · May 15, 2024 · e2723fd · e2723fd
2 parents 9e438df + fa9d028
commit e2723fd
Show file tree

Hide file tree

Showing 9 changed files with 79 additions and 50 deletions.
diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake
@@ -26,13 +26,20 @@ include_guard(GLOBAL)
 # pyarrow.
 function(find_libarrow_in_python_wheel PYARROW_VERSION)
   string(REPLACE "." ";" PYARROW_VER_COMPONENTS "${PYARROW_VERSION}")
-  list(GET PYARROW_VER_COMPONENTS 0 PYARROW_SO_VER)
-  # The soname for Arrow libraries is constructed using the major version plus "00". Note that,
-  # although it may seem like it due to Arrow almost exclusively releasing new major versions (i.e.
-  # `${MINOR_VERSION}${PATCH_VERSION}` is almost always equivalent to "00"),
-  # the soname is not generated by concatenating the major, minor, and patch versions into a single
-  # version number soname, just `${MAJOR_VERSION}00`
-  set(PYARROW_LIB "libarrow.so.${PYARROW_SO_VER}00")
+  list(GET PYARROW_VER_COMPONENTS 0 PYARROW_MAJOR_VER)
+  list(GET PYARROW_VER_COMPONENTS 1 PYARROW_MINOR_VER)
+
+  # Ensure that the major and minor versions are two digits long
+  string(LENGTH ${PYARROW_MAJOR_VER} PYARROW_MAJOR_LENGTH)
+  string(LENGTH ${PYARROW_MINOR_VER} PYARROW_MINOR_LENGTH)
+  if(${PYARROW_MAJOR_LENGTH} EQUAL 1)
+    set(PYARROW_MAJOR_VER "0${PYARROW_MAJOR_VER}")
+  endif()
+  if(${PYARROW_MINOR_LENGTH} EQUAL 1)
+    set(PYARROW_MINOR_VER "0${PYARROW_MINOR_VER}")
+  endif()
+
+  set(PYARROW_LIB "libarrow.so.${PYARROW_MAJOR_VER}${PYARROW_MINOR_VER}")
 
   string(
     APPEND

diff --git a/dependencies.yaml b/dependencies.yaml
@@ -352,8 +352,8 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          # Allow runtime version to float up to minor version
-          - pyarrow>=16.0.0,<17.0.0a0
+          # Allow runtime version to float up to patch version
+          - pyarrow>=16.0.0,<16.1.0a0
   cuda_version:
     specific:
       - output_types: conda

diff --git a/docs/cudf/source/cudf_pandas/index.rst b/docs/cudf/source/cudf_pandas/index.rst
@@ -34,10 +34,8 @@ automatically **falling back to pandas** for other operations.
 | Nothing changes, not even your `import` statements, when going from CPU to GPU.             | Combines the full flexibility of Pandas with blazing fast performance of cuDF                                       |
 +---------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------+
 
-Starting with the version 23.10.01 release ``cudf.pandas`` is
-available in Open Beta, as part of the ``cudf`` package .  See `RAPIDS
-Quick Start <https://rapids.ai/#quick-start>`_ to get up-and-running
-with ``cudf``.
+``cudf.pandas`` is now Generally Available (GA) as part of the ``cudf`` package. See `RAPIDS
+Quick Start <https://rapids.ai/#quick-start>`_ to get up and running with ``cudf``.
 
 .. toctree::
    :maxdepth: 1

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -1411,6 +1411,13 @@ def column_empty_like(
     return column_empty(row_count, dtype, masked)
 
 
+def _has_any_nan(arbitrary):
+    """Return True if iterating ``np.asarray(arbitrary)`` yields a float NaN.
+
+    Only items that are instances of ``float`` or ``np.floating`` are passed
+    to ``np.isnan``; any other item is rejected by the ``isinstance`` guard
+    and therefore never counts as NaN here.
+    """
+    return any(
+        ((isinstance(x, float) or isinstance(x, np.floating)) and np.isnan(x))
+        for x in np.asarray(arbitrary)
+    )
+
+
 def column_empty_like_same_mask(
     column: ColumnBase, dtype: Dtype
 ) -> ColumnBase:
@@ -1948,9 +1955,20 @@ def as_column(
                 raise TypeError(
                     f"Cannot convert a {inferred_dtype} of object type"
                 )
-            elif nan_as_null is False and (
-                pd.isna(arbitrary).any()
+            elif inferred_dtype == "boolean":
+                if cudf.get_option("mode.pandas_compatible"):
+                    if dtype != np.dtype("bool") or pd.isna(arbitrary).any():
+                        raise MixedTypeError(
+                            f"Cannot have mixed values with {inferred_dtype}"
+                        )
+                elif nan_as_null is False and _has_any_nan(arbitrary):
+                    raise MixedTypeError(
+                        f"Cannot have mixed values with {inferred_dtype}"
+                    )
+            elif (
+                nan_as_null is False
                 and inferred_dtype not in ("decimal", "empty")
+                and _has_any_nan(arbitrary)
             ):
                 # Decimal can hold float("nan")
                 # All np.nan is not restricted by type

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -405,12 +405,12 @@ def _setitem_tuple_arg(self, key, value):
                 value = as_column(value, length=length)
 
             new_col = cudf.Series(value, index=idx)
-            if not self._frame.empty:
+            if len(self._frame.index) != 0:
                 new_col = new_col._align_to_index(
                     self._frame.index, how="right"
                 )
 
-            if self._frame.empty:
+            if len(self._frame.index) == 0:
                 self._frame.index = (
                     idx if idx is not None else cudf.RangeIndex(len(new_col))
                 )

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
@@ -4008,44 +4008,28 @@ def test_diff(dtype, period, data_empty):
 
 @pytest.mark.parametrize("df", _dataframe_na_data())
 @pytest.mark.parametrize("nan_as_null", [True, False, None])
-def test_dataframe_isnull_isna(df, nan_as_null):
-    if nan_as_null is False and (
-        df.select_dtypes(object).isna().any().any()
-        and not df.select_dtypes(object).isna().all().all()
-    ):
-        with pytest.raises(MixedTypeError):
-            cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)
-    else:
-        gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)
+@pytest.mark.parametrize("api_call", ["isnull", "isna", "notna", "notnull"])
+def test_dataframe_isnull_isna_and_reverse(df, nan_as_null, api_call):
+    """Compare ``api_call`` results between pandas and cudf.
+
+    When ``nan_as_null`` is False and the object-dtype columns hold float NaN
+    in some but not all positions, ``from_pandas`` is expected to raise
+    ``MixedTypeError``. Otherwise the method named by ``api_call`` must give
+    equal results on the whole frame and on every individual column.
+    """
+
+    def detect_nan(x):
+        # Check if the input is a float and if it is nan
+        return x.apply(lambda v: isinstance(v, float) and np.isnan(v))
 
-        assert_eq(df.isnull(), gdf.isnull())
-        assert_eq(df.isna(), gdf.isna())
-
-        # Test individual columns
-        for col in df:
-            assert_eq(df[col].isnull(), gdf[col].isnull())
-            assert_eq(df[col].isna(), gdf[col].isna())
-
-
-@pytest.mark.parametrize("df", _dataframe_na_data())
-@pytest.mark.parametrize("nan_as_null", [True, False, None])
-def test_dataframe_notna_notnull(df, nan_as_null):
+    nan_contains = df.select_dtypes(object).apply(detect_nan)
     if nan_as_null is False and (
-        df.select_dtypes(object).isna().any().any()
-        and not df.select_dtypes(object).isna().all().all()
+        nan_contains.any().any() and not nan_contains.all().all()
     ):
         with pytest.raises(MixedTypeError):
             cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)
     else:
         gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)
 
-        assert_eq(df.notnull(), gdf.notnull())
-        assert_eq(df.notna(), gdf.notna())
+        assert_eq(getattr(df, api_call)(), getattr(gdf, api_call)())
 
         # Test individual columns
         for col in df:
-            assert_eq(df[col].notnull(), gdf[col].notnull())
-            assert_eq(df[col].notna(), gdf[col].notna())
+            assert_eq(
+                getattr(df[col], api_call)(), getattr(gdf[col], api_call)()
+            )
 
 
 def test_ndim():

diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
@@ -2255,3 +2255,12 @@ def test_scalar_loc_row_categoricalindex():
     result = df.loc["a"]
     expected = df.to_pandas().loc["a"]
     assert_eq(result, expected)
+
+
+def test_loc_setitem_empty_dataframe():
+    """``.loc`` assignment on a frame built with an index but no columns.
+
+    The same label-based assignment of a new column is applied to a pandas
+    frame and to its cudf conversion; the two results must compare equal.
+    """
+    pdf = pd.DataFrame(index=["index_1", "index_2", "index_3"])
+    gdf = cudf.from_pandas(pdf)
+    pdf.loc[["index_1"], "new_col"] = "A"
+    gdf.loc[["index_1"], "new_col"] = "A"
+
+    assert_eq(pdf, gdf)
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
@@ -774,8 +774,9 @@ def test_round_nan_as_null_false(series, decimal):
 @pytest.mark.parametrize("ps", _series_na_data())
 @pytest.mark.parametrize("nan_as_null", [True, False, None])
 def test_series_isnull_isna(ps, nan_as_null):
+    nan_contains = ps.apply(lambda x: isinstance(x, float) and np.isnan(x))
     if nan_as_null is False and (
-        ps.isna().any() and not ps.isna().all() and ps.dtype == object
+        nan_contains.any() and not nan_contains.all() and ps.dtype == object
     ):
         with pytest.raises(MixedTypeError):
             cudf.Series.from_pandas(ps, nan_as_null=nan_as_null)
@@ -789,8 +790,9 @@ def test_series_isnull_isna(ps, nan_as_null):
 @pytest.mark.parametrize("ps", _series_na_data())
 @pytest.mark.parametrize("nan_as_null", [True, False, None])
 def test_series_notnull_notna(ps, nan_as_null):
+    nan_contains = ps.apply(lambda x: isinstance(x, float) and np.isnan(x))
     if nan_as_null is False and (
-        ps.isna().any() and not ps.isna().all() and ps.dtype == object
+        nan_contains.any() and not nan_contains.all() and ps.dtype == object
     ):
         with pytest.raises(MixedTypeError):
             cudf.Series.from_pandas(ps, nan_as_null=nan_as_null)
@@ -2356,12 +2358,23 @@ def test_multi_dim_series_error():
 
 def test_bool_series_mixed_dtype_error():
+    """Conversion of object-dtype booleans that contain missing values."""
     ps = pd.Series([True, False, None])
+    all_bool_ps = pd.Series([True, False, True], dtype="object")
     # ps now has `object` dtype, which
     # isn't supported by `cudf`.
+    # With "mode.pandas_compatible" enabled, every conversion of ``ps`` below
+    # must raise TypeError, while the all-boolean object series converts
+    # when ``dtype=bool`` is given.
+    with cudf.option_context("mode.pandas_compatible", True):
+        with pytest.raises(TypeError):
+            cudf.Series(ps)
+        with pytest.raises(TypeError):
+            cudf.from_pandas(ps)
+        with pytest.raises(TypeError):
+            cudf.Series(ps, dtype=bool)
+        expected = cudf.Series(all_bool_ps, dtype=bool)
+        assert_eq(expected, all_bool_ps.astype(bool))
+    # Outside the option context: ``nan_as_null=True`` must match pandas'
+    # "boolean" result, and ``nan_as_null=False`` must raise TypeError.
+    nan_bools_mix = pd.Series([True, False, True, np.nan], dtype="object")
+    gs = cudf.Series(nan_bools_mix, nan_as_null=True)
+    assert_eq(gs.to_pandas(nullable=True), nan_bools_mix.astype("boolean"))
     with pytest.raises(TypeError):
-        cudf.Series(ps, nan_as_null=False)
-    with pytest.raises(TypeError):
-        cudf.from_pandas(ps, nan_as_null=False)
+        cudf.Series(nan_bools_mix, nan_as_null=False)
 
 
 @pytest.mark.parametrize(

diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
@@ -34,7 +34,7 @@ dependencies = [
     "packaging",
     "pandas>=2.0,<2.2.3dev0",
     "ptxcompiler",
-    "pyarrow>=16.0.0,<17.0.0a0",
+    "pyarrow>=16.0.0,<16.1.0a0",
     "rich",
     "rmm==24.6.*",
     "typing_extensions>=4.0.0",