Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-24.08' into bug/nunique
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke committed Jun 11, 2024
2 parents e2427ce + dfa79d4 commit 90ff627
Show file tree
Hide file tree
Showing 8 changed files with 66 additions and 19 deletions.
25 changes: 15 additions & 10 deletions .github/workflows/external_issue_labeler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,36 +20,41 @@ on:
types:
- opened

pull_request:
pull_request_target:
types:
- opened

env:
GITHUB_TOKEN: ${{ github.token }}

permissions:
issues: write
pull-requests: write

jobs:
Label-Issue:
runs-on: ubuntu-latest
# Only run if the issue author is not part of RAPIDS
if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.issue.author_association)}}
permissions:
issues: write
if: github.event_name == 'issues'
steps:
- name: add-external-labels
# Only run if the issue author is not part of RAPIDS
if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.issue.author_association)}}
run: |
echo ${{ github.event.issue.author_association }}
issue_url=${{ github.event.issue.html_url }}
gh issue edit ${issue_url} --add-label "External"
continue-on-error: true

Label-PR:
runs-on: ubuntu-latest
# Only run if the PR author is not part of RAPIDS
if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association)}}
permissions:
pull-requests: write
issues: write
if: github.event_name == 'pull_request_target'
steps:
- name: add-external-labels
# Only run if the PR author is not part of RAPIDS
if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association)}}
run: |
echo ${{ github.event.pull_request.author_association }}
pr_url=${{ github.event.pull_request.html_url }}
gh issue edit ${pr_url} --add-label "External"
continue-on-error: true
continue-on-error: true
2 changes: 1 addition & 1 deletion .github/workflows/pr_issue_status_automation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ jobs:

update-sprint:
# This job sets the PR and its linked issues to the current "Weekly Sprint"
uses: jarmak-nv/shared-workflows/.github/workflows/[email protected]
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
if: github.event.pull_request.state == 'open'
needs: get-project-id
with:
Expand Down
9 changes: 9 additions & 0 deletions docs/cudf/source/developer_guide/cudf_pandas.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ The "wrapped" types/classes are the Pandas and cuDF specific types that have bee
Wrapped objects and proxy objects are instances of wrapped types and proxy types, respectively.
In the snippet below `s1` and `s2` are wrapped objects and `s3` is a fast-slow proxy object.
Also note that the module `xpd` is a wrapped module and contains cuDF and Pandas modules as attributes.
To check whether an object is a proxy object, we can use `cudf.pandas.is_proxy_object`.
```python
import cudf.pandas
cudf.pandas.install()
Expand All @@ -31,6 +32,14 @@ Also note that the module `xpd` is a wrapped module and contains cuDF and Pandas
s1 = cudf.Series([1,2])
s2 = pd.Series([1,2])
s3 = xpd.Series([1,2])

from cudf.pandas import is_proxy_object

is_proxy_object(s1) # returns False

is_proxy_object(s2) # returns False

is_proxy_object(s3) # returns True
```

```{note}
Expand Down
5 changes: 3 additions & 2 deletions python/cudf/cudf/pandas/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from .fast_slow_proxy import is_proxy_object
from .magics import load_ipython_extension
from .profiler import Profiler

__all__ = ["Profiler", "load_ipython_extension", "install"]
__all__ = ["Profiler", "load_ipython_extension", "install", "is_proxy_object"]


LOADED = False
Expand Down
14 changes: 14 additions & 0 deletions python/cudf/cudf/pandas/fast_slow_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -1185,6 +1185,20 @@ def _replace_closurevars(
)


def is_proxy_object(obj: Any) -> bool:
    """Report whether ``obj`` is a proxy object.

    Parameters
    ----------
    obj : object
        Any python object.

    Returns
    -------
    bool
        ``True`` when ``_FastSlowProxyMeta`` appears in the MRO of the
        metaclass of ``type(obj)``, ``False`` otherwise.
    """
    # Inspect the metaclass (the type of the object's type), not the
    # object's own class hierarchy.
    metaclass = type(type(obj))
    return _FastSlowProxyMeta in metaclass.__mro__


NUMPY_TYPES: Set[str] = set(np.sctypeDict.values())


Expand Down
16 changes: 15 additions & 1 deletion python/cudf/cudf_pandas_tests/test_cudf_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from pytz import utc

from cudf.pandas import LOADED, Profiler
from cudf.pandas.fast_slow_proxy import _Unusable
from cudf.pandas.fast_slow_proxy import _Unusable, is_proxy_object

if not LOADED:
raise ImportError("These tests must be run with cudf.pandas loaded")
Expand Down Expand Up @@ -1488,3 +1488,17 @@ def mock_mean_none(self, *args, **kwargs):

def test_excelwriter_pathlike():
assert isinstance(pd.ExcelWriter("foo.xlsx"), os.PathLike)


def test_is_proxy_object():
    """Exercise ``is_proxy_object`` on ``xpd``/``pd`` objects and arrays."""
    proxy_series = xpd.Series([1])
    plain_series = pd.Series([1])
    plain_array = np.array([1])
    # The array obtained from the ``xpd`` Series is expected to be a proxy.
    array_from_proxy = proxy_series.to_numpy()

    assert not is_proxy_object(plain_array)
    assert is_proxy_object(array_from_proxy)
    assert is_proxy_object(proxy_series)
    assert not is_proxy_object(plain_series)
5 changes: 5 additions & 0 deletions python/dask_cudf/dask_cudf/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from io import BufferedWriter, BytesIO, IOBase

import numpy as np
import pandas as pd
from pyarrow import dataset as pa_ds, parquet as pq

from dask import dataframe as dd
Expand Down Expand Up @@ -41,6 +42,10 @@ def _create_dd_meta(cls, dataset_info, **kwargs):
meta_pd = super()._create_dd_meta(dataset_info, **kwargs)

# Convert to cudf
# (drop unsupported timezone information)
for k, v in meta_pd.dtypes.items():
if isinstance(v, pd.DatetimeTZDtype) and v.tz is not None:
meta_pd[k] = meta_pd[k].dt.tz_localize(None)
meta_cudf = cudf.from_pandas(meta_pd)

# Re-set "object" dtypes to align with pa schema
Expand Down
9 changes: 4 additions & 5 deletions python/dask_cudf/dask_cudf/io/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -610,9 +610,8 @@ def test_timezone_column(tmpdir):
}
)
pdf.to_parquet(path)

# Check that `cudf` and `dask_cudf` results match
got = dask_cudf.read_parquet(path)
# cudf.read_parquet does not support reading timezone aware types yet
assert got["time"].dtype == pd.DatetimeTZDtype("ns", "UTC")
got["time"] = got["time"].astype("datetime64[ns]")
expected = cudf.read_parquet(path)
dd.assert_eq(got, expected)
expect = cudf.read_parquet(path)
dd.assert_eq(got, expect)

0 comments on commit 90ff627

Please sign in to comment.