Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RELEASE] cudf v21.08.01 #8986

Merged
merged 2 commits into from
Aug 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 12 additions & 5 deletions cpp/src/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -374,11 +374,18 @@ void traverse_children::operator()<cudf::string_view>(host_span<column_view cons
size_t const total_char_count = std::accumulate(
cols.begin(), cols.end(), std::size_t{}, [stream](size_t a, auto const& b) -> size_t {
strings_column_view scv(b);
return a + (b.is_empty()
? 0
: cudf::detail::get_value<offset_type>(
scv.offsets(), scv.offset() + b.size(), stream) -
cudf::detail::get_value<offset_type>(scv.offsets(), scv.offset(), stream));
return a + (scv.is_empty() ? 0
// skip the offset retrieval when the column is unsliced. a non-zero offset() means the
// column is sliced, so both boundary offsets must be read.
: scv.offset() > 0
? cudf::detail::get_value<offset_type>(
scv.offsets(), scv.offset() + scv.size(), stream) -
cudf::detail::get_value<offset_type>(scv.offsets(), scv.offset(), stream)
// if the offset() is 0, it can still be sliced to a shorter length. in this case
// we only need to read a single offset. otherwise just return the full length
// (chars_size())
: scv.size() + 1 == scv.offsets().size()
? scv.chars_size()
: cudf::detail::get_value<offset_type>(scv.offsets(), scv.size(), stream));
});
// note: output text must include "exceeds size_type range" for python error handling
CUDF_EXPECTS(total_char_count <= static_cast<size_t>(std::numeric_limits<size_type>::max()),
Expand Down
51 changes: 48 additions & 3 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import pickle
from numbers import Number
from typing import Any, Dict, Optional, Tuple, Type, Union
from typing import Any, Dict, List, Optional, Tuple, Type, Union

import cupy
import numpy as np
Expand Down Expand Up @@ -588,13 +588,18 @@ def sum(self):

@classmethod
def _concat(cls, objs):
    """
    Concatenate a sequence of index objects into a single index.

    When every element of ``objs`` is a ``RangeIndex`` the work is
    delegated to ``_concat_range_index``. Otherwise the ``_values`` of
    all inputs are concatenated with ``concat_columns`` and wrapped with
    ``as_index``.

    Parameters
    ----------
    objs : sequence of index objects
        The indexes to concatenate, in order.

    Returns
    -------
    The concatenated index. Its ``name`` is the name shared by all
    inputs, or ``None`` when the inputs do not share exactly one name.
    """
    # Pick exactly one construction path. Concatenating the columns
    # eagerly, or rebuilding ``result`` after this branch, would discard
    # the RangeIndex produced by the fast path.
    if all(isinstance(obj, RangeIndex) for obj in objs):
        result = _concat_range_index(objs)
    else:
        data = concat_columns([o._values for o in objs])
        result = as_index(data)

    # Keep the name only when it is unambiguous across all inputs.
    names = {obj.name for obj in objs}
    if len(names) == 1:
        [name] = names
    else:
        name = None

    result.name = name
    return result

Expand Down Expand Up @@ -3043,3 +3048,43 @@ def __new__(
)

return as_index(data, copy=copy, dtype=dtype, name=name, **kwargs)


def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex:
    """
    Internal helper that concatenates RangeIndex objects.

    Empty inputs are ignored when deciding the shape of the result. If
    the remaining indexes line up as one continuous range (each one
    starts where the previous one would continue, with the same step),
    a single ``RangeIndex`` covering all of them is returned. Otherwise
    the ``_values`` of all ``indexes`` are concatenated with
    ``concat_columns`` and wrapped with ``as_index``.

    If every input is empty, ``RangeIndex(0, 0)`` is returned.
    """

    def _materialized():
        # Fallback path: concatenate the values of every input index
        # (including the empty ones) into a non-range index.
        return as_index(concat_columns([x._values for x in indexes]))

    populated = [idx for idx in indexes if len(idx)]
    if not populated:
        # Nothing but zero-length indexes: the result is an empty range.
        return RangeIndex(0, 0)

    first_start = None
    common_step = None
    expected_start = None

    for idx in populated:
        if first_start is None:
            # The first non-empty index fixes the start of the result,
            # and the step too when it has more than one element.
            first_start = idx.start
            if common_step is None and len(idx) > 1:
                common_step = idx.step
        elif common_step is None:
            # The first non-empty index held a single element, so the
            # step is inferred from the distance to this index's start.
            if idx.start == first_start:
                return _materialized()
            common_step = idx.start - first_start

        breaks_range = (len(idx) > 1 and common_step != idx.step) or (
            expected_start is not None and idx.start != expected_start
        )
        if breaks_range:
            return _materialized()

        if common_step is not None:
            # Where the next index has to begin to continue the range.
            expected_start = idx[-1] + common_step

    if expected_start is None:
        # No step was ever established; reuse the last index's stop.
        stop = populated[-1].stop
    else:
        stop = expected_start
    return RangeIndex(first_start, stop, common_step)
19 changes: 19 additions & 0 deletions python/cudf/cudf/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2316,3 +2316,22 @@ def test_get_loc_multi_string(idx, key, method):
got = gi.get_loc(key, method=method)

assert_eq(expected, got)


@pytest.mark.parametrize(
    "objs",
    [
        [pd.RangeIndex(0, 10), pd.RangeIndex(10, 20)],
        [pd.RangeIndex(10, 20), pd.RangeIndex(22, 40), pd.RangeIndex(50, 60)],
        [pd.RangeIndex(10, 20, 2), pd.RangeIndex(20, 40, 2)],
    ],
)
def test_range_index_concat(objs):
    """
    Check that ``cudf.concat`` on RangeIndex objects matches the result
    of appending the equivalent pandas RangeIndex objects one by one.
    """
    # Build the pandas reference by successive appends.
    head, *tail = objs
    expected = head
    for pd_index in tail:
        expected = expected.append(pd_index)

    gpu_indexes = [cudf.from_pandas(pd_index) for pd_index in objs]
    actual = cudf.concat(gpu_indexes)

    assert_eq(expected, actual)
2 changes: 1 addition & 1 deletion python/dask_cudf/dask_cudf/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def test_from_cudf_with_generic_idx():

ddf = dgd.from_cudf(cdf, npartitions=2)

assert isinstance(ddf.index.compute(), cudf.core.index.GenericIndex)
assert isinstance(ddf.index.compute(), cudf.RangeIndex)
dd.assert_eq(ddf.loc[1:2, ["a"]], cdf.loc[1:2, ["a"]])


Expand Down