From 56430b46a6494cbec90b4c085085a905631be55f Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 15 Dec 2021 16:09:38 -0800 Subject: [PATCH 1/2] Use pandas `to_offset` to parse frequency string in `date_range` (#9843) Pandas uses the [following regex](https://github.com/pandas-dev/pandas/blob/8fefaa5a9a7c3f3a1c35c36c1140117dab73c9c7/pandas/_libs/tslibs/offsets.pyx#L3506-L3508) to convert freqeuncy strings into offset, which is exposed in public api `pd.tseries.frequencies.to_offset`. Currently cudf depends on a [custom regex](https://github.com/rapidsai/cudf/blob/fdd9bb00dc0ba5ac373feaa079b782029130dae3/python/cudf/cudf/core/tools/datetimes.py#L819-L821) to perform the conversion. We probably shouldn't reinvent the wheels here as it might make it harder to track changes in pandas. Authors: - Michael Wang (https://github.com/isVoid) Approvers: - Sheilah Kirui (https://github.com/skirui-source) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/9843 --- python/cudf/cudf/core/tools/datetimes.py | 41 ++++++++++++++--------- python/cudf/cudf/tests/test_datetime.py | 42 ++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 3efbd982b53..15426d0173a 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -8,6 +8,7 @@ import cupy as cp import numpy as np import pandas as pd +import pandas.tseries.offsets as pd_offset from pandas.core.tools.datetimes import _unit_map import cudf @@ -458,6 +459,17 @@ class DateOffset: "Y": "years", } + _TICK_OR_WEEK_TO_UNITS = { + pd_offset.Week: "weeks", + pd_offset.Day: "days", + pd_offset.Hour: "hours", + pd_offset.Minute: "minutes", + pd_offset.Second: "seconds", + pd_offset.Milli: "milliseconds", + pd_offset.Micro: "microseconds", + pd_offset.Nano: "nanoseconds", + } + _FREQSTR_REGEX = re.compile("([0-9]*)([a-zA-Z]+)") def __init__(self, n=1, normalize=False, **kwds): @@ -649,6 +661,13 @@ def _from_freqstr(cls: Type[_T], freqstr: str) -> _T: return cls(**{cls._CODES_TO_UNITS[freq_part]: int(numeric_part)}) + @classmethod + def _from_pandas_ticks_or_weeks( + cls: Type[_T], + tick: Union[pd.tseries.offsets.Tick, pd.tseries.offsets.Week], + ) -> _T: + return cls(**{cls._TICK_OR_WEEK_TO_UNITS[type(tick)]: tick.n}) + def _maybe_as_fast_pandas_offset(self): if ( len(self.kwds) == 1 @@ -814,23 +833,15 @@ def date_range( if isinstance(freq, DateOffset): offset = freq elif isinstance(freq, str): - # Map pandas `offset alias` into cudf DateOffset `CODE`, only - # fixed-frequency, non-anchored offset aliases are supported. - mo = re.fullmatch( - rf'(-)*(\d*)({"|".join(_offset_alias_to_code.keys())})', freq - ) - if mo is None: + offset = pd.tseries.frequencies.to_offset(freq) + if not isinstance(offset, pd.tseries.offsets.Tick) and not isinstance( + offset, pd.tseries.offsets.Week + ): raise ValueError( - f"Unrecognized or unsupported offset alias {freq}." + f"Unrecognized frequency string {freq}. cuDF does " + "not yet support month, quarter, year-anchored frequency." ) - - sign, n, offset_alias = mo.groups() - code = _offset_alias_to_code[offset_alias] - - freq = "".join([n, code]) - offset = DateOffset._from_freqstr(freq) - if sign: - offset.kwds.update({s: -i for s, i in offset.kwds.items()}) + offset = DateOffset._from_pandas_ticks_or_weeks(offset) else: raise TypeError("`freq` must be a `str` or cudf.DateOffset object.") diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 72601a3da2c..1a1b21aa3d5 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1583,6 +1583,48 @@ def test_date_range_raise_overflow(): cudf.date_range(start=start, periods=periods, freq=freq) +@pytest.mark.parametrize( + "freqstr_unsupported", + [ + "1M", + "2SM", + "3MS", + "4BM", + "5CBM", + "6SMS", + "7BMS", + "8CBMS", + "Q", + "2BQ", + "3BQS", + "10A", + "10Y", + "9BA", + "9BY", + "8AS", + "8YS", + "7BAS", + "7BYS", + "BH", + "B", + ], +) +def test_date_range_raise_unsupported(freqstr_unsupported): + s, e = "2001-01-01", "2008-01-31" + pd.date_range(start=s, end=e, freq=freqstr_unsupported) + with pytest.raises(ValueError, match="does not yet support"): + cudf.date_range(start=s, end=e, freq=freqstr_unsupported) + + # We also check that these values are unsupported when using lowercase + # characters. We exclude the value 3MS (every 3 month starts) because 3ms + # is a valid frequency for every 3 milliseconds. + if freqstr_unsupported != "3MS": + freqstr_unsupported = freqstr_unsupported.lower() + pd.date_range(start=s, end=e, freq=freqstr_unsupported) + with pytest.raises(ValueError, match="does not yet support"): + cudf.date_range(start=s, end=e, freq=freqstr_unsupported) + + ################################################################## # End of Date Range Test # ################################################################## From 52d7acc29e788e27d066e99eecda142170c9e746 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra <36027403+codereport@users.noreply.github.com> Date: Wed, 15 Dec 2021 21:50:55 -0500 Subject: [PATCH 2/2] Replace `thrust/std::get` with structure bindings (#9915) While watching @bdice and @isVoid's Better Code talk, I noticed a pair of `thrust::get`s that could be replaced by structured bindings. I did a quick search and found all the places the clean up could be done. That is what this PR does. Authors: - Conor Hoekstra (https://github.com/codereport) Approvers: - Nghia Truong (https://github.com/ttnghia) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/9915 --- cpp/include/cudf/detail/merge.cuh | 8 ++------ cpp/include/cudf/strings/detail/merge.cuh | 7 ++----- cpp/src/dictionary/detail/merge.cu | 6 ++---- cpp/src/merge/merge.cu | 7 ++----- cpp/tests/copying/sample_tests.cpp | 5 ++--- cpp/tests/interop/from_arrow_test.cpp | 9 ++++----- cpp/tests/interop/to_arrow_test.cpp | 9 ++++----- cpp/tests/io/orc_test.cpp | 8 +++----- cpp/tests/transform/mask_to_bools_test.cpp | 3 +-- 9 files changed, 22 insertions(+), 40 deletions(-) diff --git a/cpp/include/cudf/detail/merge.cuh b/cpp/include/cudf/detail/merge.cuh index f141d9b5d59..ee5cb5c265d 100644 --- a/cpp/include/cudf/detail/merge.cuh +++ b/cpp/include/cudf/detail/merge.cuh @@ -80,14 +80,10 @@ struct tagged_element_relational_comparator { __device__ weak_ordering compare(index_type lhs_tagged_index, index_type rhs_tagged_index) const noexcept { - side const l_side = thrust::get<0>(lhs_tagged_index); - side const r_side = thrust::get<0>(rhs_tagged_index); - - cudf::size_type const l_indx = thrust::get<1>(lhs_tagged_index); - cudf::size_type const r_indx = thrust::get<1>(rhs_tagged_index); + auto const [l_side, l_indx] = lhs_tagged_index; + auto const [r_side, r_indx] = rhs_tagged_index; column_device_view const* ptr_left_dview{l_side == side::LEFT ? &lhs : &rhs}; - column_device_view const* ptr_right_dview{r_side == side::LEFT ? &lhs : &rhs}; auto erl_comparator = element_relational_comparator( diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh index a132d8c7229..dba1c24be93 100644 --- a/cpp/include/cudf/strings/detail/merge.cuh +++ b/cpp/include/cudf/strings/detail/merge.cuh @@ -68,8 +68,7 @@ std::unique_ptr merge(strings_column_view const& lhs, // build offsets column auto offsets_transformer = [d_lhs, d_rhs] __device__(auto index_pair) { - auto side = thrust::get<0>(index_pair); - auto index = thrust::get<1>(index_pair); + auto const [side, index] = index_pair; if (side == side::LEFT ? d_lhs.is_null(index) : d_rhs.is_null(index)) return 0; auto d_str = side == side::LEFT ? d_lhs.element(index) : d_rhs.element(index); @@ -90,9 +89,7 @@ std::unique_ptr merge(strings_column_view const& lhs, thrust::make_counting_iterator(0), strings_count, [d_lhs, d_rhs, begin, d_offsets, d_chars] __device__(size_type idx) { - index_type index_pair = begin[idx]; - auto side = thrust::get<0>(index_pair); - auto index = thrust::get<1>(index_pair); + auto const [side, index] = begin[idx]; if (side == side::LEFT ? d_lhs.is_null(index) : d_rhs.is_null(index)) return; auto d_str = side == side::LEFT ? d_lhs.element(index) : d_rhs.element(index); diff --git a/cpp/src/dictionary/detail/merge.cu b/cpp/src/dictionary/detail/merge.cu index e972403cad3..a194f4add2e 100644 --- a/cpp/src/dictionary/detail/merge.cu +++ b/cpp/src/dictionary/detail/merge.cu @@ -53,10 +53,8 @@ std::unique_ptr merge(dictionary_column_view const& lcol, row_order.end(), output_iter, [lcol_iter, rcol_iter] __device__(auto const& index_pair) { - auto index = thrust::get<1>(index_pair); - return (thrust::get<0>(index_pair) == cudf::detail::side::LEFT - ? lcol_iter[index] - : rcol_iter[index]); + auto const [side, index] = index_pair; + return side == cudf::detail::side::LEFT ? lcol_iter[index] : rcol_iter[index]; }); // build dictionary; the validity mask is updated by the caller diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index f7e9b114f7b..ff9401022b2 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -80,9 +80,7 @@ __global__ void materialize_merged_bitmask_kernel( auto active_threads = __ballot_sync(0xffffffff, destination_row < num_destination_rows); while (destination_row < num_destination_rows) { - index_type const& merged_idx = merged_indices[destination_row]; - side const src_side = thrust::get<0>(merged_idx); - size_type const src_row = thrust::get<1>(merged_idx); + auto const [src_side, src_row] = merged_indices[destination_row]; bool const from_left{src_side == side::LEFT}; bool source_bit_is_valid{true}; if (left_have_valids && from_left) { @@ -284,8 +282,7 @@ struct column_merger { row_order_.end(), merged_view.begin(), [d_lcol, d_rcol] __device__(index_type const& index_pair) { - auto side = thrust::get<0>(index_pair); - auto index = thrust::get<1>(index_pair); + auto const [side, index] = index_pair; return side == side::LEFT ? d_lcol[index] : d_rcol[index]; }); diff --git a/cpp/tests/copying/sample_tests.cpp b/cpp/tests/copying/sample_tests.cpp index 4da1b541a65..8cb2b9ce74e 100644 --- a/cpp/tests/copying/sample_tests.cpp +++ b/cpp/tests/copying/sample_tests.cpp @@ -89,9 +89,8 @@ struct SampleBasicTest : public SampleTest, TEST_P(SampleBasicTest, CombinationOfParameters) { - cudf::size_type const table_size = 1024; - cudf::size_type const n_samples = std::get<0>(GetParam()); - cudf::sample_with_replacement multi_smpl = std::get<1>(GetParam()); + cudf::size_type const table_size = 1024; + auto const [n_samples, multi_smpl] = GetParam(); auto data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); cudf::test::fixed_width_column_wrapper col1(data, data + table_size); diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 52d5da8f6e5..946ac7fc891 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -333,11 +333,10 @@ struct FromArrowTestSlice TEST_P(FromArrowTestSlice, SliceTest) { - auto tables = get_tables(10000); - auto cudf_table_view = tables.first->view(); - auto arrow_table = tables.second; - auto start = std::get<0>(GetParam()); - auto end = std::get<1>(GetParam()); + auto tables = get_tables(10000); + auto cudf_table_view = tables.first->view(); + auto arrow_table = tables.second; + auto const [start, end] = GetParam(); auto sliced_cudf_table = cudf::slice(cudf_table_view, {start, end})[0]; auto expected_cudf_table = cudf::table{sliced_cudf_table}; diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 9ad546d3e01..98031f42a9c 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -488,11 +488,10 @@ struct ToArrowTestSlice TEST_P(ToArrowTestSlice, SliceTest) { - auto tables = get_tables(10000); - auto cudf_table_view = tables.first->view(); - auto arrow_table = tables.second; - auto start = std::get<0>(GetParam()); - auto end = std::get<1>(GetParam()); + auto tables = get_tables(10000); + auto cudf_table_view = tables.first->view(); + auto arrow_table = tables.second; + auto const [start, end] = GetParam(); auto sliced_cudf_table = cudf::slice(cudf_table_view, {start, end})[0]; auto expected_arrow_table = arrow_table->Slice(start, end - start); diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 574ce8573e9..656c72ef02f 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1137,8 +1137,7 @@ struct OrcWriterTestDecimal : public OrcWriterTest, TEST_P(OrcWriterTestDecimal, Decimal64) { - auto const num_rows = std::get<0>(GetParam()); - auto const scale = std::get<1>(GetParam()); + auto const [num_rows, scale] = GetParam(); // Using int16_t because scale causes values to overflow if they already require 32 bits auto const vals = random_values(num_rows); @@ -1241,9 +1240,8 @@ struct OrcWriterTestStripes TEST_P(OrcWriterTestStripes, StripeSize) { - constexpr auto num_rows = 1000000; - auto size_bytes = std::get<0>(GetParam()); - auto size_rows = std::get<1>(GetParam()); + constexpr auto num_rows = 1000000; + auto const [size_bytes, size_rows] = GetParam(); const auto seq_col = random_values(num_rows); const auto validity = diff --git a/cpp/tests/transform/mask_to_bools_test.cpp b/cpp/tests/transform/mask_to_bools_test.cpp index 2a759ffcfe5..02057fc3f3a 100644 --- a/cpp/tests/transform/mask_to_bools_test.cpp +++ b/cpp/tests/transform/mask_to_bools_test.cpp @@ -56,8 +56,7 @@ struct MaskToBoolsTest TEST_P(MaskToBoolsTest, LargeDataSizeTest) { auto data = std::vector(10000); - cudf::size_type const begin_bit = std::get<0>(GetParam()); - cudf::size_type const end_bit = std::get<1>(GetParam()); + auto const [begin_bit, end_bit] = GetParam(); std::transform(data.cbegin(), data.cend(), data.begin(), [](auto val) { return rand() % 2 == 0 ? true : false; });