diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 67c451fbd6e..6942ef0009d 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -108,3 +108,21 @@ jobs: sha: ${{ inputs.sha }} date: ${{ inputs.date }} package-name: dask_cudf + trigger-pandas-tests: + if: inputs.build_type == 'nightly' + needs: wheel-build-cudf + runs-on: ubuntu-latest + steps: + - name: Checkout code repo + uses: actions/checkout@v4 + with: + ref: ${{ inputs.sha }} + persist-credentials: false + - name: Trigger pandas-tests + env: + GH_TOKEN: ${{ github.token }} + run: | + gh workflow run pandas-tests.yaml \ + -f branch=${{ inputs.branch }} \ + -f sha=${{ inputs.sha }} \ + -f date=${{ inputs.date }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml new file mode 100644 index 00000000000..60544294809 --- /dev/null +++ b/.github/workflows/pandas-tests.yaml @@ -0,0 +1,27 @@ +name: Pandas Test Job + +on: + workflow_dispatch: + inputs: + branch: + required: true + type: string + date: + required: true + type: string + sha: + required: true + type: string + +jobs: + pandas-tests: + # run the Pandas unit tests + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 + with: + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" )) + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + script: ci/cudf_pandas_scripts/pandas-tests/run.sh main diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 345ccbea45b..f9d5976f1fe 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -174,7 +174,7 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: - matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" )) build_type: pull-request script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr # Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit. 
@@ -182,7 +182,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@patch-1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: node_type: cpu4 build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 65aef37697e..170f45e23fd 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -44,7 +44,6 @@ jobs: container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_cpp_memcheck.sh" static-configure: - needs: checks secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: @@ -125,14 +124,3 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} script: ci/cudf_pandas_scripts/run_tests.sh - pandas-tests: - # run the Pandas unit tests - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 - with: - matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(min_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) - build_type: nightly - branch: ${{ inputs.branch }} - date: ${{ inputs.date }} - sha: ${{ inputs.sha }} - script: ci/cudf_pandas_scripts/pandas-tests/run.sh main diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index dce92d7e613..757eaa44510 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -105,7 +105,7 @@ Instructions for a minimal build environment without conda are included below. # create the conda environment (assuming in base `cudf` directory) # note: RAPIDS currently doesn't support `channel_priority: strict`; # use `channel_priority: flexible` instead -conda env create --name cudf_dev --file conda/environments/all_cuda-118_arch-x86_64.yaml +conda env create --name cudf_dev --file conda/environments/all_cuda-122_arch-x86_64.yaml # activate the environment conda activate cudf_dev ``` diff --git a/ci/configure_cpp_static.sh b/ci/configure_cpp_static.sh index 675e0c3981f..d1f9e0d1399 100755 --- a/ci/configure_cpp_static.sh +++ b/ci/configure_cpp_static.sh @@ -3,8 +3,6 @@ set -euo pipefail -rapids-configure-conda-channels - source rapids-date-string rapids-logger "Configure static cpp build" diff --git a/ci/cudf_pandas_scripts/pandas-tests/diff.sh b/ci/cudf_pandas_scripts/pandas-tests/diff.sh index cf80f383db4..f87a3a36fcc 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/diff.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/diff.sh @@ -8,14 +8,16 @@ # Hard-coded needs to match the version deduced by rapids-upload-artifacts-dir GH_JOB_NAME="pandas-tests-diff / build" +RAPIDS_FULL_VERSION=$(<./VERSION) rapids-logger "Github job name: ${GH_JOB_NAME}" +rapids-logger "Rapids version: ${RAPIDS_FULL_VERSION}" PY_VER="39" -MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.main-results.json -PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-results.json +MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json +PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-${RAPIDS_FULL_VERSION}-results.json rapids-logger "Fetching latest available results from nightly" -aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/" --query "sort_by(Contents[?ends_with(Key, '_py${PY_VER}.main-results.json')], &LastModified)[::-1].[Key]" --output text > s3_output.txt +aws s3api list-objects-v2 --bucket rapids-downloads --prefix 
"nightly/" --query "sort_by(Contents[?ends_with(Key, '_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json')], &LastModified)[::-1].[Key]" --output text > s3_output.txt read -r COMPARE_ENV < s3_output.txt export COMPARE_ENV diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh index 1f70ca78c41..abde5e5d160 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/run.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh @@ -6,8 +6,8 @@ set -euo pipefail PANDAS_TESTS_BRANCH=${1} - -rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch" +RAPIDS_FULL_VERSION=$(<./VERSION) +rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch and rapids-version $RAPIDS_FULL_VERSION" rapids-logger "PR number: ${RAPIDS_REF_NAME:-"unknown"}" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" @@ -27,9 +27,10 @@ bash python/cudf/cudf/pandas/scripts/run-pandas-tests.sh \ --dist worksteal \ --report-log=${PANDAS_TESTS_BRANCH}.json 2>&1 +SUMMARY_FILE_NAME=${PANDAS_TESTS_BRANCH}-${RAPIDS_FULL_VERSION}-results.json # summarize the results and save them to artifacts: -python python/cudf/cudf/pandas/scripts/summarize-test-results.py --output json pandas-testing/${PANDAS_TESTS_BRANCH}.json > pandas-testing/${PANDAS_TESTS_BRANCH}-results.json +python python/cudf/cudf/pandas/scripts/summarize-test-results.py --output json pandas-testing/${PANDAS_TESTS_BRANCH}.json > pandas-testing/${SUMMARY_FILE_NAME} RAPIDS_ARTIFACTS_DIR=${RAPIDS_ARTIFACTS_DIR:-"${PWD}/artifacts"} mkdir -p "${RAPIDS_ARTIFACTS_DIR}" -mv pandas-testing/${PANDAS_TESTS_BRANCH}-results.json ${RAPIDS_ARTIFACTS_DIR}/ -rapids-upload-to-s3 ${RAPIDS_ARTIFACTS_DIR}/${PANDAS_TESTS_BRANCH}-results.json "${RAPIDS_ARTIFACTS_DIR}" +mv pandas-testing/${SUMMARY_FILE_NAME} ${RAPIDS_ARTIFACTS_DIR}/ +rapids-upload-to-s3 ${RAPIDS_ARTIFACTS_DIR}/${SUMMARY_FILE_NAME} "${RAPIDS_ARTIFACTS_DIR}" diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 82d7104b0da..e629f8b633e 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -68,7 +68,7 @@ dependencies: - pandoc - pip - pre-commit -- protobuf>=4.21,<5 +- protobuf>=3.20,<5 - ptxcompiler - pyarrow==14.0.2.* - pydata-sphinx-theme!=0.14.2 diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 0fd87e91745..f135a88cac2 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -66,7 +66,7 @@ dependencies: - pandoc - pip - pre-commit -- protobuf>=4.21,<5 +- protobuf>=3.20,<5 - pyarrow==14.0.2.* - pydata-sphinx-theme!=0.14.2 - pynvjitlink diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 7633fbb00a3..cd9237bd7cb 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -78,7 +78,7 @@ requirements: {% endif %} - cuda-version ={{ cuda_version }} run: - - {{ pin_compatible('protobuf', min_pin='x.x', max_pin='x') }} + - protobuf >=3.20,<5.0a0 - python - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.2dev0 diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index b384f6d5674..571780888c0 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -339,6 +339,11 @@ ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp) ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp) 
target_link_libraries(MULTIBYTE_SPLIT_NVBENCH PRIVATE ZLIB::ZLIB) +# ################################################################################################## +# * decimal benchmark +# --------------------------------------------------------------------------------- +ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp) + add_custom_target( run_benchmarks DEPENDS CUDF_BENCHMARKS diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index 9857aac4473..6df2cb44adc 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -324,10 +324,11 @@ struct random_value_fn()>> { distribution_fn dist; std::optional scale; - random_value_fn(distribution_params const& desc) + random_value_fn(distribution_params const& desc) : lower_bound{desc.lower_bound}, upper_bound{desc.upper_bound}, - dist{make_distribution(desc.id, desc.lower_bound, desc.upper_bound)} + dist{make_distribution(desc.id, lower_bound, upper_bound)}, + scale{desc.scale} { } diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp index 31dc2673d70..68d3dc492f5 100644 --- a/cpp/benchmarks/common/generate_input.hpp +++ b/cpp/benchmarks/common/generate_input.hpp @@ -182,9 +182,17 @@ struct distribution_params -struct distribution_params()>> {}; +struct distribution_params()>> { + distribution_id id; + typename T::rep lower_bound; + typename T::rep upper_bound; + std::optional scale; +}; /** * @brief Returns a vector of types, corresponding to the input type or a type group. @@ -226,7 +234,7 @@ class data_profile { cudf::type_id::INT32, {distribution_id::GEOMETRIC, 0, 64}, 2}; distribution_params struct_dist_desc{ {cudf::type_id::INT32, cudf::type_id::FLOAT32, cudf::type_id::STRING}, 2}; - std::map> decimal_params; + std::map> decimal_params; double bool_probability_true = 0.5; std::optional null_probability = 0.01; @@ -300,16 +308,21 @@ class data_profile { } template ()>* = nullptr> - distribution_params get_distribution_params() const + distribution_params get_distribution_params() const { using rep = typename T::rep; auto it = decimal_params.find(cudf::type_to_id()); if (it == decimal_params.end()) { auto const range = default_range(); - return distribution_params{default_distribution_id(), range.first, range.second}; + auto const scale = std::optional{}; + return distribution_params{ + default_distribution_id(), range.first, range.second, scale}; } else { auto& desc = it->second; - return {desc.id, static_cast(desc.lower_bound), static_cast(desc.upper_bound)}; + return {desc.id, + static_cast(desc.lower_bound), + static_cast(desc.upper_bound), + desc.scale}; } } @@ -359,6 +372,23 @@ class data_profile { } } + // Users should pass integral values for bounds when setting the parameters for fixed-point. + // Otherwise the call with have no effect. 
+ template , T>* = nullptr> + void set_distribution_params(Type_enum type_or_group, + distribution_id dist, + T lower_bound, + T upper_bound, + numeric::scale_type scale) + { + for (auto tid : get_type_or_group(static_cast(type_or_group))) { + decimal_params[tid] = { + dist, static_cast<__int128_t>(lower_bound), static_cast<__int128_t>(upper_bound), scale}; + } + } + template (), T>* = nullptr> void set_distribution_params(Type_enum type_or_group, distribution_id dist, diff --git a/cpp/benchmarks/decimal/convert_floating.cpp b/cpp/benchmarks/decimal/convert_floating.cpp new file mode 100644 index 00000000000..a367036c494 --- /dev/null +++ b/cpp/benchmarks/decimal/convert_floating.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include + +#include + +// This benchmark compares the cost of converting decimal <--> floating point +template +void bench_cast_decimal(nvbench::state& state, nvbench::type_list) +{ + static constexpr bool is_input_floating = std::is_floating_point_v; + static constexpr bool is_output_floating = std::is_floating_point_v; + + static constexpr bool is_double = + std::is_same_v || std::is_same_v; + static constexpr bool is_32bit = + std::is_same_v || std::is_same_v; + static constexpr bool is_128bit = std::is_same_v || + std::is_same_v; + + // Skip floating --> floating and decimal --> decimal + if constexpr (is_input_floating == is_output_floating) { + state.skip("Meaningless conversion."); + return; + } + + // Skip float <--> dec128 + if constexpr (!is_double && is_128bit) { + state.skip("Ignoring float <--> dec128."); + return; + } + + // Get settings + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const exp_mode = state.get_int64("exp_range"); + + // Exponent range: Range size is 10^6 + // These probe the edges of the float and double ranges, as well as more common values + int const exp_min_array[] = {-307, -37, -14, -3, 8, 31, 301}; + int const exp_range_size = 6; + int const exp_min = exp_min_array[exp_mode]; + int const exp_max = exp_min + exp_range_size; + + // With exp range size of 6, decimal output (generated or casted-to) has 7 digits of precision + int const extra_digits_precision = 1; + + // Exclude end range of double from float test + if (!is_double && ((exp_mode == 0) || (exp_mode == 6))) { + state.skip("Range beyond end of float tests."); + return; + } + + // The current float <--> decimal conversion algorithm is limited + static constexpr bool is_64bit = !is_32bit && !is_128bit; + if (is_32bit && (exp_mode != 3)) { + state.skip("Decimal32 conversion only works up to scale factors of 10^9."); + return; + } + if (is_64bit && ((exp_mode < 2) || (exp_mode > 4))) { + state.skip("Decimal64 conversion only works up to scale factors of 10^18."); + return; + } + if (is_128bit && ((exp_mode == 0) || (exp_mode == 6))) { + state.skip("Decimal128 conversion only works up to scale factors of 10^38."); + return; + } + + 
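Aside: the 10^9, 10^18 and 10^38 limits quoted in the skip messages above appear to track the number of full decimal digits that fit in the 32-, 64- and 128-bit integer reps backing decimal32/64/128. A rough standalone illustration follows (not part of the benchmark; count_full_digits is a hypothetical helper, not a cudf API):

```cpp
// Standalone sketch of the digit capacity behind the skip conditions above.
#include <cstdint>
#include <iostream>

// Hypothetical helper: number of complete decimal digits representable by a rep type's max value.
int count_full_digits(__int128_t max_value)
{
  int digits = 0;
  while (max_value >= 10) {
    max_value /= 10;
    ++digits;
  }
  return digits;
}

int main()
{
  auto const max128 = static_cast<__int128_t>(~__uint128_t{0} >> 1);
  std::cout << "decimal32  (int32_t rep):    " << count_full_digits(INT32_MAX) << " digits\n";  // 9
  std::cout << "decimal64  (int64_t rep):    " << count_full_digits(INT64_MAX) << " digits\n";  // 18
  std::cout << "decimal128 (__int128_t rep): " << count_full_digits(max128) << " digits\n";     // 38
}
```

Only exp_range settings whose scale factors stay inside those digit budgets are benchmarked for each decimal width; the rest are skipped as above.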
// Type IDs + auto const input_id = cudf::type_to_id(); + auto const output_id = cudf::type_to_id(); + + // Create data profile and scale + auto const [output_scale, profile] = [&]() { + if constexpr (is_input_floating) { + // Range for generated floating point values + auto get_pow10 = [](auto exp10) { + return std::pow(static_cast(10), static_cast(exp10)); + }; + InputType const floating_range_min = get_pow10(exp_min); + InputType const floating_range_max = get_pow10(exp_max); + + // With exp range size of 6, output has 7 decimal digits of precision + auto const decimal_output_scale = exp_min - extra_digits_precision; + + // Input distribution + data_profile const profile = data_profile_builder().distribution( + input_id, distribution_id::NORMAL, floating_range_min, floating_range_max); + + return std::pair{decimal_output_scale, profile}; + + } else { // Generating decimals + + using decimal_rep_type = typename InputType::rep; + + // For exp range size 6 and precision 7, generates ints between 10 and 10^7, + // with scale factor of: exp_max - 7. This matches floating point generation. + int const digits_precision = exp_range_size + extra_digits_precision; + auto const decimal_input_scale = numeric::scale_type{exp_max - digits_precision}; + + // Range for generated integer values + auto get_pow10 = [](auto exp10) { + return numeric::detail::ipow(exp10); + }; + auto const decimal_range_min = get_pow10(digits_precision - exp_range_size); + auto const decimal_range_max = get_pow10(digits_precision); + + // Input distribution + data_profile const profile = data_profile_builder().distribution(input_id, + distribution_id::NORMAL, + decimal_range_min, + decimal_range_max, + decimal_input_scale); + + return std::pair{0, profile}; + } + }(); + + // Generate input data + auto const input_col = create_random_column(input_id, row_count{num_rows}, profile); + auto const input_view = input_col->view(); + + // Output type + auto const output_type = + !is_input_floating ? cudf::data_type(output_id) : cudf::data_type(output_id, output_scale); + + // Stream + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + // Run benchmark + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::cast(input_view, output_type); }); + + // Throughput statistics + state.add_element_count(num_rows); + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(num_rows); +} + +// Data types +using data_types = + nvbench::type_list; + +NVBENCH_BENCH_TYPES(bench_cast_decimal, NVBENCH_TYPE_AXES(data_types, data_types)) + .set_name("decimal_floating_conversion") + .set_type_axes_names({"InputType", "OutputType"}) + .add_int64_power_of_two_axis("num_rows", {28}) + .add_int64_axis("exp_range", nvbench::range(0, 6)); diff --git a/cpp/include/cudf/strings/detail/convert/string_to_float.cuh b/cpp/include/cudf/strings/detail/convert/string_to_float.cuh index ab934750f9e..bbf56cf1446 100644 --- a/cpp/include/cudf/strings/detail/convert/string_to_float.cuh +++ b/cpp/include/cudf/strings/detail/convert/string_to_float.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -102,6 +102,9 @@ __device__ inline double stod(string_view const& d_str) ch = *in_ptr++; if (ch < '0' || ch > '9') break; exp_ten = (exp_ten * 10) + (int)(ch - '0'); + // Prevent integer overflow in exp_ten. 100,000,000 is the largest + // power of ten that can be multiplied by 10 without overflow. + if (exp_ten >= 100'000'000) { break; } } } } diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index bc5c45d8980..9d40c657396 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -76,16 +76,16 @@ void print_tree(host_span input, tree_meta_t const& d_gpu_tree, rmm::cuda_stream_view stream) { - print_vec(cudf::detail::make_std_vector_async(d_gpu_tree.node_categories, stream), + print_vec(cudf::detail::make_std_vector_sync(d_gpu_tree.node_categories, stream), "node_categories", to_cat); - print_vec(cudf::detail::make_std_vector_async(d_gpu_tree.parent_node_ids, stream), + print_vec(cudf::detail::make_std_vector_sync(d_gpu_tree.parent_node_ids, stream), "parent_node_ids", to_int); print_vec( - cudf::detail::make_std_vector_async(d_gpu_tree.node_levels, stream), "node_levels", to_int); - auto node_range_begin = cudf::detail::make_std_vector_async(d_gpu_tree.node_range_begin, stream); - auto node_range_end = cudf::detail::make_std_vector_async(d_gpu_tree.node_range_end, stream); + cudf::detail::make_std_vector_sync(d_gpu_tree.node_levels, stream), "node_levels", to_int); + auto node_range_begin = cudf::detail::make_std_vector_sync(d_gpu_tree.node_range_begin, stream); + auto node_range_end = cudf::detail::make_std_vector_sync(d_gpu_tree.node_range_end, stream); print_vec(node_range_begin, "node_range_begin", to_int); print_vec(node_range_end, "node_range_end", to_int); for (int i = 0; i < int(node_range_begin.size()); i++) { @@ -333,10 +333,11 @@ rmm::device_uvector get_values_column_indices(TreeDepthT const row_a * @param stream CUDA stream * @return Vector of strings */ -std::vector copy_strings_to_host(device_span input, - device_span node_range_begin, - device_span node_range_end, - rmm::cuda_stream_view stream) +std::vector copy_strings_to_host_sync( + device_span input, + device_span node_range_begin, + device_span node_range_end, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); auto const num_strings = node_range_begin.size(); @@ -371,12 +372,13 @@ std::vector copy_strings_to_host(device_span input, auto to_host = [stream](auto const& col) { if (col.is_empty()) return std::vector{}; auto const scv = cudf::strings_column_view(col); - auto const h_chars = cudf::detail::make_std_vector_sync( + auto const h_chars = cudf::detail::make_std_vector_async( cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); - auto const h_offsets = cudf::detail::make_std_vector_sync( + auto const h_offsets = cudf::detail::make_std_vector_async( cudf::device_span(scv.offsets().data() + scv.offset(), scv.size() + 1), stream); + stream.synchronize(); // build std::string vector from chars and offsets std::vector host_data; @@ -528,8 +530,9 @@ void make_device_json_column(device_span input, auto column_range_beg = cudf::detail::make_std_vector_async(d_column_tree.node_range_begin, stream); auto max_row_offsets = cudf::detail::make_std_vector_async(d_max_row_offsets, stream); - std::vector column_names = copy_strings_to_host( + std::vector column_names = copy_strings_to_host_sync( input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); + stream.synchronize(); // array of arrays column names if 
(is_array_of_arrays) { TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2; @@ -537,6 +540,7 @@ void make_device_json_column(device_span input, get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); auto h_values_column_indices = cudf::detail::make_std_vector_async(values_column_indices, stream); + stream.synchronize(); std::transform(unique_col_ids.begin(), unique_col_ids.end(), column_names.begin(), @@ -609,7 +613,7 @@ void make_device_json_column(device_span input, std::vector is_str_column_all_nulls{}; if (is_enabled_mixed_types_as_string) { - is_str_column_all_nulls = cudf::detail::make_std_vector_async( + is_str_column_all_nulls = cudf::detail::make_std_vector_sync( is_all_nulls_each_column(input, d_column_tree, tree, col_ids, options, stream), stream); } diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 5a8d96975ce..fd8d4f8bd7f 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1396,16 +1396,13 @@ void init_encoder_pages(hostdevice_2dvector& chunks, } /** - * @brief Encode a batch of pages. + * @brief Encode pages. * * @throws rmm::bad_alloc if there is insufficient space for temporary buffers * * @param chunks column chunk array * @param pages encoder pages array - * @param pages_in_batch number of pages in this batch - * @param first_page_in_batch first page in batch - * @param rowgroups_in_batch number of rowgroups in this batch - * @param first_rowgroup first rowgroup in batch + * @param num_rowgroups number of rowgroups * @param page_stats optional page-level statistics (nullptr if none) * @param chunk_stats optional chunk-level statistics (nullptr if none) * @param column_stats optional page-level statistics for column index (nullptr if none) @@ -1417,10 +1414,6 @@ void init_encoder_pages(hostdevice_2dvector& chunks, */ void encode_pages(hostdevice_2dvector& chunks, device_span pages, - uint32_t pages_in_batch, - uint32_t first_page_in_batch, - uint32_t rowgroups_in_batch, - uint32_t first_rowgroup, statistics_chunk const* page_stats, statistics_chunk const* chunk_stats, statistics_chunk const* column_stats, @@ -1430,14 +1423,12 @@ void encode_pages(hostdevice_2dvector& chunks, bool write_v2_headers, rmm::cuda_stream_view stream) { - auto batch_pages = pages.subspan(first_page_in_batch, pages_in_batch); + auto const num_pages = pages.size(); + auto pages_stats = (page_stats != nullptr) + ? device_span(page_stats, num_pages) + : device_span(); - auto batch_pages_stats = - (page_stats != nullptr) - ? device_span(page_stats + first_page_in_batch, pages_in_batch) - : device_span(); - - uint32_t max_comp_pages = (compression != Compression::UNCOMPRESSED) ? pages_in_batch : 0; + uint32_t max_comp_pages = (compression != Compression::UNCOMPRESSED) ? 
num_pages : 0; rmm::device_uvector> comp_in(max_comp_pages, stream); rmm::device_uvector> comp_out(max_comp_pages, stream); @@ -1447,7 +1438,7 @@ void encode_pages(hostdevice_2dvector& chunks, comp_res.end(), compression_result{0, compression_status::FAILURE}); - EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream); + EncodePages(pages, write_v2_headers, comp_in, comp_out, comp_res, stream); switch (compression) { case Compression::SNAPPY: if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { @@ -1480,25 +1471,23 @@ void encode_pages(hostdevice_2dvector& chunks, // TBD: Not clear if the official spec actually allows dynamically turning off compression at the // chunk-level - auto d_chunks_in_batch = chunks.device_view().subspan(first_rowgroup, rowgroups_in_batch); - DecideCompression(d_chunks_in_batch.flat_view(), stream); - EncodePageHeaders(batch_pages, comp_res, batch_pages_stats, chunk_stats, stream); - GatherPages(d_chunks_in_batch.flat_view(), pages, stream); + auto d_chunks = chunks.device_view(); + DecideCompression(d_chunks.flat_view(), stream); + EncodePageHeaders(pages, comp_res, pages_stats, chunk_stats, stream); + GatherPages(d_chunks.flat_view(), pages, stream); // By now, the var_bytes has been calculated in InitPages, and the histograms in EncodePages. // EncodeColumnIndexes can encode the histograms in the ColumnIndex, and also sum up var_bytes // and the histograms for inclusion in the chunk's SizeStats. if (column_stats != nullptr) { - EncodeColumnIndexes(d_chunks_in_batch.flat_view(), - {column_stats, pages.size()}, - column_index_truncate_length, - stream); + EncodeColumnIndexes( + d_chunks.flat_view(), {column_stats, pages.size()}, column_index_truncate_length, stream); } - auto h_chunks_in_batch = chunks.host_view().subspan(first_rowgroup, rowgroups_in_batch); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_chunks_in_batch.data(), - d_chunks_in_batch.data(), - d_chunks_in_batch.flat_view().size_bytes(), + auto h_chunks = chunks.host_view(); + CUDF_CUDA_TRY(cudaMemcpyAsync(h_chunks.data(), + d_chunks.data(), + d_chunks.flat_view().size_bytes(), cudaMemcpyDefault, stream.value())); @@ -1959,33 +1948,23 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::fill_n(std::back_inserter(rg_to_part), num_rg_in_part[p], p); } - // Batch processing is no longer supported. - // This line disables batch processing (so batch size will no longer be limited at 1GB as before). 
- // TODO: All the relevant code will be removed in the follow-up work: - // https://github.com/rapidsai/cudf/issues/13440 - auto const max_bytes_in_batch = std::numeric_limits::max(); - - // Initialize batches of rowgroups to encode (mainly to limit peak memory usage) - std::vector batch_list; - size_type num_pages = 0; - size_t max_uncomp_bfr_size = 0; - size_t max_comp_bfr_size = 0; - size_t max_chunk_bfr_size = 0; - size_type max_pages_in_batch = 0; - size_t bytes_in_batch = 0; - size_t comp_bytes_in_batch = 0; + // Initialize rowgroups to encode + size_type num_pages = 0; + size_t max_uncomp_bfr_size = 0; + size_t max_comp_bfr_size = 0; + size_t max_chunk_bfr_size = 0; + size_t column_index_bfr_size = 0; size_t def_histogram_bfr_size = 0; size_t rep_histogram_bfr_size = 0; - for (size_type r = 0, groups_in_batch = 0, pages_in_batch = 0; r <= num_rowgroups; r++) { - size_t rowgroup_size = 0; - size_t comp_rowgroup_size = 0; + size_t rowgroup_size = 0; + size_t comp_rowgroup_size = 0; + for (size_type r = 0; r <= num_rowgroups; r++) { if (r < num_rowgroups) { for (int i = 0; i < num_columns; i++) { EncColumnChunk* ck = &chunks[r][i]; ck->first_page = num_pages; num_pages += ck->num_pages; - pages_in_batch += ck->num_pages; rowgroup_size += ck->bfr_size; comp_rowgroup_size += ck->compressed_size; max_chunk_bfr_size = @@ -2007,29 +1986,17 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } } } - // TBD: We may want to also shorten the batch if we have enough pages (not just based on size) - if ((r == num_rowgroups) || - (groups_in_batch != 0 && bytes_in_batch + rowgroup_size > max_bytes_in_batch)) { - max_uncomp_bfr_size = std::max(max_uncomp_bfr_size, bytes_in_batch); - max_comp_bfr_size = std::max(max_comp_bfr_size, comp_bytes_in_batch); - max_pages_in_batch = std::max(max_pages_in_batch, pages_in_batch); - if (groups_in_batch != 0) { - batch_list.push_back(groups_in_batch); - groups_in_batch = 0; - } - bytes_in_batch = 0; - comp_bytes_in_batch = 0; - pages_in_batch = 0; + // write bfr sizes if this is the last rowgroup + if (r == num_rowgroups) { + max_uncomp_bfr_size = rowgroup_size; + max_comp_bfr_size = comp_rowgroup_size; } - bytes_in_batch += rowgroup_size; - comp_bytes_in_batch += comp_rowgroup_size; - groups_in_batch++; } // Clear compressed buffer size if compression has been turned off if (compression == Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } - // Initialize data pointers in batch + // Initialize data pointers uint32_t const num_stats_bfr = (stats_granularity != statistics_freq::STATISTICS_NONE) ? 
num_pages + num_chunks : 0; @@ -2055,10 +2022,10 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto bfr_i = static_cast(col_idx_bfr.data()); auto bfr_r = rep_level_histogram.data(); auto bfr_d = def_level_histogram.data(); - for (auto b = 0, r = 0; b < static_cast(batch_list.size()); b++) { + if (num_rowgroups != 0) { auto bfr = static_cast(uncomp_bfr.data()); auto bfr_c = static_cast(comp_bfr.data()); - for (auto j = 0; j < batch_list[b]; j++, r++) { + for (auto r = 0; r < num_rowgroups; r++) { for (auto i = 0; i < num_columns; i++) { EncColumnChunk& ck = chunks[r][i]; ck.uncompressed_bfr = bfr; @@ -2108,22 +2075,11 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::optional comp_stats; if (collect_compression_statistics) { comp_stats = writer_compression_statistics{}; } - // Encode row groups in batches - for (auto b = 0, batch_r_start = 0; b < static_cast(batch_list.size()); b++) { - // Count pages in this batch - auto const rnext = batch_r_start + batch_list[b]; - auto const first_page_in_batch = chunks[batch_r_start][0].first_page; - auto const first_page_in_next_batch = - (rnext < num_rowgroups) ? chunks[rnext][0].first_page : num_pages; - auto const pages_in_batch = first_page_in_next_batch - first_page_in_batch; - + // Encode row groups + if (num_rowgroups != 0) { encode_pages( chunks, {pages.data(), pages.size()}, - pages_in_batch, - first_page_in_batch, - batch_list[b], - batch_r_start, (stats_granularity == statistics_freq::STATISTICS_PAGE) ? page_stats.data() : nullptr, (stats_granularity != statistics_freq::STATISTICS_NONE) ? page_stats.data() + num_pages : nullptr, @@ -2152,7 +2108,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } } - for (int r = batch_r_start; r < rnext; r++) { + for (int r = 0; r < num_rowgroups; r++) { int p = rg_to_part[r]; int global_r = global_rowgroup_base[p] + r - first_rg_in_part[p]; auto& row_group = agg_meta->file(p).row_groups[global_r]; @@ -2192,7 +2148,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto h_def_ptr = h_def_histogram.data(); auto h_rep_ptr = h_rep_histogram.data(); - for (int r = batch_r_start; r < rnext; r++) { + for (int r = 0; r < num_rowgroups; r++) { int const p = rg_to_part[r]; int const global_r = global_rowgroup_base[p] + r - first_rg_in_part[p]; auto& row_group = agg_meta->file(p).row_groups[global_r]; @@ -2239,8 +2195,6 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } } } - - batch_r_start = rnext; } auto bounce_buffer = @@ -2251,7 +2205,6 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::move(chunks), std::move(global_rowgroup_base), std::move(first_rg_in_part), - std::move(batch_list), std::move(rg_to_part), std::move(comp_stats), std::move(uncomp_bfr), @@ -2358,7 +2311,6 @@ void writer::impl::write(table_view const& input, std::vector co chunks, global_rowgroup_base, first_rg_in_part, - batch_list, rg_to_part, comp_stats, uncomp_bfr, // unused, but contains data for later write to sink @@ -2402,7 +2354,6 @@ void writer::impl::write(table_view const& input, std::vector co chunks, global_rowgroup_base, first_rg_in_part, - batch_list, rg_to_part, bounce_buffer); @@ -2417,18 +2368,17 @@ void writer::impl::write_parquet_data_to_sink( host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, - host_span batch_list, host_span rg_to_part, host_span bounce_buffer) { - _agg_meta = std::move(updated_agg_meta); - auto const num_columns = 
chunks.size().second; + _agg_meta = std::move(updated_agg_meta); + auto const num_rowgroups = chunks.size().first; + auto const num_columns = chunks.size().second; - for (auto b = 0, r = 0; b < static_cast(batch_list.size()); b++) { - auto const rnext = r + batch_list[b]; + if (num_rowgroups != 0) { std::vector> write_tasks; - for (; r < rnext; r++) { + for (auto r = 0; r < static_cast(num_rowgroups); r++) { int const p = rg_to_part[r]; int const global_r = global_rowgroup_base[p] + r - first_rg_in_part[p]; auto& row_group = _agg_meta->file(p).row_groups[global_r]; @@ -2472,10 +2422,9 @@ void writer::impl::write_parquet_data_to_sink( auto const h_pages = cudf::detail::make_host_vector_sync(pages, _stream); // add column and offset indexes to metadata - for (auto b = 0, r = 0; b < static_cast(batch_list.size()); b++) { - auto const rnext = r + batch_list[b]; - auto curr_page_idx = chunks[r][0].first_page; - for (; r < rnext; r++) { + if (num_rowgroups != 0) { + auto curr_page_idx = chunks[0][0].first_page; + for (auto r = 0; r < static_cast(num_rowgroups); r++) { int const p = rg_to_part[r]; int const global_r = global_rowgroup_base[p] + r - first_rg_in_part[p]; auto const& row_group = _agg_meta->file(p).row_groups[global_r]; diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 2f6608b0ae7..3cbb7630fab 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -129,7 +129,6 @@ class writer::impl { * @param chunks Column chunks * @param global_rowgroup_base Numbers of rowgroups in each file/partition * @param first_rg_in_part The first rowgroup in each partition - * @param batch_list The batches of rowgroups to encode * @param rg_to_part A map from rowgroup to partition * @param[out] bounce_buffer Temporary host output buffer */ @@ -138,7 +137,6 @@ class writer::impl { host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, - host_span batch_list, host_span rg_to_part, host_span bounce_buffer); diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 8d8930013cf..a7fd244f8a5 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,9 @@ #include #include +#include +#include +#include namespace cudf { namespace strings { @@ -110,23 +114,22 @@ struct convert_char_fn { * * This can be used in calls to make_strings_children. */ -struct upper_lower_fn { +struct base_upper_lower_fn { convert_char_fn converter; - column_device_view d_strings; size_type* d_offsets{}; char* d_chars{}; - __device__ void operator()(size_type idx) const + base_upper_lower_fn(convert_char_fn converter) : converter(converter) {} + + __device__ inline void process_string(string_view d_str, size_type idx) const { - if (d_strings.is_null(idx)) { - if (!d_chars) d_offsets[idx] = 0; - return; - } - auto const d_str = d_strings.element(idx); - size_type bytes = 0; - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - for (auto itr = d_str.begin(); itr != d_str.end(); ++itr) { - auto const size = converter.process_character(*itr, d_buffer); + size_type bytes = 0; + char* d_buffer = d_chars ? 
d_chars + d_offsets[idx] : nullptr; + for (auto itr = d_str.data(); itr < (d_str.data() + d_str.size_bytes()); ++itr) { + if (is_utf8_continuation_char(static_cast(*itr))) continue; + char_utf8 chr = 0; + to_char_utf8(itr, chr); + auto const size = converter.process_character(chr, d_buffer); if (d_buffer) { d_buffer += size; } else { @@ -137,45 +140,116 @@ struct upper_lower_fn { } }; +struct upper_lower_fn : public base_upper_lower_fn { + column_device_view d_strings; + + upper_lower_fn(convert_char_fn converter, column_device_view const& d_strings) + : base_upper_lower_fn{converter}, d_strings{d_strings} + { + } + + __device__ void operator()(size_type idx) const + { + if (d_strings.is_null(idx)) { + if (!d_chars) { d_offsets[idx] = 0; } + return; + } + auto const d_str = d_strings.element(idx); + process_string(d_str, idx); + } +}; + +// Long strings are divided into smaller strings using this value as a guide. +// Generally strings are split into sub-blocks of bytes of this size but +// care is taken to not sub-block in the middle of a multi-byte character. +constexpr size_type LS_SUB_BLOCK_SIZE = 32; + /** - * @brief Count output bytes in warp-parallel threads + * @brief Produces sub-offsets for the chars in the given strings column + */ +struct sub_offset_fn { + char const* d_input_chars; + int64_t first_offset; + int64_t last_offset; + + __device__ int64_t operator()(int64_t idx) const + { + auto const end = d_input_chars + last_offset; + auto position = (idx + 1) * LS_SUB_BLOCK_SIZE; + auto begin = d_input_chars + first_offset + position; + while ((begin < end) && is_utf8_continuation_char(static_cast(*begin))) { + ++begin; + ++position; + } + return (begin < end) ? position + first_offset : last_offset; + } +}; + +/** + * @brief Specialized case conversion for long strings * - * This executes as one warp per string and just computes the output sizes. + * This is needed since the offset count can exceed size_type. + * Also, nulls are ignored since this purely builds the output chars. + * The d_offsets are only temporary to help address the sub-blocks. 
*/ -struct count_bytes_fn { +struct upper_lower_ls_fn : public base_upper_lower_fn { convert_char_fn converter; - column_device_view d_strings; - size_type* d_offsets; + char const* d_input_chars; + int64_t* d_input_offsets; // includes column offset + upper_lower_ls_fn(convert_char_fn converter, char const* d_input_chars, int64_t* d_input_offsets) + : base_upper_lower_fn{converter}, d_input_chars{d_input_chars}, d_input_offsets{d_input_offsets} + { + } + + // idx is row index __device__ void operator()(size_type idx) const { - auto const str_idx = idx / cudf::detail::warp_size; - auto const lane_idx = idx % cudf::detail::warp_size; - - // initialize the output for the atomicAdd - if (lane_idx == 0) { d_offsets[str_idx] = 0; } - __syncwarp(); - - if (d_strings.is_null(str_idx)) { return; } - auto const d_str = d_strings.element(str_idx); - auto const str_ptr = d_str.data(); - - size_type size = 0; - for (auto i = lane_idx; i < d_str.size_bytes(); i += cudf::detail::warp_size) { - auto const chr = str_ptr[i]; - if (is_utf8_continuation_char(chr)) { continue; } - char_utf8 u8 = 0; - to_char_utf8(str_ptr + i, u8); - size += converter.process_character(u8); - } - // this is every so slightly faster than using the cub::warp_reduce - if (size > 0) { - cuda::atomic_ref ref{*(d_offsets + str_idx)}; - ref.fetch_add(size, cuda::std::memory_order_relaxed); - } + auto const offset = d_input_offsets[idx]; + auto const d_str = string_view{d_input_chars + offset, + static_cast(d_input_offsets[idx + 1] - offset)}; + process_string(d_str, idx); } }; +/** + * @brief Count output bytes in warp-parallel threads + * + * This executes as one warp per string and just computes the output sizes. + */ +CUDF_KERNEL void count_bytes_kernel(convert_char_fn converter, + column_device_view d_strings, + size_type* d_sizes) +{ + auto idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; } + + auto const str_idx = idx / cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; + + // initialize the output for the atomicAdd + if (lane_idx == 0) { d_sizes[str_idx] = 0; } + __syncwarp(); + + if (d_strings.is_null(str_idx)) { return; } + auto const d_str = d_strings.element(str_idx); + auto const str_ptr = d_str.data(); + + size_type size = 0; + for (auto i = lane_idx; i < d_str.size_bytes(); i += cudf::detail::warp_size) { + auto const chr = str_ptr[i]; + if (is_utf8_continuation_char(chr)) { continue; } + char_utf8 u8 = 0; + to_char_utf8(str_ptr + i, u8); + size += converter.process_character(u8); + } + // this is slightly faster than using the cub::warp_reduce + if (size > 0) { + cuda::atomic_ref ref{*(d_sizes + str_idx)}; + ref.fetch_add(size, cuda::std::memory_order_relaxed); + } +} + /** * @brief Special functor for processing ASCII-only data */ @@ -208,11 +282,18 @@ std::unique_ptr convert_case(strings_column_view const& input, auto const d_cases = get_character_cases_table(); auto const d_special = get_special_case_mapping_table(); + auto const first_offset = (input.offset() == 0) ? 
0L + : cudf::strings::detail::get_offset_value( + input.offsets(), input.offset(), stream); + auto const last_offset = + cudf::strings::detail::get_offset_value(input.offsets(), input.size() + input.offset(), stream); + auto const chars_size = last_offset - first_offset; + convert_char_fn ccfn{case_flag, d_flags, d_cases, d_special}; upper_lower_fn converter{ccfn, *d_strings}; // For smaller strings, use the regular string-parallel algorithm - if ((input.chars_size(stream) / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) { + if ((chars_size / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) { auto [offsets, chars] = cudf::strings::detail::make_strings_children(converter, input.size(), stream, mr); return make_strings_column(input.size(), @@ -235,9 +316,8 @@ std::unique_ptr convert_case(strings_column_view const& input, [] __device__(auto chr) { return is_utf8_continuation_char(chr); })) > 0; if (!multi_byte_chars) { // optimization for ASCII-only case: copy the input column and inplace replace each character - auto result = std::make_unique(input.parent(), stream, mr); - auto d_chars = result->mutable_view().head(); - auto const chars_size = strings_column_view(result->view()).chars_size(stream); + auto result = std::make_unique(input.parent(), stream, mr); + auto d_chars = result->mutable_view().head(); thrust::transform( rmm::exec_policy(stream), d_chars, d_chars + chars_size, d_chars, ascii_converter_fn{ccfn}); result->set_null_count(input.null_count()); @@ -245,30 +325,46 @@ std::unique_ptr convert_case(strings_column_view const& input, } // This will use a warp-parallel algorithm to compute the output sizes for each string - // and then uses the normal string parallel functor to build the output. - auto offsets = make_numeric_column( - data_type{type_to_id()}, input.size() + 1, mask_state::UNALLOCATED, stream, mr); - auto d_offsets = offsets->mutable_view().data(); - - // first pass, compute output sizes // note: tried to use segmented-reduce approach instead here and it was consistently slower - count_bytes_fn counter{ccfn, *d_strings, d_offsets}; - auto const count_itr = thrust::make_counting_iterator(0); - thrust::for_each_n( - rmm::exec_policy(stream), count_itr, input.size() * cudf::detail::warp_size, counter); - - // convert sizes to offsets - auto const bytes = - cudf::detail::sizes_to_offsets(d_offsets, d_offsets + input.size() + 1, d_offsets, stream); - CUDF_EXPECTS(bytes <= std::numeric_limits::max(), - "Size of output exceeds the column size limit", - std::overflow_error); - - rmm::device_uvector chars(bytes, stream, mr); - // second pass, write output - converter.d_offsets = d_offsets; - converter.d_chars = chars.data(); - thrust::for_each_n(rmm::exec_policy(stream), count_itr, input.size(), converter); + auto [offsets, bytes] = [&] { + rmm::device_uvector sizes(input.size(), stream); + constexpr int block_size = 512; + cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + count_bytes_kernel<<>>( + ccfn, *d_strings, sizes.data()); + // convert sizes to offsets + return cudf::strings::detail::make_offsets_child_column(sizes.begin(), sizes.end(), stream, mr); + }(); + + // build sub-offsets + auto const input_chars = input.chars_begin(stream); + auto const sub_count = chars_size / LS_SUB_BLOCK_SIZE; + auto tmp_offsets = rmm::device_uvector(sub_count + input.size() + 1, stream); + { + rmm::device_uvector sub_offsets(sub_count, stream); + auto const count_itr = thrust::make_counting_iterator(0); + 
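Aside: this sub-offset computation (completed by the thrust::transform call that follows) splits long strings into roughly LS_SUB_BLOCK_SIZE-byte pieces, nudging each split point forward past UTF-8 continuation bytes (bytes of the form 10xxxxxx) so no multi-byte character is cut in half. A host-side sketch of the same boundary adjustment, using a hypothetical is_continuation helper in place of cudf's is_utf8_continuation_char and an arbitrary 8-byte block size:

```cpp
// Host-side sketch of the sub-block boundary adjustment used for long strings.
// is_continuation stands in for cudf's is_utf8_continuation_char.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

bool is_continuation(unsigned char byte) { return (byte & 0xC0) == 0x80; }

std::vector<std::size_t> make_sub_offsets(std::string const& chars, std::size_t block_size)
{
  std::vector<std::size_t> offsets{0};
  for (std::size_t pos = block_size; pos < chars.size(); pos += block_size) {
    auto adjusted = pos;
    // move forward until we land on the first byte of a character (or run off the end)
    while (adjusted < chars.size() &&
           is_continuation(static_cast<unsigned char>(chars[adjusted]))) {
      ++adjusted;
    }
    if (adjusted < chars.size()) { offsets.push_back(adjusted); }
  }
  offsets.push_back(chars.size());
  return offsets;
}

int main()
{
  std::string const chars = "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz";  // É/é are 2 bytes each
  for (auto off : make_sub_offsets(chars, 8)) { std::cout << off << ' '; }
  std::cout << '\n';
}
```

In the kernel path, these sub-offsets are merged with the column's row offsets (the thrust::merge call below), and upper_lower_ls_fn then case-converts each piece as its own short string.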
thrust::transform(rmm::exec_policy_nosync(stream), + count_itr, + count_itr + sub_count, + sub_offsets.data(), + sub_offset_fn{input_chars, first_offset, last_offset}); + + // merge them with input offsets + auto input_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + thrust::merge(rmm::exec_policy_nosync(stream), + input_offsets, + input_offsets + input.size() + 1, + sub_offsets.begin(), + sub_offsets.end(), + tmp_offsets.begin()); + } + + // run case conversion over the new sub-strings + auto const tmp_size = static_cast(tmp_offsets.size()) - 1; + upper_lower_ls_fn sub_conv{ccfn, input_chars, tmp_offsets.data()}; + auto chars = + std::get<1>(cudf::strings::detail::make_strings_children(sub_conv, tmp_size, stream, mr)); return make_strings_column(input.size(), std::move(offsets), diff --git a/cpp/tests/strings/case_tests.cpp b/cpp/tests/strings/case_tests.cpp index 1d82d785ae8..bb0e77a29d0 100644 --- a/cpp/tests/strings/case_tests.cpp +++ b/cpp/tests/strings/case_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -235,7 +235,7 @@ TEST_F(StringsCaseTest, LongStrings) { // average string length >= AVG_CHAR_BYTES_THRESHOLD as defined in case.cu cudf::test::strings_column_wrapper input{ - "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "abcdéfghijklmnopqrstuvwxyzABCDÉFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ", "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=-"}; @@ -256,7 +256,8 @@ TEST_F(StringsCaseTest, LongStrings) results = cudf::strings::to_upper(view); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::to_upper(cudf::strings_column_view(cudf::slice(input, {1, 3}).front())); + view = cudf::strings_column_view(cudf::slice(input, {1, 3}).front()); + results = cudf::strings::to_upper(view); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, cudf::slice(expected, {1, 3}).front()); } diff --git a/cpp/tests/strings/floats_tests.cpp b/cpp/tests/strings/floats_tests.cpp index f668c384787..9fa1a3325b4 100644 --- a/cpp/tests/strings/floats_tests.cpp +++ b/cpp/tests/strings/floats_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -25,8 +26,6 @@ #include -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; - struct StringsConvertTest : public cudf::test::BaseFixture {}; TEST_F(StringsConvertTest, IsFloat) @@ -89,7 +88,7 @@ TEST_F(StringsConvertTest, ToFloats32) h_expected.begin(), h_expected.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsConvertTest, FromFloats32) @@ -118,38 +117,41 @@ TEST_F(StringsConvertTest, FromFloats32) h_expected.end(), thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsConvertTest, ToFloats64) { // clang-format off std::vector h_strings{ - "1234", nullptr, "-876", "543.2", "-0.12", ".25", + "1234", "", "-876", "543.2", "-0.12", ".25", "-.002", "", "-0.0", "1.28e256", "NaN", "abc123", "123abc", "456e", "-1.78e+5", "-122.33644782", "12e+309", "1.7976931348623159E308", "-Inf", "-INFINITY", "1.0", "1.7976931348623157e+308", "1.7976931348623157e-307", // subnormal numbers: v--- smallest double v--- result is 0 - "4e-308", "3.3333333333e-320", "4.940656458412465441765688e-324", "1.e-324" }; + "4e-308", "3.3333333333e-320", "4.940656458412465441765688e-324", "1.e-324", + // another very small number + "9.299999257686047e-0005603333574677677" }; // clang-format on - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto validity = cudf::test::iterators::null_at(1); + cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); std::vector h_expected; std::for_each(h_strings.begin(), h_strings.end(), [&](char const* str) { - h_expected.push_back(str ? 
std::atof(str) : 0); + h_expected.push_back(std::atof(str)); }); auto strings_view = cudf::strings_column_view(strings); auto results = cudf::strings::to_floats(strings_view, cudf::data_type{cudf::type_id::FLOAT64}); cudf::test::fixed_width_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); + h_expected.begin(), h_expected.end(), validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + results = cudf::strings::is_float(strings_view); + cudf::test::fixed_width_column_wrapper is_expected( + {1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, is_expected); } TEST_F(StringsConvertTest, FromFloats64) @@ -178,7 +180,7 @@ TEST_F(StringsConvertTest, FromFloats64) h_expected.end(), thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsConvertTest, ZeroSizeStringsColumnFloat) diff --git a/dependencies.yaml b/dependencies.yaml index 5bb555df818..8cd4c798c38 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -286,7 +286,7 @@ dependencies: - output_types: conda packages: - &rmm_conda rmm==24.6.* - - &protobuf protobuf>=4.21,<5 + - &protobuf protobuf>=3.20,<5 - pip - pip: - git+https://github.com/python-streamz/streamz.git@master diff --git a/docs/cudf/source/user_guide/pandas-comparison.md b/docs/cudf/source/user_guide/pandas-comparison.md index 549d91b771a..4aaaa8a93df 100644 --- a/docs/cudf/source/user_guide/pandas-comparison.md +++ b/docs/cudf/source/user_guide/pandas-comparison.md @@ -87,9 +87,17 @@ using `.from_arrow()` or `.from_pandas()`. ## Result ordering -By default, `join` (or `merge`), `value_counts` and `groupby` operations in cuDF -do *not* guarantee output ordering. -Compare the results obtained from Pandas and cuDF below: +In Pandas, `join` (or `merge`), `value_counts` and `groupby` operations provide +certain guarantees about the order of rows in the result returned. In a Pandas +`join`, the order of join keys is (depending on the particular style of join +being performed) either preserved or sorted lexicographically by default. +`groupby` sorts the group keys, and preserves the order of rows within each +group. In some cases, disabling this option in Pandas can yield better +performance. + +By contrast, cuDF's default behavior is to return rows in a +non-deterministic order to maximize performance. Compare the results +obtained from Pandas and cuDF below: ```{code} python >>> import cupy as cp @@ -114,13 +122,16 @@ a 4 342.000000 ``` -To match Pandas behavior, you must explicitly pass `sort=True` -or enable the `mode.pandas_compatible` option when trying to -match Pandas behavior with `sort=False`: +In most cases, the rows of a DataFrame are accessed by index labels +rather than by position, so the order in which rows are returned +doesn't matter. 
However, if you require that results be returned in a +predictable (sorted) order, you can pass the `sort=True` option +explicitly or enable the `mode.pandas_compatible` option when trying +to match Pandas behavior with `sort=False`: ```{code} python ->>> df.to_pandas().groupby("a", sort=True).mean().head() - b +>>> df.groupby("a", sort=True).mean().head() + b a 0 70.000000 1 356.333333 diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 81d15cf95b4..c2b7cb7ca3d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -44,3 +44,5 @@ rapids_cython_create_modules( LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf ) link_to_pyarrow_headers(pylibcudf_interop) + +add_subdirectory(strings) diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 48c23a9dd4c..5adefa5fd93 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -17,6 +17,7 @@ from . cimport ( search, sorting, stream_compaction, + strings, types, unary, ) @@ -48,6 +49,7 @@ __all__ = [ "rolling", "search", "stream_compaction", + "strings", "sorting", "types", "unary", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 8ccb0ecc341..89f874f5fa5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -17,6 +17,7 @@ search, sorting, stream_compaction, + strings, types, unary, ) @@ -48,6 +49,7 @@ "rolling", "search", "stream_compaction", + "strings", "sorting", "types", "unary", diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt new file mode 100644 index 00000000000..3a2a9e1e7eb --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -0,0 +1,21 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources case.pyx) +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd new file mode 100644 index 00000000000..ff87549b5b5 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . 
import case diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py new file mode 100644 index 00000000000..ff87549b5b5 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . import case diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd new file mode 100644 index 00000000000..225d566fe06 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.column cimport Column + + +cpdef Column to_lower(Column input) +cpdef Column to_upper(Column input) +cpdef Column swapcase(Column input) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx new file mode 100644 index 00000000000..69910fd8c50 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.strings cimport case as cpp_case +from cudf._lib.pylibcudf.column cimport Column + + +cpdef Column to_lower(Column input): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_case.to_lower(input.view()) + + return Column.from_libcudf(move(c_result)) + +cpdef Column to_upper(Column input): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_case.to_upper(input.view()) + + return Column.from_libcudf(move(c_result)) + +cpdef Column swapcase(Column input): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_case.swapcase(input.view()) + + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/case.pyx b/python/cudf/cudf/_lib/strings/case.pyx index 09af1178946..38f242a67d6 100644 --- a/python/cudf/cudf/_lib/strings/case.pyx +++ b/python/cudf/cudf/_lib/strings/case.pyx @@ -1,48 +1,34 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf._lib.column cimport Column -from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.strings.case cimport ( - swapcase as cpp_swapcase, - to_lower as cpp_to_lower, - to_upper as cpp_to_upper, -) + +from cudf._lib.pylibcudf.strings import case @acquire_spill_lock() def to_upper(Column source_strings): - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_to_upper(source_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + case.to_upper( + source_strings.to_pylibcudf(mode='read') + ) + ) @acquire_spill_lock() def to_lower(Column source_strings): - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_to_lower(source_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + case.to_lower( + source_strings.to_pylibcudf(mode='read') + ) + ) @acquire_spill_lock() def swapcase(Column source_strings): - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_swapcase(source_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + case.swapcase( + source_strings.to_pylibcudf(mode='read') + ) + ) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index e4620ee5bc4..e3e73035046 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -614,12 +614,6 @@ def children(self) -> Tuple[NumericalColumn]: def categories(self) -> ColumnBase: return self.dtype.categories._values - @categories.setter - def categories(self, value): - self._dtype = CategoricalDtype( - categories=value, ordered=self.dtype.ordered - ) - @property def codes(self) -> NumericalColumn: if self._codes is None: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 809bdb4e6d1..01842b5f0a9 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1808,12 +1808,6 @@ def all(self, axis=0, skipna=True, **kwargs): b False dtype: bool - .. pandas-compat:: - **DataFrame.all, Series.all** - - Parameters currently not supported are `axis`, `bool_only`, - `level`. - .. pandas-compat:: **DataFrame.all, Series.all** @@ -1867,12 +1861,6 @@ def any(self, axis=0, skipna=True, **kwargs): b True dtype: bool - .. pandas-compat:: - **DataFrame.any, Series.any** - - Parameters currently not supported are `axis`, `bool_only`, - `level`. - .. pandas-compat:: **DataFrame.any, Series.any** diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 945e546af1a..dd4924676f3 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1767,13 +1767,23 @@ def transform(self, function): -------- agg """ + if not (isinstance(function, str) or callable(function)): + raise TypeError( + "Aggregation must be a named aggregation or a callable" + ) try: result = self.agg(function) except TypeError as e: raise NotImplementedError( "Currently, `transform()` supports only aggregations." 
) from e - + # If the aggregation is a scan, don't broadcast + if libgroupby._is_all_scan_aggregate([[function]]): + if len(result) != len(self.obj): + raise AssertionError( + "Unexpected result length for scan transform" + ) + return result return self._broadcast(result) def rolling(self, *args, **kwargs): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ca9d5590044..c412b7a7e47 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1530,11 +1530,6 @@ def median( >>> ser.median() 17.0 - .. pandas-compat:: - **DataFrame.median, Series.median** - - Parameters currently not supported are `level` and `numeric_only`. - .. pandas-compat:: **DataFrame.median, Series.median** diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 19dde2e51b9..829790007c9 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -103,11 +103,6 @@ def _num_columns(self): def _column(self): return self._data[self.name] - @_column.setter # type: ignore - @_cudf_nvtx_annotate - def _column(self, value): - self._data[self.name] = value - @property # type: ignore @_cudf_nvtx_annotate def values(self): # noqa: D102 diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index b2f3fd09146..5ef25a99590 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -38,25 +38,6 @@ def read_json( f"or a bool, or None. Got {type(dtype)}" ) - if engine == "cudf_experimental": - raise ValueError( - "engine='cudf_experimental' support has been removed, " - "use `engine='cudf'`" - ) - - if engine == "cudf_legacy": - # TODO: Deprecated in 23.02, please - # give some time until(more than couple of - # releases from now) `cudf_legacy` - # support can be removed completely. - warnings.warn( - "engine='cudf_legacy' is a deprecated engine." - "This will be removed in a future release." 
- "Please switch to using engine='cudf'.", - FutureWarning, - ) - if engine == "cudf_legacy" and not lines: - raise ValueError(f"{engine} engine only supports JSON Lines format") if engine == "auto": engine = "cudf" if lines else "pandas" if engine != "cudf" and keep_quotes: @@ -64,7 +45,7 @@ def read_json( "keep_quotes='True' is supported only with engine='cudf'" ) - if engine == "cudf_legacy" or engine == "cudf": + if engine == "cudf": if dtype is None: dtype = True @@ -117,7 +98,7 @@ def read_json( lines, compression, byte_range, - engine == "cudf_legacy", + False, keep_quotes, mixed_types_as_string, ) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index b7c8e92e8db..3c82d571939 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -174,7 +174,7 @@ def _DataFrame__dir__(self): "__arrow_array__": arrow_array_method, "__cuda_array_interface__": cuda_array_interface, "__iter__": custom_iter, - "dt": _AccessorAttr(DatetimeProperties), + "dt": _AccessorAttr(CombinedDatetimelikeProperties), "str": _AccessorAttr(StringMethods), "cat": _AccessorAttr(_CategoricalAccessor), "_constructor": _FastSlowAttribute("_constructor"), @@ -208,7 +208,7 @@ def Index__new__(cls, *args, **kwargs): "__array_function__": array_function_method, "__arrow_array__": arrow_array_method, "__cuda_array_interface__": cuda_array_interface, - "dt": _AccessorAttr(DatetimeProperties), + "dt": _AccessorAttr(CombinedDatetimelikeProperties), "str": _AccessorAttr(StringMethods), "cat": _AccessorAttr(_CategoricalAccessor), "__iter__": custom_iter, diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh index 2f6c4ac5b13..e21c4572e44 100755 --- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh +++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh @@ -130,10 +130,15 @@ and not test_s3_roundtrip_for_dir[partition_col0] \ and not test_s3_roundtrip_for_dir[partition_col1] \ and not test_s3_roundtrip" +TEST_THAT_CRASH_PYTEST_WORKERS="not test_bitmasks_pyarrow \ +and not test_large_string_pyarrow \ +and not test_interchange_from_corrected_buffer_dtypes \ +and not test_eof_states" + # TODO: Remove "not db" once a postgres & mysql container is set up on the CI PANDAS_CI="1" timeout 30m python -m pytest -p cudf.pandas \ -v -m "not single_cpu and not db" \ - -k "not test_to_parquet_gcs_new_file and not test_qcut_nat and not test_add and not test_ismethods and $TEST_THAT_NEED_MOTO_SERVER" \ + -k "not test_to_parquet_gcs_new_file and not test_qcut_nat and not test_add and not test_ismethods and $TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS" \ --import-mode=importlib \ ${PYTEST_IGNORES} \ "$@" || [ $? = 1 ] # Exit success if exit code was 1 (permit test failures but not other errors) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_case.py b/python/cudf/cudf/pylibcudf_tests/test_string_case.py new file mode 100644 index 00000000000..ae01d953df5 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_string_case.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc + + +@pytest.fixture(scope="module") +def string_col(): + return pa.array( + ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"] + ) + + +def test_to_upper(string_col): + plc_col = plc.interop.from_arrow(string_col) + got = plc.strings.case.to_upper(plc_col) + expected = pa.compute.utf8_upper(string_col) + assert_column_eq(got, expected) + + +def test_to_lower(string_col): + plc_col = plc.interop.from_arrow(string_col) + got = plc.strings.case.to_lower(plc_col) + expected = pa.compute.utf8_lower(string_col) + assert_column_eq(got, expected) + + +def test_swapcase(string_col): + plc_col = plc.interop.from_arrow(string_col) + got = plc.strings.case.swapcase(plc_col) + expected = pa.compute.utf8_swapcase(string_col) + assert_column_eq(got, expected) diff --git a/python/cudf/cudf/tests/groupby/test_transform.py b/python/cudf/cudf/tests/groupby/test_transform.py new file mode 100644 index 00000000000..78d7fbfd879 --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_transform.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import itertools + +import pytest + +import cudf +from cudf.testing._utils import assert_eq + + +@pytest.fixture(params=[False, True], ids=["no-null-keys", "null-keys"]) +def keys_null(request): + return request.param + + +@pytest.fixture(params=[False, True], ids=["no-null-values", "null-values"]) +def values_null(request): + return request.param + + +@pytest.fixture +def df(keys_null, values_null): + keys = ["a", "b", "a", "c", "b", "b", "c", "a"] + r = range(len(keys)) + if keys_null: + keys[::3] = itertools.repeat(None, len(r[::3])) + values = list(range(len(keys))) + if values_null: + values[1::3] = itertools.repeat(None, len(r[1::3])) + return cudf.DataFrame({"key": keys, "values": values}) + + +@pytest.mark.parametrize("agg", ["cumsum", "cumprod", "max", "sum", "prod"]) +def test_transform_broadcast(agg, df): + pf = df.to_pandas() + got = df.groupby("key").transform(agg) + expect = pf.groupby("key").transform(agg) + assert_eq(got, expect, check_dtype=False) + + +def test_transform_invalid(): + df = cudf.DataFrame({"key": [1, 1], "values": [4, 5]}) + with pytest.raises(TypeError): + df.groupby("key").transform({"values": "cumprod"}) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 40935733f34..3033a3e75e3 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -495,9 +495,6 @@ def test_json_lines_compression(tmpdir, ext, out_comp, in_comp): @pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.filterwarnings( - "ignore:engine='cudf_legacy' is a deprecated engine." -) def test_json_engine_selection(): json = "[1, 2, 3]" @@ -519,10 +516,6 @@ def test_json_engine_selection(): for col_name in df.columns: assert isinstance(col_name, int) - # should raise an exception - with pytest.raises(ValueError): - cudf.read_json(StringIO(json), lines=False, engine="cudf_legacy") - def test_json_bool_values(): buffer = "[true,1]\n[false,false]\n[true,true]" @@ -541,30 +534,6 @@ def test_json_bool_values(): np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes) -@pytest.mark.filterwarnings( - "ignore:engine='cudf_legacy' is a deprecated engine." 
-) -@pytest.mark.parametrize( - "buffer", - [ - "[1.0,]\n[null, ]", - '{"0":1.0,"1":}\n{"0":null,"1": }', - '{ "0" : 1.0 , "1" : }\n{ "0" : null , "1" : }', - '{"0":1.0}\n{"1":}', - ], -) -def test_json_null_literal(buffer): - df = cudf.read_json(StringIO(buffer), lines=True, engine="cudf_legacy") - - # first column contains a null field, type should be set to float - # second column contains only empty fields, type should be set to int8 - np.testing.assert_array_equal(df.dtypes, ["float64", "int8"]) - np.testing.assert_array_equal( - df["0"].to_numpy(na_value=np.nan), [1.0, np.nan] - ) - np.testing.assert_array_equal(df["1"].to_numpy(na_value=0), [0, 0]) - - def test_json_bad_protocol_string(): test_string = StringIO('{"field": "s3://path"}') @@ -739,14 +708,8 @@ def test_default_integer_bitwidth(default_integer_bitwidth, engine): @pytest.mark.parametrize( "engine", [ - pytest.param( - "cudf_legacy", - marks=pytest.mark.skip( - reason="cannot partially set dtypes for cudf json engine" - ), - ), - "pandas", "cudf", + "pandas", ], ) def test_default_integer_bitwidth_partial(default_integer_bitwidth, engine): diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 0a0ee4f592c..8c58f2b859e 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -543,7 +543,7 @@ function or `StringIO`). Multiple inputs may be provided as a list. If a list is specified each list entry may be of a different input type as long as each input is of a valid type and all input JSON schema(s) match. -engine : {{ 'auto', 'cudf', 'cudf_legacy', 'pandas' }}, default 'auto' +engine : {{ 'auto', 'cudf', 'pandas' }}, default 'auto' Parser engine to use. If 'auto' is passed, the engine will be automatically selected based on the other parameters. See notes below. orient : string diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 003a92988de..434383bc208 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -34,7 +34,7 @@ dependencies = [ "nvtx>=0.2.1", "packaging", "pandas>=2.0,<2.2.2dev0", - "protobuf>=4.21,<5", + "protobuf>=3.20,<5", "ptxcompiler", "pyarrow>=14.0.1,<15.0.0a0", "rich",
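As a usage sketch for the `read_json` engine selection changed above (the inline JSON strings are illustrative only), `engine='auto'` now resolves to the cuDF reader for JSON Lines input and to the pandas reader otherwise, with no `cudf_legacy` path remaining:

```python
from io import StringIO

import cudf

# JSON Lines input: engine="auto" resolves to the cuDF (GPU) reader
jsonl_df = cudf.read_json(StringIO('{"a": 1}\n{"a": 2}'), lines=True)

# Regular (non-lines) JSON input: engine="auto" resolves to the pandas reader
plain_df = cudf.read_json(StringIO('[{"a": 1}, {"a": 2}]'), lines=False)
```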