From 9618a81cdcb66ef4c5c379887ef959ed084b7826 Mon Sep 17 00:00:00 2001
From: Dillon Cullinan <dcullinan92@gmail.com>
Date: Mon, 8 Mar 2021 11:07:15 -0500
Subject: [PATCH 1/8] FIX Remove random build directory generation for ccache
 (#7508)

This changes the root directory of the build folder for conda. Instead of generating a random build folder name, it will create a consistent build folder name at the `croot` location. This folder name is unique in CI, as every build has a unique `${WORKSPACE}` that is used.

Lots of workarounds added to properly work with Project Flash. Several `mv` commands are added to put build artifacts in a folder Project Flash expects them to be in.

Authors:
  - Dillon Cullinan (@dillon-cullinan)

Approvers:
  - AJ Schmidt (@ajschmidt8)

URL: https://github.com/rapidsai/cudf/pull/7508
---
 ci/cpu/build.sh  | 22 ++++++++++++++--------
 ci/cpu/upload.sh | 12 ++++++------
 ci/gpu/build.sh  |  7 ++++---
 3 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh
index c9c2a37a4e9..588debc40db 100755
--- a/ci/cpu/build.sh
+++ b/ci/cpu/build.sh
@@ -19,8 +19,9 @@ export CUDA_REL=${CUDA_VERSION%.*}
 export GPUCI_CONDA_RETRY_MAX=1
 export GPUCI_CONDA_RETRY_SLEEP=30
 
-# Use Ninja to build
+# Use Ninja to build, setup Conda Build Dir
 export CMAKE_GENERATOR="Ninja"
+export CONDA_BLD_DIR="${WORKSPACE}/.conda-bld"
 
 # Switch to project root; also root of repo checkout
 cd $WORKSPACE
@@ -63,29 +64,34 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then
   CONDA_CHANNEL=""
 else
   CONDA_BUILD_ARGS="--dirty --no-remove-work-dir"
-  CONDA_CHANNEL="-c $WORKSPACE/ci/artifacts/cudf/cpu/conda-bld/"
+  CONDA_CHANNEL="-c $WORKSPACE/ci/artifacts/cudf/cpu/.conda-bld/"
 fi
 
 if [ "$BUILD_LIBCUDF" == '1' ]; then
   gpuci_logger "Build conda pkg for libcudf"
-  gpuci_conda_retry build conda/recipes/libcudf $CONDA_BUILD_ARGS
+  gpuci_conda_retry build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libcudf $CONDA_BUILD_ARGS
+  mkdir -p ${CONDA_BLD_DIR}/libcudf/work
+  cp -r ${CONDA_BLD_DIR}/work/* ${CONDA_BLD_DIR}/libcudf/work
+
 
   gpuci_logger "Build conda pkg for libcudf_kafka"
-  gpuci_conda_retry build conda/recipes/libcudf_kafka $CONDA_BUILD_ARGS
+  gpuci_conda_retry build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libcudf_kafka $CONDA_BUILD_ARGS
+  mkdir -p ${CONDA_BLD_DIR}/libcudf_kafka/work
+  cp -r ${CONDA_BLD_DIR}/work/* ${CONDA_BLD_DIR}/libcudf_kafka/work
 fi
 
 if [ "$BUILD_CUDF" == '1' ]; then
   gpuci_logger "Build conda pkg for cudf"
-  gpuci_conda_retry build conda/recipes/cudf --python=$PYTHON $CONDA_BUILD_ARGS $CONDA_CHANNEL
+  gpuci_conda_retry build --croot ${CONDA_BLD_DIR} conda/recipes/cudf --python=$PYTHON $CONDA_BUILD_ARGS $CONDA_CHANNEL
 
   gpuci_logger "Build conda pkg for dask-cudf"
-  gpuci_conda_retry build conda/recipes/dask-cudf --python=$PYTHON $CONDA_BUILD_ARGS $CONDA_CHANNEL
+  gpuci_conda_retry build --croot ${CONDA_BLD_DIR} conda/recipes/dask-cudf --python=$PYTHON $CONDA_BUILD_ARGS $CONDA_CHANNEL
 
   gpuci_logger "Build conda pkg for cudf_kafka"
-  gpuci_conda_retry build conda/recipes/cudf_kafka --python=$PYTHON $CONDA_BUILD_ARGS $CONDA_CHANNEL
+  gpuci_conda_retry build --croot ${CONDA_BLD_DIR} conda/recipes/cudf_kafka --python=$PYTHON $CONDA_BUILD_ARGS $CONDA_CHANNEL
 
   gpuci_logger "Build conda pkg for custreamz"
-  gpuci_conda_retry build conda/recipes/custreamz --python=$PYTHON $CONDA_BUILD_ARGS $CONDA_CHANNEL
+  gpuci_conda_retry build --croot ${CONDA_BLD_DIR} conda/recipes/custreamz --python=$PYTHON $CONDA_BUILD_ARGS $CONDA_CHANNEL
 fi
 ################################################################################
 # UPLOAD - Conda packages
diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh
index 0465197e861..ca8ee1d75ac 100755
--- a/ci/cpu/upload.sh
+++ b/ci/cpu/upload.sh
@@ -28,12 +28,12 @@ fi
 ################################################################################
 
 gpuci_logger "Get conda file output locations"
-export LIBCUDF_FILE=`conda build conda/recipes/libcudf --output`
-export LIBCUDF_KAFKA_FILE=`conda build conda/recipes/libcudf_kafka --output`
-export CUDF_FILE=`conda build conda/recipes/cudf --python=$PYTHON --output`
-export DASK_CUDF_FILE=`conda build conda/recipes/dask-cudf --python=$PYTHON --output`
-export CUDF_KAFKA_FILE=`conda build conda/recipes/cudf_kafka --python=$PYTHON --output`
-export CUSTREAMZ_FILE=`conda build conda/recipes/custreamz --python=$PYTHON --output`
+export LIBCUDF_FILE=`conda build --no-build-id --croot ${WORKSPACE}/.conda-bld conda/recipes/libcudf --output`
+export LIBCUDF_KAFKA_FILE=`conda build --no-build-id --croot ${WORKSPACE}/.conda-bld conda/recipes/libcudf_kafka --output`
+export CUDF_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/cudf --python=$PYTHON --output`
+export DASK_CUDF_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/dask-cudf --python=$PYTHON --output`
+export CUDF_KAFKA_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/cudf_kafka --python=$PYTHON --output`
+export CUSTREAMZ_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/custreamz --python=$PYTHON --output`
 
 ################################################################################
 # UPLOAD - Conda packages
diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 917a2b4cf27..316f9c5f98a 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -24,6 +24,7 @@ cd $WORKSPACE
 
 # Determine CUDA release version
 export CUDA_REL=${CUDA_VERSION%.*}
+export CONDA_ARTIFACT_PATH=${WORKSPACE}/ci/artifacts/cudf/cpu/.conda-bld/
 
 # Parse git describe
 export GIT_DESCRIBE_TAG=`git describe --tags`
@@ -170,15 +171,15 @@ else
         ${gt} --gtest_output=xml:${WORKSPACE}/test-results/
     done
 
-    CUDF_CONDA_FILE=`find $WORKSPACE/ci/artifacts/cudf/cpu/conda-bld/ -name "libcudf-*.tar.bz2"`
+    CUDF_CONDA_FILE=`find ${CONDA_ARTIFACT_PATH} -name "libcudf-*.tar.bz2"`
     CUDF_CONDA_FILE=`basename "$CUDF_CONDA_FILE" .tar.bz2` #get filename without extension
     CUDF_CONDA_FILE=${CUDF_CONDA_FILE//-/=} #convert to conda install
-    KAFKA_CONDA_FILE=`find $WORKSPACE/ci/artifacts/cudf/cpu/conda-bld/ -name "libcudf_kafka-*.tar.bz2"`
+    KAFKA_CONDA_FILE=`find ${CONDA_ARTIFACT_PATH} -name "libcudf_kafka-*.tar.bz2"`
     KAFKA_CONDA_FILE=`basename "$KAFKA_CONDA_FILE" .tar.bz2` #get filename without extension
     KAFKA_CONDA_FILE=${KAFKA_CONDA_FILE//-/=} #convert to conda install
 
     gpuci_logger "Installing $CUDF_CONDA_FILE & $KAFKA_CONDA_FILE"
-    conda install -c $WORKSPACE/ci/artifacts/cudf/cpu/conda-bld/ "$CUDF_CONDA_FILE" "$KAFKA_CONDA_FILE"
+    conda install -c ${CONDA_ARTIFACT_PATH} "$CUDF_CONDA_FILE" "$KAFKA_CONDA_FILE"
 
     install_dask
 

From 9017f22e7b3d81e2dc5635d7bdbd5ee34c046940 Mon Sep 17 00:00:00 2001
From: Raza Jafri <razajafri@users.noreply.github.com>
Date: Mon, 8 Mar 2021 09:22:22 -0800
Subject: [PATCH 2/8] Java support for casting of nested child columns (#7417)

This PR adds a couple of very specialized methods that help us cast columns inside nested types.

Authors:
  - Raza Jafri (@razajafri)

Approvers:
  - Robert (Bobby) Evans (@revans2)
  - Jason Lowe (@jlowe)
  - MithunR (@mythrocks)

URL: https://github.com/rapidsai/cudf/pull/7417
---
 .../java/ai/rapids/cudf/ColumnVector.java     |  10 +-
 .../main/java/ai/rapids/cudf/ColumnView.java  | 144 +++++++++++++++++-
 java/src/main/native/src/ColumnVectorJni.cpp  |  91 -----------
 .../java/ai/rapids/cudf/ColumnVectorTest.java | 101 ++++++++++++
 4 files changed, 247 insertions(+), 99 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java
index 2201fb1fe74..9f414661967 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java
@@ -167,16 +167,16 @@ private static long getColumnViewFromColumn(long nativePointer) {
     }
   }
 
-
-  private static long initViewHandle(DType type, int rows, int nc, DeviceMemoryBuffer dataBuffer,
-                                     DeviceMemoryBuffer validityBuffer,
-                                     DeviceMemoryBuffer offsetBuffer, long[] childHandles) {
+  static long initViewHandle(DType type, int rows, int nc,
+                                       BaseDeviceMemoryBuffer dataBuffer,
+                                       BaseDeviceMemoryBuffer validityBuffer,
+                                       BaseDeviceMemoryBuffer offsetBuffer, long[] childHandles) {
     long cd = dataBuffer == null ? 0 : dataBuffer.address;
     long cdSize = dataBuffer == null ? 0 : dataBuffer.length;
     long od = offsetBuffer == null ? 0 : offsetBuffer.address;
     long vd = validityBuffer == null ? 0 : validityBuffer.address;
     return makeCudfColumnView(type.typeId.getNativeId(), type.getScale(), cd, cdSize,
-        od, vd, nc, rows, childHandles) ;
+        od, vd, nc, rows, childHandles);
   }
 
   static ColumnVector fromViewWithContiguousAllocation(long columnViewAddress, DeviceMemoryBuffer buffer) {
diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index 331c5b08764..099f36e65de 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -18,9 +18,8 @@
 
 package ai.rapids.cudf;
 
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Optional;
+import java.util.*;
+import java.util.stream.IntStream;
 
 import static ai.rapids.cudf.HostColumnVector.OFFSET_SIZE;
 
@@ -49,6 +48,65 @@ protected ColumnView(long address) {
     this.nullCount = ColumnView.getNativeNullCount(viewHandle);
   }
 
+  /**
+   * Create a new column view based off of data already on the device. Ref count on the buffers
+   * is not incremented and none of the underlying buffers are owned by this view. The returned
+   * ColumnView is only valid as long as the underlying buffers remain valid. If the buffers are
+   * closed before this ColumnView is closed, it will result in undefined behavior.
+   *
+   * If ownership is needed, call {@link ColumnView#copyToColumnVector}
+   *
+   * @param type           the type of the vector
+   * @param rows           the number of rows in this vector.
+   * @param nullCount      the number of nulls in the dataset.
+   * @param validityBuffer an optional validity buffer. Must be provided if nullCount != 0.
+   *                       The ownership doesn't change on this buffer
+   * @param offsetBuffer   a host buffer required for nested types including strings and string
+   *                       categories. The ownership doesn't change on this buffer
+   * @param children       an array of ColumnView children
+   */
+  public ColumnView(DType type, long rows, Optional<Long> nullCount,
+                     BaseDeviceMemoryBuffer validityBuffer,
+                     BaseDeviceMemoryBuffer offsetBuffer, ColumnView[] children) {
+    this(type, (int) rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(),
+        null, validityBuffer, offsetBuffer, children);
+    assert(type.isNestedType());
+    assert (nullCount.isPresent() && nullCount.get() <= Integer.MAX_VALUE)
+        || !nullCount.isPresent();
+  }
+
+  /**
+   * Create a new column view based off of data already on the device. Ref count on the buffers
+   * is not incremented and none of the underlying buffers are owned by this view. The returned
+   * ColumnView is only valid as long as the underlying buffers remain valid. If the buffers are
+   * closed before this ColumnView is closed, it will result in undefined behavior.
+   *
+   * If ownership is needed, call {@link ColumnView#copyToColumnVector}
+   *
+   * @param type           the type of the vector
+   * @param rows           the number of rows in this vector.
+   * @param nullCount      the number of nulls in the dataset.
+   * @param dataBuffer     a host buffer required for nested types including strings and string
+   *                       categories. The ownership doesn't change on this buffer
+   * @param validityBuffer an optional validity buffer. Must be provided if nullCount != 0.
+   *                       The ownership doesn't change on this buffer
+   */
+  public ColumnView(DType type, long rows, Optional<Long> nullCount,
+                    BaseDeviceMemoryBuffer dataBuffer,
+                    BaseDeviceMemoryBuffer validityBuffer) {
+    this(type, (int) rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(),
+        dataBuffer, validityBuffer, null, null);
+    assert (!type.isNestedType());
+    assert (nullCount.isPresent() && nullCount.get() <= Integer.MAX_VALUE)
+        || !nullCount.isPresent();
+  }
+
+  private ColumnView(DType type, long rows, int nullCount,
+                     BaseDeviceMemoryBuffer dataBuffer, BaseDeviceMemoryBuffer validityBuffer,
+                     BaseDeviceMemoryBuffer offsetBuffer, ColumnView[] children) {
+    this(ColumnVector.initViewHandle(type, (int) rows, nullCount, dataBuffer, validityBuffer,
+        offsetBuffer, Arrays.stream(children).mapToLong(c -> c.getNativeView()).toArray()));
+  }
 
   /** Creates a ColumnVector from a column view handle
    * @return a new ColumnVector
@@ -1296,6 +1354,86 @@ public ColumnVector castTo(DType type) {
     return new ColumnVector(castTo(getNativeView(), type.typeId.getNativeId(), type.getScale()));
   }
 
+  /**
+   * This method takes in a nested type and replaces its children with the given views
+   * Note: Make sure the numbers of rows in the leaf node are the same as the child replacing it
+   * otherwise the list can point to elements outside of the column values.
+   *
+   * Note: this method returns a ColumnView that won't live past the ColumnVector that it's
+   * pointing to.
+   *
+   * Ex: List<Int> list = col{{1,3}, {9,3,5}}
+   *
+   * validNewChild = col{8, 3, 9, 2, 0}
+   *
+   * list.replaceChildrenWithViews(1, validNewChild) => col{{8, 3}, {9, 2, 0}}
+   *
+   * invalidNewChild = col{3, 2}
+   * list.replaceChildrenWithViews(1, invalidNewChild) => col{{3, 2}, {invalid, invalid, invalid}}
+   *
+   * invalidNewChild = col{8, 3, 9, 2, 0, 0, 7}
+   * list.replaceChildrenWithViews(1, invalidNewChild) => col{{8, 3}, {9, 2, 0}} // undefined result
+   */
+  public ColumnView replaceChildrenWithViews(int[] indices,
+                                             ColumnView[] views) {
+    assert (type.isNestedType());
+    assert (indices.length == views.length);
+    if (type == DType.LIST) {
+      assert (indices.length == 1);
+    }
+    if (indices.length != views.length) {
+      throw new IllegalArgumentException("The indices size and children size should match");
+    }
+    Map<Integer, ColumnView> map = new HashMap<>();
+    IntStream.range(0, indices.length).forEach(index -> {
+      if (map.containsKey(indices[index])) {
+        throw new IllegalArgumentException("Duplicate mapping found for replacing child index");
+      }
+      map.put(indices[index], views[index]);
+    });
+    List<ColumnView> newChildren = new ArrayList<>(getNumChildren());
+    IntStream.range(0, getNumChildren()).forEach(i -> {
+      ColumnView view = map.remove(i);
+      if (view == null) {
+        newChildren.add(getChildColumnView(i));
+      } else {
+        newChildren.add(view);
+      }
+    });
+    if (!map.isEmpty()) {
+      throw new IllegalArgumentException("One or more invalid child indices passed to be replaced");
+    }
+    return new ColumnView(type, getRowCount(), Optional.of(getNullCount()), getValid(),
+        getOffsets(), newChildren.stream().toArray(n -> new ColumnView[n]));
+  }
+
+  /**
+   * This method takes in a list and returns a new list with the leaf node replaced with the given
+   * view. Make sure the numbers of rows in the leaf node are the same as the child replacing it
+   * otherwise the list can point to elements outside of the column values.
+   *
+   * Note: this method returns a ColumnView that won't live past the ColumnVector that it's
+   * pointing to.
+   *
+   * Ex: List<Int> list = col{{1,3}, {9,3,5}}
+   *
+   * validNewChild = col{8, 3, 9, 2, 0}
+   *
+   * list.replaceChildrenWithViews(1, validNewChild) => col{{8, 3}, {9, 2, 0}}
+   *
+   * invalidNewChild = col{3, 2}
+   * list.replaceChildrenWithViews(1, invalidNewChild) =>
+   *        col{{3, 2}, {invalid, invalid, invalid}} throws an exception
+   *
+   * invalidNewChild = col{8, 3, 9, 2, 0, 0, 7}
+   * list.replaceChildrenWithViews(1, invalidNewChild) =>
+   *       col{{8, 3}, {9, 2, 0}} throws an exception
+   */
+  public ColumnView replaceListChild(ColumnView child) {
+    assert(type == DType.LIST);
+    return replaceChildrenWithViews(new int[]{1}, new ColumnView[]{child});
+  }
+
   /**
    * Zero-copy cast between types with the same underlying representation.
    *
diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp
index 3385343c291..737abea6f13 100644
--- a/java/src/main/native/src/ColumnVectorJni.cpp
+++ b/java/src/main/native/src/ColumnVectorJni.cpp
@@ -31,7 +31,6 @@
 #include "cudf_jni_apis.hpp"
 #include "dtype_utils.hpp"
 
-
 extern "C" {
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_sequence(JNIEnv *env, jclass,
@@ -315,96 +314,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeEmptyCudfColumn(JNI
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeNumericCudfColumn(
-    JNIEnv *env, jobject j_object, jint j_type, jint j_size, jint j_mask_state) {
-
-  JNI_ARG_CHECK(env, (j_size != 0), "size is 0", 0);
-
-  try {
-    cudf::jni::auto_set_device(env);
-    cudf::type_id n_type = static_cast<cudf::type_id>(j_type);
-    cudf::data_type n_data_type(n_type);
-    cudf::size_type n_size = static_cast<cudf::size_type>(j_size);
-    cudf::mask_state n_mask_state = static_cast<cudf::mask_state>(j_mask_state);
-    std::unique_ptr<cudf::column> column(
-        cudf::make_numeric_column(n_data_type, n_size, n_mask_state));
-    return reinterpret_cast<jlong>(column.release());
-  }
-  CATCH_STD(env, 0);
-}
-
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeTimestampCudfColumn(
-    JNIEnv *env, jobject j_object, jint j_type, jint j_size, jint j_mask_state) {
-
-  JNI_NULL_CHECK(env, j_type, "type id is null", 0);
-  JNI_NULL_CHECK(env, j_size, "size is null", 0);
-
-  try {
-    cudf::jni::auto_set_device(env);
-    cudf::type_id n_type = static_cast<cudf::type_id>(j_type);
-    std::unique_ptr<cudf::data_type> n_data_type(new cudf::data_type(n_type));
-    cudf::size_type n_size = static_cast<cudf::size_type>(j_size);
-    cudf::mask_state n_mask_state = static_cast<cudf::mask_state>(j_mask_state);
-    std::unique_ptr<cudf::column> column(
-        cudf::make_timestamp_column(*n_data_type.get(), n_size, n_mask_state));
-    return reinterpret_cast<jlong>(column.release());
-  }
-  CATCH_STD(env, 0);
-}
-
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeStringCudfColumnHostSide(
-    JNIEnv *env, jobject j_object, jlong j_char_data, jlong j_offset_data, jlong j_valid_data,
-    jint j_null_count, jint size) {
-
-  JNI_ARG_CHECK(env, (size != 0), "size is 0", 0);
-  JNI_NULL_CHECK(env, j_char_data, "char data is null", 0);
-  JNI_NULL_CHECK(env, j_offset_data, "offset is null", 0);
-
-  try {
-    cudf::jni::auto_set_device(env);
-    cudf::size_type *host_offsets = reinterpret_cast<cudf::size_type *>(j_offset_data);
-    char *n_char_data = reinterpret_cast<char *>(j_char_data);
-    cudf::size_type n_data_size = host_offsets[size];
-    cudf::bitmask_type *n_validity = reinterpret_cast<cudf::bitmask_type *>(j_valid_data);
-
-    if (n_validity == nullptr) {
-      j_null_count = 0;
-    }
-
-    std::unique_ptr<cudf::column> offsets = cudf::make_numeric_column(
-        cudf::data_type{cudf::type_id::INT32}, size + 1, cudf::mask_state::UNALLOCATED);
-    auto offsets_view = offsets->mutable_view();
-    JNI_CUDA_TRY(env, 0,
-                 cudaMemcpyAsync(offsets_view.data<int32_t>(), host_offsets,
-                                 (size + 1) * sizeof(int32_t), cudaMemcpyHostToDevice));
-
-    std::unique_ptr<cudf::column> data = cudf::make_numeric_column(
-        cudf::data_type{cudf::type_id::INT8}, n_data_size, cudf::mask_state::UNALLOCATED);
-    auto data_view = data->mutable_view();
-    JNI_CUDA_TRY(env, 0,
-                 cudaMemcpyAsync(data_view.data<int8_t>(), n_char_data, n_data_size,
-                                 cudaMemcpyHostToDevice));
-
-    std::unique_ptr<cudf::column> column;
-    if (j_null_count == 0) {
-      column =
-          cudf::make_strings_column(size, std::move(offsets), std::move(data), j_null_count, {});
-    } else {
-      cudf::size_type bytes = (cudf::word_index(size) + 1) * sizeof(cudf::bitmask_type);
-      rmm::device_buffer dev_validity(bytes);
-      JNI_CUDA_TRY(env, 0,
-                   cudaMemcpyAsync(dev_validity.data(), n_validity, bytes, cudaMemcpyHostToDevice));
-
-      column = cudf::make_strings_column(size, std::move(offsets), std::move(data), j_null_count,
-                                         std::move(dev_validity));
-    }
-
-    JNI_CUDA_TRY(env, 0, cudaStreamSynchronize(0));
-    return reinterpret_cast<jlong>(column.release());
-  }
-  CATCH_STD(env, 0);
-}
-
 JNIEXPORT jint JNICALL Java_ai_rapids_cudf_ColumnVector_getNativeNullCountColumn(JNIEnv *env,
                                                                                  jobject j_object,
                                                                                  jlong handle) {
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index a3500ae86ef..75f58179382 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -3951,4 +3951,105 @@ void testMakeList() {
       assertColumnsAreEqual(expected, created);
     }
   }
+
+  @Test
+  void testReplaceLeafNodeInList() {
+    try (
+        ColumnVector c1 = ColumnVector.fromInts(1, 2);
+        ColumnVector c2 = ColumnVector.fromInts(8, 3);
+        ColumnVector c3 = ColumnVector.fromInts(9, 8);
+        ColumnVector c4 = ColumnVector.fromInts(2, 6);
+        ColumnVector expected = ColumnVector.makeList(c1, c2, c3, c4);
+        ColumnVector child1 =
+            ColumnVector.decimalFromDoubles(DType.create(DType.DTypeEnum.DECIMAL64, 3),
+                RoundingMode.HALF_UP, 770.892, 961.110);
+        ColumnVector child2 =
+            ColumnVector.decimalFromDoubles(DType.create(DType.DTypeEnum.DECIMAL64, 3),
+                RoundingMode.HALF_UP, 524.982, 479.946);
+        ColumnVector child3 =
+            ColumnVector.decimalFromDoubles(DType.create(DType.DTypeEnum.DECIMAL64, 3),
+                RoundingMode.HALF_UP, 346.997, 479.946);
+        ColumnVector child4 =
+            ColumnVector.decimalFromDoubles(DType.create(DType.DTypeEnum.DECIMAL64, 3),
+                RoundingMode.HALF_UP, 87.764, 414.239);
+        ColumnVector created = ColumnVector.makeList(child1, child2, child3, child4);
+        ColumnVector newChild = ColumnVector.fromInts(1, 8, 9, 2, 2, 3, 8, 6);
+        ColumnView replacedView = created.replaceListChild(newChild)) {
+      try (ColumnVector replaced = replacedView.copyToColumnVector()) {
+        assertColumnsAreEqual(expected, replaced);
+      }
+    }
+  }
+
+  @Test
+  void testReplaceLeafNodeInListWithIllegal() {
+    assertThrows(IllegalArgumentException.class, () -> {
+      try (ColumnVector child1 =
+               ColumnVector.decimalFromDoubles(DType.create(DType.DTypeEnum.DECIMAL64, 3),
+                   RoundingMode.HALF_UP, 770.892, 961.110);
+           ColumnVector child2 =
+               ColumnVector.decimalFromDoubles(DType.create(DType.DTypeEnum.DECIMAL64, 3),
+                   RoundingMode.HALF_UP, 524.982, 479.946);
+           ColumnVector child3 =
+               ColumnVector.decimalFromDoubles(DType.create(DType.DTypeEnum.DECIMAL64, 3),
+                   RoundingMode.HALF_UP, 346.997, 479.946);
+           ColumnVector child4 =
+               ColumnVector.decimalFromDoubles(DType.create(DType.DTypeEnum.DECIMAL64, 3),
+                   RoundingMode.HALF_UP, 87.764, 414.239);
+           ColumnVector created = ColumnVector.makeList(child1, child2, child3, child4);
+           ColumnVector newChild = ColumnVector.fromInts(0, 1, 8, 9, 2, 2, 3, 8, 6);
+           ColumnView replacedView = created.replaceListChild(newChild)) {
+      }
+    });
+  }
+
+  @Test
+  void testReplaceColumnInStruct() {
+    try (ColumnVector expected = ColumnVector.fromStructs(new StructType(false,
+            Arrays.asList(
+                new BasicType(false, DType.INT32),
+                new BasicType(false, DType.INT32),
+                new BasicType(false, DType.INT32))),
+        new HostColumnVector.StructData(1, 5, 3),
+        new HostColumnVector.StructData(4, 9, 6));
+         ColumnVector child1 = ColumnVector.fromInts(1, 4);
+         ColumnVector child2 = ColumnVector.fromInts(2, 5);
+         ColumnVector child3 = ColumnVector.fromInts(3, 6);
+         ColumnVector created = ColumnVector.makeStruct(child1, child2, child3);
+         ColumnVector replaceWith = ColumnVector.fromInts(5, 9);
+         ColumnView replacedView = created.replaceChildrenWithViews(new int[]{1},
+             new ColumnVector[]{replaceWith})) {
+      try (ColumnVector replaced = replacedView.copyToColumnVector()) {
+        assertColumnsAreEqual(expected, replaced);
+      }
+    }
+  }
+
+  @Test
+  void testReplaceIllegalIndexColumnInStruct() {
+    assertThrows(IllegalArgumentException.class, () -> {
+      try (ColumnVector child1 = ColumnVector.fromInts(1, 4);
+           ColumnVector child2 = ColumnVector.fromInts(2, 5);
+           ColumnVector child3 = ColumnVector.fromInts(3, 6);
+           ColumnVector created = ColumnVector.makeStruct(child1, child2, child3);
+           ColumnVector replaceWith = ColumnVector.fromInts(5, 9);
+           ColumnView replacedView = created.replaceChildrenWithViews(new int[]{5},
+               new ColumnVector[]{replaceWith})) {
+      }
+    });
+  }
+
+  @Test
+  void testReplaceSameIndexColumnInStruct() {
+    assertThrows(IllegalArgumentException.class, () -> {
+      try (ColumnVector child1 = ColumnVector.fromInts(1, 4);
+           ColumnVector child2 = ColumnVector.fromInts(2, 5);
+           ColumnVector child3 = ColumnVector.fromInts(3, 6);
+           ColumnVector created = ColumnVector.makeStruct(child1, child2, child3);
+           ColumnVector replaceWith = ColumnVector.fromInts(5, 9);
+           ColumnView replacedView = created.replaceChildrenWithViews(new int[]{1, 1},
+               new ColumnVector[]{replaceWith, replaceWith})) {
+      }
+    });
+  }
 }

From 3480e2e8e658ac787abffff1834756155e0087cb Mon Sep 17 00:00:00 2001
From: rwlee <rwlee@users.noreply.github.com>
Date: Mon, 8 Mar 2021 11:07:46 -0800
Subject: [PATCH 3/8] bitmask_or implementation with bitmask refactor (#7406)

Refactors the bitmask merging functionality to support any binary function, allowing for `bitwise_or` support in addition the existing `bitwise_and` support. Includes changes to the Java api and JNI to access the `bitwise_or` functionality.

Authors:
  - @rwlee

Approvers:
  - Jason Lowe (@jlowe)
  - Jake Hemstad (@jrhemstad)
  - Christopher Harris (@cwharris)

URL: https://github.com/rapidsai/cudf/pull/7406
---
 .../cudf/column/column_device_view.cuh        |   2 +-
 cpp/include/cudf/detail/null_mask.cuh         | 148 ++++++++++++++++++
 cpp/include/cudf/detail/null_mask.hpp         |  35 +++--
 cpp/include/cudf/null_mask.hpp                |  16 +-
 cpp/src/bitmask/null_mask.cu                  | 131 ++++++++--------
 cpp/src/structs/structs_column_factories.cu   |  24 +--
 cpp/tests/bitmask/bitmask_tests.cu            |  64 +++++++-
 .../main/java/ai/rapids/cudf/ColumnView.java  |   2 +-
 java/src/main/native/src/ColumnViewJni.cpp    |  11 +-
 .../java/ai/rapids/cudf/ColumnVectorTest.java |  24 ++-
 10 files changed, 357 insertions(+), 100 deletions(-)
 create mode 100644 cpp/include/cudf/detail/null_mask.cuh

diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh
index c29beb65775..b2f152180b0 100644
--- a/cpp/include/cudf/column/column_device_view.cuh
+++ b/cpp/include/cudf/column/column_device_view.cuh
@@ -856,7 +856,7 @@ namespace detail {
  * @brief Convenience function to get offset word from a bitmask
  *
  * @see copy_offset_bitmask
- * @see offset_bitmask_and
+ * @see offset_bitmask_binop
  */
 __device__ inline bitmask_type get_mask_offset_word(bitmask_type const* __restrict__ source,
                                                     size_type destination_word_index,
diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh
new file mode 100644
index 00000000000..daefa2a5ffd
--- /dev/null
+++ b/cpp/include/cudf/detail/null_mask.cuh
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/null_mask.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+using cudf::detail::device_span;
+
+namespace cudf {
+namespace detail {
+/**
+ * @brief Computes the merger of an array of bitmasks using a binary operator
+ *
+ * @param op The binary operator used to combine the bitmasks
+ * @param destination The bitmask to write result into
+ * @param source Array of source mask pointers. All masks must be of same size
+ * @param source_begin_bits Array of offsets into corresponding @p source masks.
+ *                          Must be same size as source array
+ * @param source_size_bits Number of bits in each mask in @p source
+ */
+template <typename Binop>
+__global__ void offset_bitmask_binop(Binop op,
+                                     device_span<bitmask_type> destination,
+                                     device_span<bitmask_type const *> source,
+                                     device_span<size_type const> source_begin_bits,
+                                     size_type source_size_bits)
+{
+  for (size_type destination_word_index = threadIdx.x + blockIdx.x * blockDim.x;
+       destination_word_index < destination.size();
+       destination_word_index += blockDim.x * gridDim.x) {
+    bitmask_type destination_word =
+      detail::get_mask_offset_word(source[0],
+                                   destination_word_index,
+                                   source_begin_bits[0],
+                                   source_begin_bits[0] + source_size_bits);
+    for (size_type i = 1; i < source.size(); i++) {
+      destination_word =
+
+        op(destination_word,
+           detail::get_mask_offset_word(source[i],
+                                        destination_word_index,
+                                        source_begin_bits[i],
+                                        source_begin_bits[i] + source_size_bits));
+    }
+
+    destination[destination_word_index] = destination_word;
+  }
+}
+
+/**
+ * @copydoc bitmask_binop(Binop op, host_span<bitmask_type const *> const, host_span<size_type>
+ * const, size_type, rmm::mr::device_memory_resource *)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ */
+template <typename Binop>
+rmm::device_buffer bitmask_binop(
+  Binop op,
+  host_span<bitmask_type const *> masks,
+  host_span<size_type const> masks_begin_bits,
+  size_type mask_size_bits,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
+{
+  auto dest_mask = rmm::device_buffer{bitmask_allocation_size_bytes(mask_size_bits), stream, mr};
+
+  inplace_bitmask_binop(op,
+                        device_span<bitmask_type>(static_cast<bitmask_type *>(dest_mask.data()),
+                                                  num_bitmask_words(mask_size_bits)),
+                        masks,
+                        masks_begin_bits,
+                        mask_size_bits,
+                        stream,
+                        mr);
+
+  return dest_mask;
+}
+
+/**
+ * @brief Performs a merge of the specified bitmasks using the binary operator
+ *        provided, and writes in place to destination
+ *
+ * @param op The binary operator used to combine the bitmasks
+ * @param dest_mask Destination to which the merged result is written
+ * @param masks The list of data pointers of the bitmasks to be merged
+ * @param masks_begin_bits The bit offsets from which each mask is to be merged
+ * @param mask_size_bits The number of bits to be ANDed in each mask
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned device_buffer
+ * @return rmm::device_buffer Output bitmask
+ */
+template <typename Binop>
+void inplace_bitmask_binop(
+  Binop op,
+  device_span<bitmask_type> dest_mask,
+  host_span<bitmask_type const *> masks,
+  host_span<size_type const> masks_begin_bits,
+  size_type mask_size_bits,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
+{
+  CUDF_EXPECTS(
+    std::all_of(masks_begin_bits.begin(), masks_begin_bits.end(), [](auto b) { return b >= 0; }),
+    "Invalid range.");
+  CUDF_EXPECTS(mask_size_bits > 0, "Invalid bit range.");
+  CUDF_EXPECTS(std::all_of(masks.begin(), masks.end(), [](auto p) { return p != nullptr; }),
+               "Mask pointer cannot be null");
+
+  rmm::device_uvector<bitmask_type const *> d_masks(masks.size(), stream, mr);
+  rmm::device_uvector<size_type> d_begin_bits(masks_begin_bits.size(), stream, mr);
+
+  CUDA_TRY(cudaMemcpyAsync(
+    d_masks.data(), masks.data(), masks.size_bytes(), cudaMemcpyHostToDevice, stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(d_begin_bits.data(),
+                           masks_begin_bits.data(),
+                           masks_begin_bits.size_bytes(),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
+
+  cudf::detail::grid_1d config(dest_mask.size(), 256);
+  offset_bitmask_binop<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
+    op, dest_mask, d_masks, d_begin_bits, mask_size_bits);
+  CHECK_CUDA(stream.value());
+  stream.synchronize();
+}
+
+}  // namespace detail
+
+}  // namespace cudf
diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp
index 2f2bc91cb74..b0870ef8d9a 100644
--- a/cpp/include/cudf/detail/null_mask.hpp
+++ b/cpp/include/cudf/detail/null_mask.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -88,15 +89,15 @@ rmm::device_buffer copy_bitmask(
   rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
 
 /**
- * @copydoc bitmask_and(std::vector<bitmask_type const*>, std::vector<size_type> const&, size_type,
- * rmm::mr::device_memory_resource *)
+ * @copydoc bitmask_and(host_span<bitmask_type const *> const, host_span<size_type> const,
+ * size_type, rmm::mr::device_memory_resource *)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches
  */
 rmm::device_buffer bitmask_and(
-  std::vector<bitmask_type const *> const &masks,
-  std::vector<size_type> const &begin_bits,
-  size_type mask_size,
+  host_span<bitmask_type const *> masks,
+  host_span<size_type const> masks_begin_bits,
+  size_type mask_size_bits,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
 
@@ -110,23 +111,33 @@ rmm::device_buffer bitmask_and(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @copydoc cudf::bitmask_or
+ *
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches.
+ */
+rmm::device_buffer bitmask_or(
+  table_view const &view,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Performs a bitwise AND of the specified bitmasks,
  *        and writes in place to destination
  *
  * @param dest_mask Destination to which the AND result is written
  * @param masks The list of data pointers of the bitmasks to be ANDed
- * @param begin_bits The bit offsets from which each mask is to be ANDed
- * @param mask_size The number of bits to be ANDed in each mask
+ * @param masks_begin_bits The bit offsets from which each mask is to be ANDed
+ * @param mask_size_bits The number of bits to be ANDed in each mask
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned device_buffer
  * @return rmm::device_buffer Output bitmask
  */
 void inplace_bitmask_and(
-  bitmask_type *dest_mask,
-  std::vector<bitmask_type const *> const &masks,
-  std::vector<size_type> const &begin_bits,
-  size_type mask_size,
+  device_span<bitmask_type> dest_mask,
+  host_span<bitmask_type const *> masks,
+  host_span<size_type const> masks_begin_bits,
+  size_type mask_size_bits,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
 
diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp
index 5e1f0f0802e..0d4de1a9beb 100644
--- a/cpp/include/cudf/null_mask.hpp
+++ b/cpp/include/cudf/null_mask.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -220,5 +220,19 @@ rmm::device_buffer bitmask_and(
   table_view const& view,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Returns a bitwise OR of the bitmasks of columns of a table
+ *
+ * If any of the columns isn't nullable, it is considered all valid.
+ * If no column in the table is nullable, an empty bitmask is returned.
+ *
+ * @param view The table of columns
+ * @param mr Device memory resource used to allocate the returned device_buffer
+ * @return rmm::device_buffer Output bitmask
+ */
+rmm::device_buffer bitmask_or(
+  table_view const& view,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu
index 78188b26473..4a2a7db9638 100644
--- a/cpp/src/bitmask/null_mask.cu
+++ b/cpp/src/bitmask/null_mask.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
  */
 
 #include <cudf/column/column_device_view.cuh>
+#include <cudf/detail/null_mask.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
@@ -23,10 +24,12 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
 #include <rmm/device_vector.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 
@@ -41,6 +44,8 @@
 #include <numeric>
 #include <type_traits>
 
+using cudf::detail::device_span;
+
 namespace cudf {
 size_type state_null_count(mask_state state, size_type size)
 {
@@ -316,37 +321,6 @@ __global__ void copy_offset_bitmask(bitmask_type *__restrict__ destination,
   }
 }
 
-/**
- * @brief Computes the bitwise AND of an array of bitmasks
- *
- * @param destination The bitmask to write result into
- * @param source Array of source mask pointers. All masks must be of same size
- * @param begin_bit Array of offsets into corresponding @p source masks.
- *                  Must be same size as source array
- * @param num_sources Number of masks in @p source array
- * @param source_size Number of bits in each mask in @p source
- * @param number_of_mask_words The number of words of type bitmask_type to copy
- */
-__global__ void offset_bitmask_and(bitmask_type *__restrict__ destination,
-                                   bitmask_type const *const *__restrict__ source,
-                                   size_type const *__restrict__ begin_bit,
-                                   size_type num_sources,
-                                   size_type source_size,
-                                   size_type number_of_mask_words)
-{
-  for (size_type destination_word_index = threadIdx.x + blockIdx.x * blockDim.x;
-       destination_word_index < number_of_mask_words;
-       destination_word_index += blockDim.x * gridDim.x) {
-    bitmask_type destination_word = ~bitmask_type{0};  // All bits 1
-    for (size_type i = 0; i < num_sources; i++) {
-      destination_word &= detail::get_mask_offset_word(
-        source[i], destination_word_index, begin_bit[i], begin_bit[i] + source_size);
-    }
-
-    destination[destination_word_index] = destination_word;
-  }
-}
-
 // convert [first_bit_index,last_bit_index) to
 // [first_word_index,last_word_index)
 struct to_word_index : public thrust::unary_function<size_type, size_type> {
@@ -422,51 +396,37 @@ rmm::device_buffer copy_bitmask(column_view const &view,
 }
 
 // Inplace Bitwise AND of the masks
-void inplace_bitmask_and(bitmask_type *dest_mask,
-                         std::vector<bitmask_type const *> const &masks,
-                         std::vector<size_type> const &begin_bits,
+void inplace_bitmask_and(device_span<bitmask_type> dest_mask,
+                         host_span<bitmask_type const *> masks,
+                         host_span<size_type const> begin_bits,
                          size_type mask_size,
                          rmm::cuda_stream_view stream,
                          rmm::mr::device_memory_resource *mr)
 {
-  CUDF_EXPECTS(std::all_of(begin_bits.begin(), begin_bits.end(), [](auto b) { return b >= 0; }),
-               "Invalid range.");
-  CUDF_EXPECTS(mask_size > 0, "Invalid bit range.");
-  CUDF_EXPECTS(std::all_of(masks.begin(), masks.end(), [](auto p) { return p != nullptr; }),
-               "Mask pointer cannot be null");
-
-  auto number_of_mask_words = num_bitmask_words(mask_size);
-
-  rmm::device_vector<bitmask_type const *> d_masks(masks);
-  rmm::device_vector<size_type> d_begin_bits(begin_bits);
-
-  cudf::detail::grid_1d config(number_of_mask_words, 256);
-  offset_bitmask_and<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
+  inplace_bitmask_binop(
+    [] __device__(bitmask_type left, bitmask_type right) { return left & right; },
     dest_mask,
-    d_masks.data().get(),
-    d_begin_bits.data().get(),
-    d_masks.size(),
+    masks,
+    begin_bits,
     mask_size,
-    number_of_mask_words);
-
-  CHECK_CUDA(stream.value());
+    stream,
+    mr);
 }
 
 // Bitwise AND of the masks
-rmm::device_buffer bitmask_and(std::vector<bitmask_type const *> const &masks,
-                               std::vector<size_type> const &begin_bits,
+rmm::device_buffer bitmask_and(host_span<bitmask_type const *> masks,
+                               host_span<size_type const> begin_bits,
                                size_type mask_size,
                                rmm::cuda_stream_view stream,
                                rmm::mr::device_memory_resource *mr)
 {
-  rmm::device_buffer dest_mask{};
-  auto num_bytes = bitmask_allocation_size_bytes(mask_size);
-
-  dest_mask = rmm::device_buffer{num_bytes, stream, mr};
-  inplace_bitmask_and(
-    static_cast<bitmask_type *>(dest_mask.data()), masks, begin_bits, mask_size, stream, mr);
-
-  return dest_mask;
+  return bitmask_binop(
+    [] __device__(bitmask_type left, bitmask_type right) { return left & right; },
+    masks,
+    begin_bits,
+    mask_size,
+    stream,
+    mr);
 }
 
 cudf::size_type count_set_bits(bitmask_type const *bitmask,
@@ -651,12 +611,48 @@ rmm::device_buffer bitmask_and(table_view const &view,
   }
 
   if (masks.size() > 0) {
-    return cudf::detail::bitmask_and(masks, offsets, view.num_rows(), stream, mr);
+    return cudf::detail::bitmask_binop(
+      [] __device__(bitmask_type left, bitmask_type right) { return left & right; },
+      masks,
+      offsets,
+      view.num_rows(),
+      stream,
+      mr);
   }
 
   return null_mask;
 }
 
+// Returns the bitwise OR of the null masks of all columns in the table view
+rmm::device_buffer bitmask_or(table_view const &view,
+                              rmm::cuda_stream_view stream,
+                              rmm::mr::device_memory_resource *mr)
+{
+  CUDF_FUNC_RANGE();
+  rmm::device_buffer null_mask{0, stream, mr};
+  if (view.num_rows() == 0 or view.num_columns() == 0) { return null_mask; }
+
+  std::vector<bitmask_type const *> masks;
+  std::vector<size_type> offsets;
+  for (auto &&col : view) {
+    if (col.nullable()) {
+      masks.push_back(col.null_mask());
+      offsets.push_back(col.offset());
+    }
+  }
+
+  if (static_cast<size_type>(masks.size()) == view.num_columns()) {
+    return cudf::detail::bitmask_binop(
+      [] __device__(bitmask_type left, bitmask_type right) { return left | right; },
+      masks,
+      offsets,
+      view.num_rows(),
+      stream,
+      mr);
+  }
+
+  return null_mask;
+}
 }  // namespace detail
 
 // Count non-zero bits in the specified range
@@ -709,4 +705,9 @@ rmm::device_buffer bitmask_and(table_view const &view, rmm::mr::device_memory_re
   return detail::bitmask_and(view, rmm::cuda_stream_default, mr);
 }
 
+rmm::device_buffer bitmask_or(table_view const &view, rmm::mr::device_memory_resource *mr)
+{
+  return detail::bitmask_or(view, rmm::cuda_stream_default, mr);
+}
+
 }  // namespace cudf
diff --git a/cpp/src/structs/structs_column_factories.cu b/cpp/src/structs/structs_column_factories.cu
index 5f92fea76f5..2bd71767265 100644
--- a/cpp/src/structs/structs_column_factories.cu
+++ b/cpp/src/structs/structs_column_factories.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,7 +24,6 @@
 
 #include <algorithm>
 #include <memory>
-
 namespace cudf {
 namespace {
 // Helper function to superimpose validity of parent struct
@@ -44,18 +43,19 @@ void superimpose_parent_nullmask(bitmask_type const* parent_null_mask,
     // Child should have a null mask.
     // `AND` the child's null mask with the parent's.
 
-    auto data_type{child.type()};
-    auto num_rows{child.size()};
-
     auto current_child_mask = child.mutable_view().null_mask();
 
-    cudf::detail::inplace_bitmask_and(current_child_mask,
-                                      {reinterpret_cast<bitmask_type const*>(parent_null_mask),
-                                       reinterpret_cast<bitmask_type const*>(current_child_mask)},
-                                      {0, 0},
-                                      child.size(),
-                                      stream,
-                                      mr);
+    std::vector<bitmask_type const*> masks{
+      reinterpret_cast<bitmask_type const*>(parent_null_mask),
+      reinterpret_cast<bitmask_type const*>(current_child_mask)};
+    std::vector<size_type> begin_bits{0, 0};
+    cudf::detail::inplace_bitmask_and(
+      detail::device_span<bitmask_type>(current_child_mask, num_bitmask_words(child.size())),
+      masks,
+      begin_bits,
+      child.size(),
+      stream,
+      mr);
     child.set_null_count(UNKNOWN_NULL_COUNT);
   }
 
diff --git a/cpp/tests/bitmask/bitmask_tests.cu b/cpp/tests/bitmask/bitmask_tests.cu
index 8afa4faa9e3..2f820da687e 100644
--- a/cpp/tests/bitmask/bitmask_tests.cu
+++ b/cpp/tests/bitmask/bitmask_tests.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@
 
 #include <thrust/device_ptr.h>
 #include <thrust/device_vector.h>
+#include <rmm/device_buffer.hpp>
 
 struct BitmaskUtilitiesTest : public cudf::test::BaseFixture {
 };
@@ -413,7 +414,7 @@ TEST_F(CopyBitmaskTest, TestZeroOffset)
   cleanEndWord(splice_mask, begin_bit, end_bit);
   auto number_of_bits = end_bit - begin_bit;
   CUDF_TEST_EXPECT_EQUAL_BUFFERS(
-    gold_splice_mask.data(), splice_mask.data(), number_of_bits / CHAR_BIT);
+    gold_splice_mask.data(), splice_mask.data(), cudf::num_bitmask_words(number_of_bits));
 }
 
 TEST_F(CopyBitmaskTest, TestNonZeroOffset)
@@ -433,7 +434,7 @@ TEST_F(CopyBitmaskTest, TestNonZeroOffset)
   cleanEndWord(splice_mask, begin_bit, end_bit);
   auto number_of_bits = end_bit - begin_bit;
   CUDF_TEST_EXPECT_EQUAL_BUFFERS(
-    gold_splice_mask.data(), splice_mask.data(), number_of_bits / CHAR_BIT);
+    gold_splice_mask.data(), splice_mask.data(), cudf::num_bitmask_words(number_of_bits));
 }
 
 TEST_F(CopyBitmaskTest, TestCopyColumnViewVectorContiguous)
@@ -468,7 +469,7 @@ TEST_F(CopyBitmaskTest, TestCopyColumnViewVectorContiguous)
   rmm::device_buffer concatenated_bitmask = cudf::concatenate_masks(views);
   cleanEndWord(concatenated_bitmask, 0, num_elements);
   CUDF_TEST_EXPECT_EQUAL_BUFFERS(
-    concatenated_bitmask.data(), gold_mask.data(), num_elements / CHAR_BIT);
+    concatenated_bitmask.data(), gold_mask.data(), cudf::num_bitmask_words(num_elements));
 }
 
 TEST_F(CopyBitmaskTest, TestCopyColumnViewVectorDiscontiguous)
@@ -493,7 +494,60 @@ TEST_F(CopyBitmaskTest, TestCopyColumnViewVectorDiscontiguous)
   rmm::device_buffer concatenated_bitmask = cudf::concatenate_masks(views);
   cleanEndWord(concatenated_bitmask, 0, num_elements);
   CUDF_TEST_EXPECT_EQUAL_BUFFERS(
-    concatenated_bitmask.data(), gold_mask.data(), num_elements / CHAR_BIT);
+    concatenated_bitmask.data(), gold_mask.data(), cudf::num_bitmask_words(num_elements));
+}
+
+struct MergeBitmaskTest : public cudf::test::BaseFixture {
+};
+
+TEST_F(MergeBitmaskTest, TestBitmaskAnd)
+{
+  cudf::test::fixed_width_column_wrapper<bool> const bools_col1({0, 1, 0, 1, 1}, {0, 1, 1, 1, 0});
+  cudf::test::fixed_width_column_wrapper<bool> const bools_col2({0, 2, 1, 0, 255}, {1, 1, 0, 1, 0});
+  cudf::test::fixed_width_column_wrapper<bool> const bools_col3({0, 2, 1, 0, 255});
+
+  auto const input1 = cudf::table_view({bools_col3});
+  auto const input2 = cudf::table_view({bools_col1, bools_col2});
+  auto const input3 = cudf::table_view({bools_col1, bools_col2, bools_col3});
+
+  rmm::device_buffer result1 = cudf::bitmask_and(input1);
+  rmm::device_buffer result2 = cudf::bitmask_and(input2);
+  rmm::device_buffer result3 = cudf::bitmask_and(input3);
+
+  auto odd_indices =
+    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; });
+  auto odd = cudf::test::detail::make_null_mask(odd_indices, odd_indices + input2.num_rows());
+
+  EXPECT_EQ(nullptr, result1.data());
+  CUDF_TEST_EXPECT_EQUAL_BUFFERS(
+    result2.data(), odd.data(), cudf::num_bitmask_words(input2.num_rows()));
+  CUDF_TEST_EXPECT_EQUAL_BUFFERS(
+    result3.data(), odd.data(), cudf::num_bitmask_words(input2.num_rows()));
+}
+
+TEST_F(MergeBitmaskTest, TestBitmaskOr)
+{
+  cudf::test::fixed_width_column_wrapper<bool> const bools_col1({0, 1, 0, 1, 1}, {1, 1, 0, 0, 1});
+  cudf::test::fixed_width_column_wrapper<bool> const bools_col2({0, 2, 1, 0, 255}, {0, 0, 1, 0, 1});
+  cudf::test::fixed_width_column_wrapper<bool> const bools_col3({0, 2, 1, 0, 255});
+
+  auto const input1 = cudf::table_view({bools_col3});
+  auto const input2 = cudf::table_view({bools_col1, bools_col2});
+  auto const input3 = cudf::table_view({bools_col1, bools_col2, bools_col3});
+
+  rmm::device_buffer result1 = cudf::bitmask_or(input1);
+  rmm::device_buffer result2 = cudf::bitmask_or(input2);
+  rmm::device_buffer result3 = cudf::bitmask_or(input3);
+
+  auto all_but_index3 =
+    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; });
+  auto null3 =
+    cudf::test::detail::make_null_mask(all_but_index3, all_but_index3 + input2.num_rows());
+
+  EXPECT_EQ(nullptr, result1.data());
+  CUDF_TEST_EXPECT_EQUAL_BUFFERS(
+    result2.data(), null3.data(), cudf::num_bitmask_words(input2.num_rows()));
+  EXPECT_EQ(nullptr, result3.data());
 }
 
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index 099f36e65de..e0cc96263b3 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -578,7 +578,7 @@ public final ColumnVector normalizeNANsAndZeros() {
    * @return the new ColumnVector with merged null mask.
    */
   public final ColumnVector mergeAndSetValidity(BinaryOp mergeOp, ColumnView... columns) {
-    assert mergeOp == BinaryOp.BITWISE_AND : "Only BITWISE_AND supported right now";
+    assert mergeOp == BinaryOp.BITWISE_AND || mergeOp == BinaryOp.BITWISE_OR : "Only BITWISE_AND and BITWISE_OR supported right now";
     long[] columnViews = new long[columns.length];
     long size = getRowCount();
 
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index a0613f9b73f..e8474bda1be 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -1305,8 +1305,15 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_bitwiseMergeAndSetValidit
     cudf::table_view *input_table = new cudf::table_view(column_views);
 
     cudf::binary_operator op = static_cast<cudf::binary_operator>(bin_op);
-    if(op == cudf::binary_operator::BITWISE_AND) {
-      copy->set_null_mask(cudf::bitmask_and(*input_table));
+    switch(op) {
+      case cudf::binary_operator::BITWISE_AND:
+        copy->set_null_mask(cudf::bitmask_and(*input_table));
+        break;
+      case cudf::binary_operator::BITWISE_OR:
+        copy->set_null_mask(cudf::bitmask_or(*input_table));
+        break;
+      default:
+        JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Unsupported merge operation", 0);
     }
 
     return reinterpret_cast<jlong>(copy.release());
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index 75f58179382..0675ece4863 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -571,7 +571,7 @@ void testSpark32BitMurmur3HashMixed() {
   }
 
   @Test
-  void testNullReconfigureNulls() {
+  void testAndNullReconfigureNulls() {
     try (ColumnVector v0 = ColumnVector.fromBoxedInts(0, 100, null, null, Integer.MIN_VALUE, null);
          ColumnVector v1 = ColumnVector.fromBoxedInts(0, 100, 1, 2, Integer.MIN_VALUE, null);
          ColumnVector intResult = v1.mergeAndSetValidity(BinaryOp.BITWISE_AND, v0);
@@ -585,6 +585,28 @@ void testNullReconfigureNulls() {
     }
   }
 
+  @Test
+  void testOrNullReconfigureNulls() {
+    try (ColumnVector v0 = ColumnVector.fromBoxedInts(0, 100, null, null, Integer.MIN_VALUE, null);
+         ColumnVector v1 = ColumnVector.fromBoxedInts(0, 100, 1, 2, Integer.MIN_VALUE, null);
+         ColumnVector v2 = ColumnVector.fromBoxedInts(0, 100, 1, 2, Integer.MIN_VALUE, Integer.MAX_VALUE);
+         ColumnVector intResultV0 = v1.mergeAndSetValidity(BinaryOp.BITWISE_OR, v0);
+         ColumnVector intResultV0V1 = v1.mergeAndSetValidity(BinaryOp.BITWISE_OR, v0, v1);
+         ColumnVector intResultMulti = v1.mergeAndSetValidity(BinaryOp.BITWISE_OR, v0, v0, v1, v1, v0, v1, v0);
+         ColumnVector intResultv0v1v2 = v2.mergeAndSetValidity(BinaryOp.BITWISE_OR, v0, v1, v2);
+         ColumnVector v3 = ColumnVector.fromStrings("0", "100", "1", "2", "MIN_VALUE", "3");
+         ColumnVector stringResult = v3.mergeAndSetValidity(BinaryOp.BITWISE_OR, v0, v1);
+         ColumnVector stringExpected = ColumnVector.fromStrings("0", "100", "1", "2", "MIN_VALUE", null);
+         ColumnVector noMaskResult = v3.mergeAndSetValidity(BinaryOp.BITWISE_OR)) {
+      assertColumnsAreEqual(v0, intResultV0);
+      assertColumnsAreEqual(v1, intResultV0V1);
+      assertColumnsAreEqual(v1, intResultMulti);
+      assertColumnsAreEqual(v2, intResultv0v1v2);
+      assertColumnsAreEqual(stringExpected, stringResult);
+      assertColumnsAreEqual(v3, noMaskResult);
+    }
+  }
+
   @Test
   void isNotNullTestEmptyColumn() {
     try (ColumnVector v = ColumnVector.fromBoxedInts();

From ddc09906492c60267766098e1423ee8e657a914c Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Mon, 8 Mar 2021 12:05:05 -0800
Subject: [PATCH 4/8] Resolving unlinked type shorthands in cudf doc (#7416)

Closes #7320

This PR adds an additional preprocessing step in documentation generation. It traverses through the doctree generated by Sphinx and replaces unresolved type short hands with proper target reference, while keeping the shortened name for display text.

An additional preprocessing step is added to ignore internal types to APIs facing both internally and externally, such as `cudf.core.column.string.StringColumn`

`cupy` API reference is added to intersphinx.

Minor changes:
- Fixes a small doc bug in `frame.copy`

Authors:
  - Michael Wang (@isVoid)

Approvers:
  - Ashwin Srinath (@shwina)

URL: https://github.com/rapidsai/cudf/pull/7416
---
 docs/cudf/source/conf.py               | 59 ++++++++++++++++++++++----
 python/cudf/cudf/core/column/string.py |  2 +-
 2 files changed, 52 insertions(+), 9 deletions(-)

diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index 54866ff6eee..b68d7b5849f 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -21,7 +21,10 @@
 #
 import os
 import sys
+
+from docutils.nodes import Text
 from recommonmark.transform import AutoStructify
+from sphinx.addnodes import pending_xref
 
 sys.path.insert(0, os.path.abspath("../.."))
 
@@ -74,9 +77,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '0.19'
+version = "0.19"
 # The full version, including alpha/beta/rc tags.
-release = '0.19.0'
+release = "0.19.0"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -193,7 +196,10 @@
 
 
 # Example configuration for intersphinx: refer to the Python standard library.
-intersphinx_mapping = {"https://docs.python.org/": None}
+intersphinx_mapping = {
+    "python": ("https://docs.python.org/", None),
+    "cupy": ("https://docs.cupy.dev/en/stable/", None),
+}
 
 # Config numpydoc
 numpydoc_show_inherited_class_members = True
@@ -202,14 +208,51 @@
 autoclass_content = "init"
 
 # Config AutoStructify
-github_doc_root = 'https://github.com/rtfd/recommonmark/tree/master/doc/'
+github_doc_root = "https://github.com/rtfd/recommonmark/tree/master/doc/"
+
+# Replace API shorthands with fullname
+_reftarget_aliases = {
+    "cudf.Series": ("cudf.core.series.Series", "cudf.Series"),
+    "cudf.Index": ("cudf.core.index.Index", "cudf.Index"),
+    "cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"),
+}
+
+_internal_names_to_ignore = {"cudf.core.column.string.StringColumn"}
+
+
+def resolve_aliases(app, doctree):
+    pending_xrefs = doctree.traverse(condition=pending_xref)
+    for node in pending_xrefs:
+        alias = node.get("reftarget", None)
+        if alias is not None and alias in _reftarget_aliases:
+            real_ref, text_to_render = _reftarget_aliases[alias]
+            node["reftarget"] = real_ref
+
+            text_node = next(
+                iter(node.traverse(lambda n: n.tagname == "#text"))
+            )
+            text_node.parent.replace(text_node, Text(text_to_render, ""))
+
+
+def ignore_internal_references(app, env, node, contnode):
+    name = node.get("reftarget", None)
+    if name is not None and name in _internal_names_to_ignore:
+        node["reftarget"] = ""
+        return contnode
+
 
 def setup(app):
     app.add_js_file("copybutton_pydocs.js")
     app.add_css_file("params.css")
     app.add_css_file("https://docs.rapids.ai/assets/css/custom.css")
-    app.add_config_value('recommonmark_config', {
-            'url_resolver': lambda url: github_doc_root + url,
-            'auto_toc_tree_section': 'Contents',
-            }, True)
+    app.add_config_value(
+        "recommonmark_config",
+        {
+            "url_resolver": lambda url: github_doc_root + url,
+            "auto_toc_tree_section": "Contents",
+        },
+        True,
+    )
     app.add_transform(AutoStructify)
+    app.connect("doctree-read", resolve_aliases)
+    app.connect("missing-reference", ignore_internal_references)
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 0a1f6529cc7..81abdd3f66a 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -347,7 +347,7 @@ def cat(self, sep: str = None, na_rep: str = None) -> str:
     @overload
     def cat(
         self, others, sep: str = None, na_rep: str = None
-    ) -> Union[ParentType, "cudf.core.column.StringColumn"]:
+    ) -> Union[ParentType, "cudf.core.column.string.StringColumn"]:
         ...
 
     def cat(self, others=None, sep=None, na_rep=None):

From 5d02c86e9bba6dedcdfd72e4f0b469ddebd1e6e3 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <dante.gamadessavre@gmail.com>
Date: Mon, 8 Mar 2021 14:35:40 -0600
Subject: [PATCH 5/8] Change dask and distributed branch to main (#7532)

`dask` and `distributed` are changing their default branches name from `master` to `main`, this will break our dev environments and CI, this PR updates the required files.

`distributed` already merged the PR that does the change, `dask` will probably do the same very soon so a PR that updates both seems to be the best approach.

Authors:
  - Dante Gama Dessavre (@dantegd)

Approvers:
  - Keith Kraus (@kkraus14)
  - AJ Schmidt (@ajschmidt8)

URL: https://github.com/rapidsai/cudf/pull/7532
---
 ci/gpu/build.sh                          | 10 +++++-----
 conda/environments/cudf_dev_cuda10.1.yml |  4 ++--
 conda/environments/cudf_dev_cuda10.2.yml |  4 ++--
 conda/environments/cudf_dev_cuda11.0.yml |  4 ++--
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 316f9c5f98a..7614e19cc89 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -98,11 +98,11 @@ conda config --show-sources
 conda list --show-channel-urls
 
 function install_dask {
-    # Install the master version of dask, distributed, and streamz
-    gpuci_logger "Install the master version of dask, distributed, and streamz"
+    # Install the main version of dask, distributed, and streamz
+    gpuci_logger "Install the main version of dask, distributed, and streamz"
     set -x
-    pip install "git+https://github.com/dask/distributed.git@master" --upgrade --no-deps
-    pip install "git+https://github.com/dask/dask.git@master" --upgrade --no-deps
+    pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps
+    pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps
     pip install "git+https://github.com/python-streamz/streamz.git" --upgrade --no-deps
     set +x
 }
@@ -152,7 +152,7 @@ else
     #Project Flash
     export LIB_BUILD_DIR="$WORKSPACE/ci/artifacts/cudf/cpu/libcudf_work/cpp/build"
     export LD_LIBRARY_PATH="$LIB_BUILD_DIR:$CONDA_PREFIX/lib:$LD_LIBRARY_PATH"
-    
+
     if hasArg --skip-tests; then
         gpuci_logger "Skipping Tests"
         exit 0
diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml
index 3541ed1208c..35108ddd8ca 100644
--- a/conda/environments/cudf_dev_cuda10.1.yml
+++ b/conda/environments/cudf_dev_cuda10.1.yml
@@ -62,7 +62,7 @@ dependencies:
   - nvtx>=0.2.1
   - cachetools
   - pip:
-      - git+https://github.com/dask/dask.git@master
-      - git+https://github.com/dask/distributed.git@master
+      - git+https://github.com/dask/dask.git@main
+      - git+https://github.com/dask/distributed.git@main
       - git+https://github.com/python-streamz/streamz.git
       - pyorc
diff --git a/conda/environments/cudf_dev_cuda10.2.yml b/conda/environments/cudf_dev_cuda10.2.yml
index 839533516fb..3a24e38a397 100644
--- a/conda/environments/cudf_dev_cuda10.2.yml
+++ b/conda/environments/cudf_dev_cuda10.2.yml
@@ -62,7 +62,7 @@ dependencies:
   - nvtx>=0.2.1
   - cachetools
   - pip:
-      - git+https://github.com/dask/dask.git@master
-      - git+https://github.com/dask/distributed.git@master
+      - git+https://github.com/dask/dask.git@main
+      - git+https://github.com/dask/distributed.git@main
       - git+https://github.com/python-streamz/streamz.git
       - pyorc
diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml
index 401eaea63da..821c6f5320d 100644
--- a/conda/environments/cudf_dev_cuda11.0.yml
+++ b/conda/environments/cudf_dev_cuda11.0.yml
@@ -62,7 +62,7 @@ dependencies:
   - nvtx>=0.2.1
   - cachetools
   - pip:
-      - git+https://github.com/dask/dask.git@master
-      - git+https://github.com/dask/distributed.git@master
+      - git+https://github.com/dask/dask.git@main
+      - git+https://github.com/dask/distributed.git@main
       - git+https://github.com/python-streamz/streamz.git
       - pyorc

From 4897a25cbdf16e8bd758a3ad22ba5ef6785f9624 Mon Sep 17 00:00:00 2001
From: David <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 8 Mar 2021 20:13:07 -0500
Subject: [PATCH 6/8] Add gbenchmarks for strings extract function (#7522)

Reference #5698
This creates a gbenchmark for `cudf::strings::extract` function. The benchmarks measures various sized rows as well as strings lengths. It also has measurements for small, medium, and large regex instructions. The extract performance is effected by the number of instructions in the regex pattern.

Authors:
  - David (@davidwendt)

Approvers:
  - Keith Kraus (@kkraus14)
  - Karthikeyan (@karthikeyann)
  - Mark Harris (@harrism)

URL: https://github.com/rapidsai/cudf/pull/7522
---
 cpp/benchmarks/CMakeLists.txt               |  1 +
 cpp/benchmarks/string/extract_benchmark.cpp | 75 +++++++++++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 cpp/benchmarks/string/extract_benchmark.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 5db32987624..c6dd055c887 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -183,6 +183,7 @@ ConfigureBench(STRINGS_BENCH
   string/convert_durations_benchmark.cpp
   string/convert_floats_benchmark.cpp
   string/copy_benchmark.cpp
+  string/extract_benchmark.cpp
   string/filter_benchmark.cpp
   string/find_benchmark.cpp
   string/replace_benchmark.cpp
diff --git a/cpp/benchmarks/string/extract_benchmark.cpp b/cpp/benchmarks/string/extract_benchmark.cpp
new file mode 100644
index 00000000000..dbae18dde3b
--- /dev/null
+++ b/cpp/benchmarks/string/extract_benchmark.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/strings/extract.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include "string_bench_args.hpp"
+
+class StringExtract : public cudf::benchmark {
+};
+
+static void BM_extract(benchmark::State& state, int re_instructions)
+{
+  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
+  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
+  data_profile table_profile;
+  table_profile.set_distribution_params(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
+  auto const table =
+    create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile);
+  cudf::strings_column_view input(table->view().column(0));
+  std::string const raw_pattern =
+    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234"
+    "5678901234567890123456789012345678901234567890";
+  std::string const pattern = "(" + raw_pattern.substr(0, re_instructions) + ")";
+
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true, 0);
+    auto results = cudf::strings::extract(input, pattern);
+  }
+
+  state.SetBytesProcessed(state.iterations() * input.chars_size());
+}
+
+static void generate_bench_args(benchmark::internal::Benchmark* b)
+{
+  int const min_rows   = 1 << 12;
+  int const max_rows   = 1 << 24;
+  int const row_mult   = 8;
+  int const min_rowlen = 1 << 5;
+  int const max_rowlen = 1 << 13;
+  int const len_mult   = 4;
+  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
+}
+
+#define STRINGS_BENCHMARK_DEFINE(name, instructions)          \
+  BENCHMARK_DEFINE_F(StringExtract, name)                     \
+  (::benchmark::State & st) { BM_extract(st, instructions); } \
+  BENCHMARK_REGISTER_F(StringExtract, name)                   \
+    ->Apply(generate_bench_args)                              \
+    ->UseManualTime()                                         \
+    ->Unit(benchmark::kMillisecond);
+
+STRINGS_BENCHMARK_DEFINE(small, 4)
+STRINGS_BENCHMARK_DEFINE(medium, 48)
+STRINGS_BENCHMARK_DEFINE(large, 128)

From 444d9f2336dd69cd9aad99ec3c676332c5e43ec2 Mon Sep 17 00:00:00 2001
From: David <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 8 Mar 2021 21:35:12 -0500
Subject: [PATCH 7/8] Reduce compile time/size for scan.cu (#7516)

This PR reduces the number of calls to `inclusive_scan` and `exclusive_scan` by using a `null_replace_accessor` that allows non-nullable columns. This reduces the compile time and size of `scan.cu` by half. This PR also includes a scan gbenchmark that shows no change in performance from the original implementation.

Authors:
  - David (@davidwendt)

Approvers:
  - Paul Taylor (@trxcllnt)
  - Jake Hemstad (@jrhemstad)

URL: https://github.com/rapidsai/cudf/pull/7516
---
 cpp/benchmarks/CMakeLists.txt               |  1 +
 cpp/benchmarks/reduction/scan_benchmark.cpp | 63 +++++++++++++++++++++
 cpp/include/cudf/detail/iterator.cuh        | 48 +++++++++-------
 cpp/src/reductions/scan.cu                  | 50 ++++++----------
 cpp/tests/iterator/value_iterator_test.cu   |  4 +-
 cpp/tests/reductions/scan_tests.cpp         | 29 ++++++++--
 6 files changed, 135 insertions(+), 60 deletions(-)
 create mode 100644 cpp/benchmarks/reduction/scan_benchmark.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index c6dd055c887..dfc340b1459 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -115,6 +115,7 @@ ConfigureBench(REDUCTION_BENCH
   reduction/anyall_benchmark.cpp
   reduction/dictionary_benchmark.cpp
   reduction/reduce_benchmark.cpp
+  reduction/scan_benchmark.cpp
   reduction/minmax_benchmark.cpp)
 
 ###################################################################################################
diff --git a/cpp/benchmarks/reduction/scan_benchmark.cpp b/cpp/benchmarks/reduction/scan_benchmark.cpp
new file mode 100644
index 00000000000..b2d8fcfc004
--- /dev/null
+++ b/cpp/benchmarks/reduction/scan_benchmark.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/reduction.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/types.hpp>
+
+class ReductionScan : public cudf::benchmark {
+};
+
+template <typename type>
+static void BM_reduction_scan(benchmark::State& state, bool include_nulls)
+{
+  cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
+  auto const dtype = cudf::type_to_id<type>();
+  auto const table = create_random_table({dtype}, 1, row_count{n_rows});
+  if (!include_nulls) table->get_column(0).set_null_mask(rmm::device_buffer{}, 0);
+  cudf::column_view input(table->view().column(0));
+
+  for (auto _ : state) {
+    cuda_event_timer timer(state, true);
+    auto result = cudf::scan(input, cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE);
+  }
+}
+
+#define SCAN_BENCHMARK_DEFINE(name, type, nulls)                          \
+  BENCHMARK_DEFINE_F(ReductionScan, name)                                 \
+  (::benchmark::State & state) { BM_reduction_scan<type>(state, nulls); } \
+  BENCHMARK_REGISTER_F(ReductionScan, name)                               \
+    ->UseManualTime()                                                     \
+    ->Arg(10000)      /* 10k */                                           \
+    ->Arg(100000)     /* 100k */                                          \
+    ->Arg(1000000)    /* 1M */                                            \
+    ->Arg(10000000)   /* 10M */                                           \
+    ->Arg(100000000); /* 100M */
+
+SCAN_BENCHMARK_DEFINE(int8_no_nulls, int8_t, false);
+SCAN_BENCHMARK_DEFINE(int32_no_nulls, int32_t, false);
+SCAN_BENCHMARK_DEFINE(uint64_no_nulls, uint64_t, false);
+SCAN_BENCHMARK_DEFINE(float_no_nulls, float, false);
+SCAN_BENCHMARK_DEFINE(int16_nulls, int16_t, true);
+SCAN_BENCHMARK_DEFINE(uint32_nulls, uint32_t, true);
+SCAN_BENCHMARK_DEFINE(double_nulls, double, true);
diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh
index 805cdc02bc6..881afa63ca5 100644
--- a/cpp/include/cudf/detail/iterator.cuh
+++ b/cpp/include/cudf/detail/iterator.cuh
@@ -71,16 +71,13 @@ inline auto make_counting_transform_iterator(cudf::size_type start, UnaryFunctio
 }
 
 /**
- * @brief value accessor of column with null bitmask
- * A unary functor returns scalar value at `id`.
- * `operator() (cudf::size_type id)` computes `element` and valid flag at `id`
- * This functor is only allowed for nullable columns.
+ * @brief Value accessor of column that may have a null bitmask.
  *
- * the return value for element `i` will return `column[i]`
- * if it is valid, or `null_replacement` if it is null.
+ * This unary functor returns scalar value at `id`.
+ * The `operator()(cudf::size_type id)` computes the `element` and valid flag at `id`.
  *
- * @throws cudf::logic_error if the column is not nullable.
- * @throws cudf::logic_error if column datatype and Element type mismatch.
+ * The return value for element `i` will return `column[i]`
+ * if it is valid, or `null_replacement` if it is null.
  *
  * @tparam Element The type of elements in the column
  */
@@ -88,24 +85,33 @@ template <typename Element>
 struct null_replaced_value_accessor {
   column_device_view const col;      ///< column view of column in device
   Element const null_replacement{};  ///< value returned when element is null
+  bool const has_nulls;              ///< true if col has null elements
 
   /**
-   * @brief constructor
-   * @param[in] _col column device view of cudf column
+   * @brief Creates an accessor for a null-replacement iterator.
+   *
+   * @throws cudf::logic_error if `col` type does not match Element type.
+   * @throws cudf::logic_error if `has_nulls` is true but `col` does not have a validity mask.
+   *
+   * @param[in] col column device view of cudf column
    * @param[in] null_replacement The value to return for null elements
+   * @param[in] has_nulls Must be set to true if `col` has nulls.
    */
-  null_replaced_value_accessor(column_device_view const& _col, Element null_val)
-    : col{_col}, null_replacement{null_val}
+  null_replaced_value_accessor(column_device_view const& col,
+                               Element null_val,
+                               bool has_nulls = true)
+    : col{col}, null_replacement{null_val}, has_nulls{has_nulls}
   {
-    CUDF_EXPECTS(data_type(type_to_id<Element>()) == col.type(), "the data type mismatch");
-    // verify valid is non-null, otherwise, is_valid_nocheck() will crash
-    CUDF_EXPECTS(_col.nullable(), "Unexpected non-nullable column.");
+    CUDF_EXPECTS(type_to_id<Element>() == device_storage_type_id(col.type().id()),
+                 "the data type mismatch");
+    // verify validity bitmask is non-null, otherwise, is_null_nocheck() will crash
+    if (has_nulls) CUDF_EXPECTS(col.nullable(), "column with nulls must have a validity bitmask");
   }
 
   CUDA_DEVICE_CALLABLE
   Element operator()(cudf::size_type i) const
   {
-    return col.is_valid_nocheck(i) ? col.element<Element>(i) : null_replacement;
+    return has_nulls && col.is_null_nocheck(i) ? null_replacement : col.element<Element>(i);
   }
 };
 
@@ -140,7 +146,7 @@ struct validity_accessor {
  *
  * Dereferencing the returned iterator for element `i` will return `column[i]`
  * if it is valid, or `null_replacement` if it is null.
- * This iterator is only allowed for nullable columns.
+ * This iterator is only allowed for both nullable and non-nullable columns.
  *
  * @throws cudf::logic_error if the column is not nullable.
  * @throws cudf::logic_error if column datatype and Element type mismatch.
@@ -148,15 +154,17 @@ struct validity_accessor {
  * @tparam Element The type of elements in the column
  * @param column The column to iterate
  * @param null_replacement The value to return for null elements
- * @return auto Iterator that returns valid column elements, or a null
+ * @param has_nulls Must be set to true if `column` has nulls.
+ * @return Iterator that returns valid column elements, or a null
  * replacement value for null elements.
  */
 template <typename Element>
 auto make_null_replacement_iterator(column_device_view const& column,
-                                    Element const null_replacement = Element{0})
+                                    Element const null_replacement = Element{0},
+                                    bool has_nulls                 = true)
 {
   return make_counting_transform_iterator(
-    0, null_replaced_value_accessor<Element>{column, null_replacement});
+    0, null_replaced_value_accessor<Element>{column, null_replacement, has_nulls});
 }
 
 /**
diff --git a/cpp/src/reductions/scan.cu b/cpp/src/reductions/scan.cu
index f73ffb0214a..c3aadf47794 100644
--- a/cpp/src/reductions/scan.cu
+++ b/cpp/src/reductions/scan.cu
@@ -21,11 +21,10 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/detail/utilities/device_atomics.cuh>
+#include <cudf/detail/utilities/device_operators.cuh>
 #include <cudf/null_mask.hpp>
 #include <cudf/reduction.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/span.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -34,6 +33,7 @@
 
 namespace cudf {
 namespace detail {
+
 /**
  * @brief Dispatcher for running Scan operation on input column
  * Dispatches scan operation on `Op` and creates output column
@@ -73,23 +73,14 @@ struct scan_dispatcher {
     mutable_column_view output = output_column->mutable_view();
     auto d_input               = column_device_view::create(input_view, stream);
 
-    if (input_view.has_nulls()) {
-      auto input = make_null_replacement_iterator(*d_input, Op::template identity<T>());
-      thrust::exclusive_scan(rmm::exec_policy(stream),
-                             input,
-                             input + size,
-                             output.data<T>(),
-                             Op::template identity<T>(),
-                             Op{});
-    } else {
-      auto input = d_input->begin<T>();
-      thrust::exclusive_scan(rmm::exec_policy(stream),
-                             input,
-                             input + size,
-                             output.data<T>(),
-                             Op::template identity<T>(),
-                             Op{});
-    }
+    auto input =
+      make_null_replacement_iterator(*d_input, Op::template identity<T>(), input_view.has_nulls());
+    thrust::exclusive_scan(rmm::exec_policy(stream),
+                           input,
+                           input + size,
+                           output.data<T>(),
+                           Op::template identity<T>(),
+                           Op{});
 
     CHECK_CUDA(stream.value());
     return output_column;
@@ -147,13 +138,9 @@ struct scan_dispatcher {
     auto d_input               = column_device_view::create(input_view, stream);
     mutable_column_view output = output_column->mutable_view();
 
-    if (input_view.has_nulls()) {
-      auto input = make_null_replacement_iterator(*d_input, Op::template identity<T>());
-      thrust::inclusive_scan(rmm::exec_policy(stream), input, input + size, output.data<T>(), Op{});
-    } else {
-      auto input = d_input->begin<T>();
-      thrust::inclusive_scan(rmm::exec_policy(stream), input, input + size, output.data<T>(), Op{});
-    }
+    auto const input =
+      make_null_replacement_iterator(*d_input, Op::template identity<T>(), input_view.has_nulls());
+    thrust::inclusive_scan(rmm::exec_policy(stream), input, input + size, output.data<T>(), Op{});
 
     CHECK_CUDA(stream.value());
     return output_column;
@@ -171,13 +158,10 @@ struct scan_dispatcher {
 
     auto d_input = column_device_view::create(input_view, stream);
 
-    if (input_view.has_nulls()) {
-      auto input = make_null_replacement_iterator(*d_input, Op::template identity<T>());
-      thrust::inclusive_scan(rmm::exec_policy(stream), input, input + size, result.data(), Op{});
-    } else {
-      auto input = d_input->begin<T>();
-      thrust::inclusive_scan(rmm::exec_policy(stream), input, input + size, result.data(), Op{});
-    }
+    auto input =
+      make_null_replacement_iterator(*d_input, Op::template identity<T>(), input_view.has_nulls());
+    thrust::inclusive_scan(rmm::exec_policy(stream), input, input + size, result.data(), Op{});
+
     CHECK_CUDA(stream.value());
 
     auto output_column =
diff --git a/cpp/tests/iterator/value_iterator_test.cu b/cpp/tests/iterator/value_iterator_test.cu
index 3ad7ac6d0cd..542123ffd25 100644
--- a/cpp/tests/iterator/value_iterator_test.cu
+++ b/cpp/tests/iterator/value_iterator_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -343,7 +343,7 @@ TYPED_TEST(IteratorTest, error_handling)
 
   CUDF_EXPECT_THROW_MESSAGE((cudf::detail::make_null_replacement_iterator(
                               *d_col_no_null, cudf::test::make_type_param_scalar<T>(0))),
-                            "Unexpected non-nullable column.");
+                            "column with nulls must have a validity bitmask");
 
   CUDF_EXPECT_THROW_MESSAGE((d_col_no_null->pair_begin<T, true>()),
                             "Unexpected non-nullable column.");
diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp
index 549e5e0d215..8372b3977c0 100644
--- a/cpp/tests/reductions/scan_tests.cpp
+++ b/cpp/tests/reductions/scan_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -509,8 +509,13 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointScanSum)
     auto const column   = fp_wrapper{{1, 2, 3, 4}, scale};
     auto const expected = fp_wrapper{{1, 3, 6, 10}, scale};
     auto const result   = cudf::scan(column, cudf::make_sum_aggregation(), scan_type::INCLUSIVE);
-
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+
+    auto const with_nulls     = fp_wrapper({1, 2, 3, 0, 4, 0}, {1, 1, 1, 0, 1, 0}, scale);
+    auto const expected_nulls = fp_wrapper({1, 3, 6, 0, 10, 0}, {1, 1, 1, 0, 1, 0}, scale);
+    auto const result_nulls =
+      cudf::scan(with_nulls, cudf::make_sum_aggregation(), scan_type::INCLUSIVE);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result_nulls->view(), expected_nulls);
   }
 }
 
@@ -526,8 +531,13 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointPreScanSum)
     auto const column   = fp_wrapper{{1, 2, 3, 4}, scale};
     auto const expected = fp_wrapper{{0, 1, 3, 6}, scale};
     auto const result   = cudf::scan(column, cudf::make_sum_aggregation(), scan_type::EXCLUSIVE);
-
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+
+    auto const with_nulls     = fp_wrapper({0, 1, 2, 3, 0, 4}, {0, 1, 1, 1, 0, 1}, scale);
+    auto const expected_nulls = fp_wrapper({0, 0, 1, 3, 0, 6}, {0, 1, 1, 1, 0, 1}, scale);
+    auto const result_nulls =
+      cudf::scan(with_nulls, cudf::make_sum_aggregation(), scan_type::EXCLUSIVE);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result_nulls->view(), expected_nulls);
   }
 }
 
@@ -556,8 +566,13 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointScanMin)
     auto const column   = fp_wrapper{{1, 2, 3, 4}, scale};
     auto const expected = fp_wrapper{{1, 1, 1, 1}, scale};
     auto const result   = cudf::scan(column, cudf::make_min_aggregation(), scan_type::INCLUSIVE);
-
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+
+    auto const with_nulls     = fp_wrapper({1, 0, 2, 0, 3, 4}, {1, 0, 1, 0, 1, 1}, scale);
+    auto const expected_nulls = fp_wrapper({1, 0, 1, 0, 1, 1}, {1, 0, 1, 0, 1, 1}, scale);
+    auto const result_nulls =
+      cudf::scan(with_nulls, cudf::make_min_aggregation(), scan_type::INCLUSIVE);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result_nulls->view(), expected_nulls);
   }
 }
 
@@ -572,7 +587,11 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointScanMax)
     auto const scale  = scale_type{i};
     auto const column = fp_wrapper{{1, 2, 3, 4}, scale};
     auto const result = cudf::scan(column, cudf::make_max_aggregation(), scan_type::INCLUSIVE);
-
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), column);
+
+    auto const with_nulls = fp_wrapper({1, 0, 0, 2, 3, 4}, {1, 0, 0, 1, 1, 1}, scale);
+    auto const result_nulls =
+      cudf::scan(with_nulls, cudf::make_max_aggregation(), scan_type::INCLUSIVE);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result_nulls->view(), with_nulls);
   }
 }

From 8b2ae92a54367c0cdb2c9afe2c98b40f643b708a Mon Sep 17 00:00:00 2001
From: Keith Kraus <kkraus@nvidia.com>
Date: Mon, 8 Mar 2021 23:20:29 -0500
Subject: [PATCH 8/8] fix missing renames of dask git branches from master to
 main (#7535)

There were a few renames of master --> main that were missed for the recent dask branch rename, fixed them.

Authors:
  - Keith Kraus (@kkraus14)

Approvers:
  - AJ Schmidt (@ajschmidt8)
  - GALI PREM SAGAR (@galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/7535
---
 ci/benchmark/build.sh               | 8 ++++----
 conda/recipes/dask-cudf/run_test.sh | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh
index a9398f4527c..8dd133c8fa3 100755
--- a/ci/benchmark/build.sh
+++ b/ci/benchmark/build.sh
@@ -75,10 +75,10 @@ conda install "rmm=$MINOR_VERSION.*" "cudatoolkit=$CUDA_REL" \
 # conda install "your-pkg=1.0.0"
 
 # Install the master version of dask, distributed, and streamz
-logger "pip install git+https://github.com/dask/distributed.git@master --upgrade --no-deps"
-pip install "git+https://github.com/dask/distributed.git@master" --upgrade --no-deps
-logger "pip install git+https://github.com/dask/dask.git@master --upgrade --no-deps"
-pip install "git+https://github.com/dask/dask.git@master" --upgrade --no-deps
+logger "pip install git+https://github.com/dask/distributed.git@main --upgrade --no-deps"
+pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps
+logger "pip install git+https://github.com/dask/dask.git@main --upgrade --no-deps"
+pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps
 logger "pip install git+https://github.com/python-streamz/streamz.git --upgrade --no-deps"
 pip install "git+https://github.com/python-streamz/streamz.git" --upgrade --no-deps
 
diff --git a/conda/recipes/dask-cudf/run_test.sh b/conda/recipes/dask-cudf/run_test.sh
index 0fc29d42721..3fc1182b33b 100644
--- a/conda/recipes/dask-cudf/run_test.sh
+++ b/conda/recipes/dask-cudf/run_test.sh
@@ -9,11 +9,11 @@ function logger() {
 }
 
 # Install the latest version of dask and distributed
-logger "pip install git+https://github.com/dask/distributed.git@master --upgrade --no-deps"
-pip install "git+https://github.com/dask/distributed.git@master" --upgrade --no-deps
+logger "pip install git+https://github.com/dask/distributed.git@main --upgrade --no-deps"
+pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps
 
-logger "pip install git+https://github.com/dask/dask.git@master --upgrade --no-deps"
-pip install "git+https://github.com/dask/dask.git@master" --upgrade --no-deps
+logger "pip install git+https://github.com/dask/dask.git@main --upgrade --no-deps"
+pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps
 
 logger "python -c 'import dask_cudf'"
 python -c "import dask_cudf"