Merge branch 'branch-0.4' into decimal_divide
revans2 committed Feb 4, 2021
2 parents 98c5562 + a600bb7 commit 3352744
Showing 96 changed files with 4,232 additions and 603 deletions.
154 changes: 86 additions & 68 deletions .github/workflows/blossom-ci.yml
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -12,102 +12,120 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# A workflow to trigger blossom-CI on self-hosted runner
# A workflow to trigger ci on hybrid infra (github + self hosted runner)
name: Blossom-CI
on:
issue_comment:
types: [created]

workflow_dispatch:
inputs:
platform:
description: 'runs-on argument'
required: false
args:
description: 'argument'
required: false
jobs:
authorization:
Authorization:
name: Authorization
# trigger on pre-defined text
if: github.event.comment.body == 'build'
runs-on: [self-hosted, linux, blossom]
runs-on: blossom
outputs:
args: ${{ env.args }}

# This job only runs for pull request comments
if: contains( '\
abellina,\
andygrove,\
anfeng,\
firestarman,\
GaryShen2008,\
jlowe,\
krajendrannv,\
kuhushukla,\
mythrocks,\
nartal1,\
nvdbaranec,\
NvTimLiu,\
razajafri,\
revans2,\
rongou,\
rwlee,\
sameerz,\
shotai,\
sriramch,\
tgravescs,\
wbo4958,\
wjxiz1992,\
yuange98,\
sperlingxx,\
pxLi,\
hyperbolic2346,\
gerashegalov,\
ttnghia,\
', format('{0},', github.actor)) && github.event.comment.body == 'build'
steps:
- name: Check if comment is issued by authorized person
run: blossom-ci
env:
OPERATION: 'AUTH'
VERSION: '1'
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}

vulnerability-scan-job:
name: Vulnerability scan job
needs: [authorization]
Vulnerability-scan:
name: Vulnerability scan
needs: [Authorization]
runs-on: ubuntu-latest
steps:
- name: Get pull request data
id: pull_request_data
uses: octokit/[email protected]
with:
route: GET /repos/${{ github.repository }}/pulls/{issue_id}
issue_id: ${{ github.event.issue.number }}
env:
GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"

- name: Set blackduck project version
id: blackduck-project-version
env:
REF: ${{ fromJson(steps.pull_request_data.outputs.data).head.ref }}
run: echo "$REF-${{ github.run_id }}"

- name: Update status
uses: octokit/[email protected]
with:
route: POST /repos/${{ github.repository }}/statuses/{sha}
sha: ${{ fromJson(steps.pull_request_data.outputs.data).head.sha }}
target_url: "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
description: "vulnerability scan running"
state: "pending"
context: "blossom-ci"
env:
GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"

- name: Checkout code
uses: actions/checkout@v2
with:
repository: ${{ fromJson(steps.pull_request_data.outputs.data).head.repo.full_name }}
ref: ${{ fromJson(steps.pull_request_data.outputs.data).head.ref }}
repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}
lfs: 'true'

# repo specific steps
- name: Setup java
uses: actions/setup-java@v1
with:
java-version: 1.8

- name: Get project data (maven)
run: echo "PROJECTS=$(mvn -am dependency:tree | grep maven-dependency-plugin | awk '{ out="com.nvidia:"$(NF-1);print out }' | grep rapids | xargs | sed -e 's/ /,/g')" >> $GITHUB_ENV

- name: Add mask
run: echo "::add-mask::${{ secrets.BLACKDUCK_URL }}"
# add blackduck properties https://synopsys.atlassian.net/wiki/spaces/INTDOCS/pages/631308372/Methods+for+Configuring+Analysis#Using-a-configuration-file
- name: Setup blackduck properties
run: |
PROJECTS=$(mvn -am dependency:tree | grep maven-dependency-plugin | awk '{ out="com.nvidia:"$(NF-1);print out }' | grep rapids | xargs | sed -e 's/ /,/g')
echo detect.maven.build.command="-pl=$PROJECTS -am" >> application.properties
echo detect.maven.included.scopes=compile >> application.properties
- name: Run synopsys detect
id: scan_result
uses: blackducksoftware/[email protected]
- name: Run blossom action
uses: ravitestgit/blossom-action@main
env:
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
with:
args: >
--blackduck.url="https://${{ secrets.BLACKDUCK_URL }}"
--blackduck.api.token="${{ secrets.BLACKDUCK_API_TOKEN }}"
--detect.maven.build.command="-pl='$PROJECTS -am'"
--detect.maven.included.scopes=compile
--detect.force.success=false
--detect.tools.excluded=SIGNATURE_SCAN
--detect.parallel.processors=0
--detect.project.name="${{ github.repository }}"
--detect.project.version.name="${{ github.run_id }}"
args1: ${{ fromJson(needs.Authorization.outputs.args).args1 }}
args2: ${{ fromJson(needs.Authorization.outputs.args).args2 }}
args3: ${{ fromJson(needs.Authorization.outputs.args).args3 }}

Job-trigger:
name: Start ci job
needs: [Vulnerability-scan]
runs-on: blossom
steps:
- name: Start ci job
run: blossom-ci
env:
OPERATION: 'START-CI-JOB'
CI_SERVER: ${{ secrets.CI_SERVER }}
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}

vulnerability-check-trigger:
name: Vulnerability check & start ci job
needs: [vulnerability-scan-job]
runs-on: [self-hosted, linux, blossom]
Post-processing:
name: Post processing
runs-on: blossom
if : github.event_name == 'workflow_dispatch'
steps:
- name: Check for new issue in vulnerability scan & start ci job
- name: Start post processing
run: blossom-ci
env:
OPERATION: 'SCAN-CHECK-CI-JOB-START'
VERSION: '1'
BLACKDUCK_TOKEN: "${{ secrets.BLACKDUCK_API_TOKEN }}"
BLACKDUCK_URL: "${{ secrets.BLACKDUCK_URL }}"
BLACKDUCK_PROJECT_VERSION: "${{ github.run_id }}"
OPERATION: 'POST-PROCESSING'
CI_SERVER: ${{ secrets.CI_SERVER }}
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
23 changes: 23 additions & 0 deletions CONTRIBUTING.md
@@ -17,6 +17,29 @@ Contributions to RAPIDS Accelerator for Apache Spark fall into the following thr
follow the [code contributions](#code-contributions) guide below. If you
need more context on a particular issue, please ask in a comment.

## Branching Convention

There are two types of branches in this repository:

* `branch-[version]`: are development branches which can change often. Note that we merge into
the branch with the greatest version number, as that is our default branch.

* `main`: is the branch with the latest released code, and the version tag (i.e. `v0.1.0`)
is held here. `main` will change with new releases, but otherwise it should not change with
every pull request merged, making it a more stable branch.
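
For example, a minimal sketch of starting new work against the current development branch
(assuming `branch-0.4` is the highest-versioned development branch at the time):

```shell script
# Clone the repository and switch to the current development branch.
git clone https://github.com/NVIDIA/spark-rapids.git
cd spark-rapids
git checkout branch-0.4
```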

## Building From Source

We use [Maven](https://maven.apache.org) for most aspects of the build. Some important parts
of the build execute in the `verify` phase of the Maven build lifecycle. We recommend when
building at least running to the `verify` phase, e.g.:

```shell script
mvn verify
```

After a successful build the RAPIDS Accelerator jar will be in the `dist/target/` directory.
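
For instance, a quick way to confirm the artifact is present (the jar name pattern here is an
assumption and varies with the plugin and Scala versions):

```shell script
# List the plugin jar produced by the build.
ls dist/target/rapids-4-spark_*.jar
```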

## Code contributions

### Your first issue
18 changes: 2 additions & 16 deletions README.md
@@ -42,23 +42,9 @@ may file one [here](https://github.com/NVIDIA/spark-rapids/issues/new/choose).
The jar files for the most recent release can be retrieved from the [download](docs/download.md)
page.

## Build
## Building From Source

There are two types of branches in this repository:

* `branch-[version]`: are development branches which can change often. Note that we merge into
the branch with the greatest version number, as that is our default branch.

* `main`: is the branch with the latest released code, and the version tag (i.e. `v0.1.0`)
is held here. `main` will change with new releases, but otherwise it should not change with
every pull request merged, making it a more stable branch.

We use maven for most aspects of the build. Some important parts of the build execute in
the "verify" phase of maven. We recommend when building at least running to the "verify" phase.

```shell script
mvn verify
```
See the [build instructions in the contributing guide](CONTRIBUTING.md#building-from-source).

## Testing

120 changes: 116 additions & 4 deletions docs/FAQ.md
@@ -1,7 +1,7 @@
---
layout: page
title: Frequently Asked Questions
nav_order: 10
nav_order: 11
---
# Frequently Asked Questions

@@ -17,6 +17,87 @@ with `collect`, `show` or `write` a new `DataFrame` is constructed causing Spark
query. This is why `spark.rapids.sql.enabled` is still respected when running, even if explain shows
stale results.

### How can I tell what will run on the GPU and what will not run on it?
<a name="explain"></a>

An Apache Spark plan is transformed and optimized into a set of operators called a physical plan.
This plan is then run through a set of rules to translate it to a version that runs on the GPU.
If you want to know what will run on the GPU and what will not, along with an explanation why, you
can set [spark.rapids.sql.explain](configs.md#sql.explain) to `ALL`. If you just want to see the
operators not on the GPU you may set it to `NOT_ON_GPU`. Be aware that some queries end up being
broken down into multiple jobs, and in those cases a separate log message might be output for each
job. These are logged each time a query is compiled into an `RDD`, not just when the job runs.
Because of this, calling `explain` on a DataFrame will also trigger this to be logged.
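
A minimal sketch of enabling this when launching a Spark shell, assuming a session that is already
configured to use the RAPIDS Accelerator (only the explain setting is shown here):

```shell script
# Log an explanation for every operator; use NOT_ON_GPU to log only the operators that stay on the CPU.
spark-shell --conf spark.rapids.sql.explain=ALL
```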

The format of each line follows the pattern
```
indicator operation<NAME> operator? explanation
```

In this format, `indicator` is one of the following:
* `*` for operations that will run on the GPU
* `@` for operations that could run on the GPU but will not because they are a part of a larger
section of the plan that will not run on the GPU
* `#` for operations that have been removed from the plan. The reason they are removed will be
in the explanation.
* `!` for operations that cannot run on the GPU

`operation` indicates the type of the operator.
* `Expression` These are typically functions that operate on columns of data and produce a column
of data.
* `Exec` These are higher level operations that operate on an entire table at a time.
* `Partitioning` These are different types of partitioning used when reorganizing data to move to
different tasks.
* `Input` These are different input formats used with a few input statements, but not all.
* `Output` These are different output formats used with a few output statements, but not all.
* `NOT_FOUND` These are for anything that the plugin has no replacement rule for.

`NAME` is the name of the operator given by Spark.

`operator?` is an optional string representation of the operator given by Spark.

`explanation` is a text explanation saying if this will
* run on the GPU
* could run on the GPU but will not because of something outside this operator and an
explanation why
* will not run on the GPU with an explanation why
* will be removed from the plan with a reason why

Generally, if an operator is not compatible with Spark for some reason and has been disabled, the
explanation will include information about how it is incompatible and which configs to set to
enable the operator if you can accept the incompatibility.

### Why does the plan for the GPU query look different from the CPU query?

Typically, there is a one to one mapping between CPU stages in a plan and GPU stages. There are a
few places where this is not the case.

* `WholeStageCodeGen` - The GPU plan typically does not do code generation, and does not support
generating code for an entire stage in the plan. Code generation reduces the cost of processing
data one row at a time. The GPU plan processes the data in a columnar format, so the cost
of processing a batch is amortized over the entire batch of data and code generation is not
needed.

* `ColumnarToRow` and `RowToColumnar` transitions - The CPU version of a Spark plan typically processes
data in a row-based format. The main exception to this is reading some kinds of columnar data,
like Parquet. Transitioning between the CPU and the GPU also requires transitioning between row
and columnar formatted data.

* `GpuCoalesceBatches` and `GpuShuffleCoalesce` - Processing data on the GPU scales
sublinearly. That means doubling the amount of data often takes less than double the time to process.
Because of this we want to process larger batches of data when possible. These operators will try to combine
smaller batches of data into fewer, larger batches to process more efficiently.

* `SortMergeJoin` - The RAPIDS accelerator does not support sort merge joins yet. For now, we
translate sort merge joins into shuffled hash joins. Because of this there are times when sorts
may be removed or other sorts added to meet the ordering requirements of the query.

* `TakeOrderedAndProject` - The `TakeOrderedAndProject` operator will take the top N entries in
each task, shuffle the results to a single executor and then take the top N results from that.
The GPU plan often has more metrics than the CPU versions do, and when we tried to combine all of
these operations into a single stage the metrics were confusing to understand. Instead, we split
the single stage up into multiple smaller parts, so the metrics are clearer.

### What versions of Apache Spark does the RAPIDS Accelerator for Apache Spark support?

The RAPIDS Accelerator for Apache Spark requires version 3.0.0 or 3.0.1 of Apache Spark. Because the
@@ -166,15 +247,46 @@ the I/O and starting the initial processing can suffer. But if you have a lot o
cannot be done on the GPU, like complex UDFs, the more tasks you have the more CPU processing you
can throw at it.

### Why are multiple GPUs per executor not supported?

The RAPIDS Accelerator only supports a single GPU per executor because that was a limitation of
[RAPIDS cudf](https://github.com/rapidsai/cudf), the foundation of the Accelerator. Basic support
for working with multiple GPUs has only recently been added to RAPIDS cudf, and there are no plans
for its individual operations to leverage multiple GPUs (e.g.: a single task's join operation
processed by multiple GPUs).

Many Spark setups avoid allocating too many concurrent tasks to the same executor, and often
multiple executors are run per node on the cluster. Therefore this feature has not been
prioritized, as there has not been a compelling use-case that requires it.

### Why are multiple executors per GPU not supported?

There are multiple reasons why this is a problematic configuration:
- Apache Spark does not support scheduling a fractional number of GPUs to an executor
- CUDA context switches between processes sharing a single GPU can be expensive
- Each executor would have a fraction of the GPU memory available for processing

### Is [Multi Instance GPU (MIG)](https://docs.nvidia.com/cuda/mig/index.html) supported?

Yes, but it requires support from the underlying cluster manager to isolate the MIG GPU instance
for each executor (e.g.: by setting `CUDA_VISIBLE_DEVICES` or other means).
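
As an illustration only (the exact mechanism depends on the cluster manager, driver, and CUDA
version), an executor process could be restricted to a single MIG instance through its
environment; the device identifier below is a placeholder:

```shell script
# List the GPUs and MIG instances available on the node.
nvidia-smi -L
# Expose exactly one MIG instance to this executor process (placeholder identifier).
export CUDA_VISIBLE_DEVICES=MIG-<gpu-instance-uuid>
```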

Note that MIG is not recommended for use with the RAPIDS Accelerator since it significantly
reduces the amount of GPU memory that can be used by the Accelerator for each executor instance.
If the cluster is purpose-built to run Spark with the RAPIDS Accelerator then we recommend running
without MIG. Also note that the UCX-based shuffle plugin will not work as well in this
configuration because
[MIG does not support direct GPU to GPU transfers](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#app-considerations).

### How can I run custom expressions/UDFs on the GPU?

The RAPIDS Accelerator provides the following solutions for running
user-defined functions on the GPU:

#### RAPIDS-Accelerated UDFs
#### RAPIDS Accelerated UDFs

UDFs can provide a RAPIDS-accelerated implementation which allows the RAPIDS Accelerator to perform
the operation on the GPU. See the [RAPIDS-accelerated UDF documentation](../docs/rapids-udfs.md)
UDFs can provide a RAPIDS accelerated implementation which allows the RAPIDS Accelerator to perform
the operation on the GPU. See the [RAPIDS accelerated UDF documentation](additional-functionality/rapids-udfs.md)
for details.

#### Automatic Translation of Scala UDFs to Apache Spark Operations