Merge branch 'apache:main' into dev
comphead authored Jun 18, 2024
2 parents 9fb05c6 + a2c9d1a commit 0a7e93f
Showing 346 changed files with 22,232 additions and 13,510 deletions.
78 changes: 45 additions & 33 deletions Cargo.toml
@@ -52,8 +52,8 @@ homepage = "https://datafusion.apache.org"
license = "Apache-2.0"
readme = "README.md"
repository = "https://github.com/apache/datafusion"
rust-version = "1.73"
version = "38.0.0"
rust-version = "1.75"
version = "39.0.0"

[workspace.dependencies]
# We turn off default-features for some dependencies here so the workspaces which inherit them can
@@ -64,54 +64,66 @@ version = "38.0.0"
ahash = { version = "0.8", default-features = false, features = [
"runtime-rng",
] }
arrow = { version = "51.0.0", features = ["prettyprint"] }
arrow-array = { version = "51.0.0", default-features = false, features = ["chrono-tz"] }
arrow-buffer = { version = "51.0.0", default-features = false }
arrow-flight = { version = "51.0.0", features = ["flight-sql-experimental"] }
arrow-ipc = { version = "51.0.0", default-features = false, features = ["lz4"] }
arrow-ord = { version = "51.0.0", default-features = false }
arrow-schema = { version = "51.0.0", default-features = false }
arrow-string = { version = "51.0.0", default-features = false }
arrow = { version = "52.0.0", features = [
"prettyprint",
] }
arrow-array = { version = "52.0.0", default-features = false, features = [
"chrono-tz",
] }
arrow-buffer = { version = "52.0.0", default-features = false }
arrow-flight = { version = "52.0.0", features = [
"flight-sql-experimental",
] }
arrow-ipc = { version = "52.0.0", default-features = false, features = [
"lz4",
] }
arrow-ord = { version = "52.0.0", default-features = false }
arrow-schema = { version = "52.0.0", default-features = false }
arrow-string = { version = "52.0.0", default-features = false }
async-trait = "0.1.73"
bigdecimal = "=0.4.1"
bytes = "1.4"
chrono = { version = "0.4.34", default-features = false }
ctor = "0.2.0"
dashmap = "5.4.0"
datafusion = { path = "datafusion/core", version = "38.0.0", default-features = false }
datafusion-common = { path = "datafusion/common", version = "38.0.0", default-features = false }
datafusion-common-runtime = { path = "datafusion/common-runtime", version = "38.0.0" }
datafusion-execution = { path = "datafusion/execution", version = "38.0.0" }
datafusion-expr = { path = "datafusion/expr", version = "38.0.0" }
datafusion-functions = { path = "datafusion/functions", version = "38.0.0" }
datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "38.0.0" }
datafusion-functions-array = { path = "datafusion/functions-array", version = "38.0.0" }
datafusion-optimizer = { path = "datafusion/optimizer", version = "38.0.0", default-features = false }
datafusion-physical-expr = { path = "datafusion/physical-expr", version = "38.0.0", default-features = false }
datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "38.0.0", default-features = false }
datafusion-physical-plan = { path = "datafusion/physical-plan", version = "38.0.0" }
datafusion-proto = { path = "datafusion/proto", version = "38.0.0" }
datafusion-proto-common = { path = "datafusion/proto-common", version = "38.0.0" }
datafusion-sql = { path = "datafusion/sql", version = "38.0.0" }
datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "38.0.0" }
datafusion-substrait = { path = "datafusion/substrait", version = "38.0.0" }
dashmap = "5.5.0"
datafusion = { path = "datafusion/core", version = "39.0.0", default-features = false }
datafusion-common = { path = "datafusion/common", version = "39.0.0", default-features = false }
datafusion-common-runtime = { path = "datafusion/common-runtime", version = "39.0.0" }
datafusion-execution = { path = "datafusion/execution", version = "39.0.0" }
datafusion-expr = { path = "datafusion/expr", version = "39.0.0" }
datafusion-functions = { path = "datafusion/functions", version = "39.0.0" }
datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "39.0.0" }
datafusion-functions-array = { path = "datafusion/functions-array", version = "39.0.0" }
datafusion-optimizer = { path = "datafusion/optimizer", version = "39.0.0", default-features = false }
datafusion-physical-expr = { path = "datafusion/physical-expr", version = "39.0.0", default-features = false }
datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "39.0.0", default-features = false }
datafusion-physical-plan = { path = "datafusion/physical-plan", version = "39.0.0" }
datafusion-proto = { path = "datafusion/proto", version = "39.0.0" }
datafusion-proto-common = { path = "datafusion/proto-common", version = "39.0.0" }
datafusion-sql = { path = "datafusion/sql", version = "39.0.0" }
datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "39.0.0" }
datafusion-substrait = { path = "datafusion/substrait", version = "39.0.0" }
doc-comment = "0.3"
env_logger = "0.11"
futures = "0.3"
half = { version = "2.2.1", default-features = false }
hashbrown = { version = "0.14", features = ["raw"] }
hashbrown = { version = "0.14.5", features = ["raw"] }
indexmap = "2.0.0"
itertools = "0.12"
log = "^0.4"
num_cpus = "1.13.0"
object_store = { version = "0.9.1", default-features = false }
object_store = { version = "0.10.1", default-features = false }
parking_lot = "0.12"
parquet = { version = "51.0.0", default-features = false, features = ["arrow", "async", "object_store"] }
parquet = { version = "52.0.0", default-features = false, features = [
"arrow",
"async",
"object_store",
] }
rand = "0.8"
regex = "1.8"
rstest = "0.19.0"
rstest = "0.21.0"
serde_json = "1"
sqlparser = { version = "0.45.0", features = ["visitor"] }
sqlparser = { version = "0.47", features = ["visitor"] }
tempfile = "3"
thiserror = "1.0.44"
tokio = { version = "1.36", features = ["macros", "rt", "sync"] }
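
For downstream projects, the substance of this hunk is a coordinated bump: the arrow/parquet crates move from 51.0.0 to 52.0.0, every DataFusion workspace crate from 38.0.0 to 39.0.0, and the MSRV from Rust 1.73 to 1.75. A minimal sketch of how a dependent project might check that its own pins follow the upgrade (illustrative only; it assumes `datafusion` and `arrow-array` are already in that project's dependency graph):

```shell
# Refresh the lockfile, then confirm no 38.x DataFusion or 51.x arrow crates
# are still being pulled in transitively.
cargo update
cargo tree -i datafusion | head -n 20
cargo tree -i arrow-array | head -n 20
```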
3 changes: 2 additions & 1 deletion benchmarks/.gitignore
@@ -1,2 +1,3 @@
data
results
results
venv
10 changes: 8 additions & 2 deletions benchmarks/README.md
@@ -67,6 +67,13 @@ Create / download a specific dataset (TPCH)

Data is placed in the `data` subdirectory.

## Select join algorithm
The benchmark runs with `prefer_hash_join == true` by default, which enforces the hash join algorithm.
To run the TPCH benchmarks with a join algorithm other than hash join:
```shell
PREFER_HASH_JOIN=false ./bench.sh run tpch
```

## Comparing performance of main and a branch

```shell
@@ -177,7 +184,6 @@ The benchmark program also supports CSV and Parquet input file formats and a uti
```bash
cargo run --release --bin tpch -- convert --input ./data --output /mnt/tpch-parquet --format parquet
```

Or if you want to verify and run all the queries in the benchmark, you can just run `cargo test`.

### Comparing results between runs
@@ -261,7 +267,7 @@ SUBCOMMANDS:

# Benchmarks

The output of `dfbench` help includes a descripion of each benchmark, which is reproducedd here for convenience
The output of `dfbench` help includes a description of each benchmark, which is reproduced here for convenience

## ClickBench

66 changes: 30 additions & 36 deletions benchmarks/bench.sh
@@ -36,6 +36,8 @@ DATAFUSION_DIR=${DATAFUSION_DIR:-$SCRIPT_DIR/..}
DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data}
#CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"}
CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --profile release-nonlto"} # for faster iterations
PREFER_HASH_JOIN=${PREFER_HASH_JOIN:-true}
VIRTUAL_ENV=${VIRTUAL_ENV:-$SCRIPT_DIR/venv}

usage() {
echo "
@@ -45,30 +47,32 @@ Usage:
$0 data [benchmark]
$0 run [benchmark]
$0 compare <branch1> <branch2>
$0 venv
**********
Examples:
**********
# Create the datasets for all benchmarks in $DATA_DIR
./bench.sh data
# Run the 'tpch' benchmark on the datafusion checkout in /source/arrow-datafusion
DATAFUSION_DIR=/source/arrow-datafusion ./bench.sh run tpch
# Run the 'tpch' benchmark on the datafusion checkout in /source/datafusion
DATAFUSION_DIR=/source/datafusion ./bench.sh run tpch
**********
* Commands
**********
data: Generates or downloads data needed for benchmarking
run: Runs the named benchmark
compare: Compares results from benchmark runs
venv: Creates new venv (unless already exists) and installs compare's requirements into it
**********
* Benchmarks
**********
all(default): Data/Run/Compare for all benchmarks
tpch: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table
tpch: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table, hash join
tpch_mem: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), query from memory
tpch10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single parquet file per table
tpch10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single parquet file per table, hash join
tpch_mem10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory
parquet: Benchmark of parquet reader's filtering speed
sort: Benchmark of sorting speed
@@ -79,10 +83,12 @@ clickbench_extended: ClickBench "inspired" queries against a single parquet (
**********
* Supported Configuration (Environment Variables)
**********
DATA_DIR directory to store datasets
CARGO_COMMAND command that runs the benchmark binary
DATAFUSION_DIR directory to use (default $DATAFUSION_DIR)
RESULTS_NAME folder where the benchmark files are stored
DATA_DIR directory to store datasets
CARGO_COMMAND command that runs the benchmark binary
DATAFUSION_DIR directory to use (default $DATAFUSION_DIR)
RESULTS_NAME folder where the benchmark files are stored
PREFER_HASH_JOIN Prefer hash join algorithm (default true)
VENV_PATH Python venv to use for compare and venv commands (default ./venv, override by <your-venv>/bin/activate)
"
exit 1
}
@@ -129,6 +135,7 @@ main() {
echo "BENCHMARK: ${BENCHMARK}"
echo "DATA_DIR: ${DATA_DIR}"
echo "CARGO_COMMAND: ${CARGO_COMMAND}"
echo "PREFER_HASH_JOIN: ${PREFER_HASH_JOIN}"
echo "***************************"
case "$BENCHMARK" in
all)
@@ -183,6 +190,7 @@ main() {
echo "DATA_DIR: ${DATA_DIR}"
echo "RESULTS_DIR: ${RESULTS_DIR}"
echo "CARGO_COMMAND: ${CARGO_COMMAND}"
echo "PREFER_HASH_JOIN": ${PREFER_HASH_JOIN}
echo "***************************"

# navigate to the appropriate directory
@@ -213,12 +221,6 @@ main() {
tpch_mem10)
run_tpch_mem "10"
;;
tpch_smj)
run_tpch_smj "1"
;;
tpch_smj10)
run_tpch_smj "10"
;;
parquet)
run_parquet
;;
@@ -245,6 +247,9 @@ main() {
compare)
compare_benchmarks "$ARG2" "$ARG3"
;;
venv)
setup_venv
;;
"")
usage
;;
@@ -321,22 +326,7 @@ run_tpch() {
RESULTS_FILE="${RESULTS_DIR}/tpch_sf${SCALE_FACTOR}.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running tpch benchmark..."
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --format parquet -o ${RESULTS_FILE}
}

# Runs the tpch benchmark with sort merge join
run_tpch_smj() {
SCALE_FACTOR=$1
if [ -z "$SCALE_FACTOR" ] ; then
echo "Internal error: Scale factor not specified"
exit 1
fi
TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}"

RESULTS_FILE="${RESULTS_DIR}/tpch_smj_sf${SCALE_FACTOR}.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running tpch SMJ benchmark..."
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join false --format parquet -o ${RESULTS_FILE}
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join ${PREFER_HASH_JOIN} --format parquet -o ${RESULTS_FILE}
}

# Runs the tpch in memory
@@ -352,23 +342,23 @@ run_tpch_mem() {
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running tpch_mem benchmark..."
# -m means in memory
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" -m --format parquet -o ${RESULTS_FILE}
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join ${PREFER_HASH_JOIN} -m --format parquet -o ${RESULTS_FILE}
}

# Runs the parquet filter benchmark
run_parquet() {
RESULTS_FILE="${RESULTS_DIR}/parquet.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running parquet filter benchmark..."
$CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o ${RESULTS_FILE}
$CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --prefer_hash_join ${PREFER_HASH_JOIN} --scale-factor 1.0 --iterations 5 -o ${RESULTS_FILE}
}

# Runs the sort benchmark
run_sort() {
RESULTS_FILE="${RESULTS_DIR}/sort.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running sort benchmark..."
$CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o ${RESULTS_FILE}
$CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --prefer_hash_join ${PREFER_HASH_JOIN} --scale-factor 1.0 --iterations 5 -o ${RESULTS_FILE}
}


@@ -422,7 +412,7 @@ run_clickbench_1() {
RESULTS_FILE="${RESULTS_DIR}/clickbench_1.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running clickbench (1 file) benchmark..."
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o ${RESULTS_FILE}
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o ${RESULTS_FILE}
}

# Runs the clickbench benchmark with the partitioned parquet files
@@ -441,7 +431,6 @@ run_clickbench_extended() {
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended.sql" -o ${RESULTS_FILE}
}


compare_benchmarks() {
BASE_RESULTS_DIR="${SCRIPT_DIR}/results"
BRANCH1="$1"
@@ -466,13 +455,18 @@ compare_benchmarks() {
echo "--------------------"
echo "Benchmark ${bench}"
echo "--------------------"
python3 "${SCRIPT_DIR}"/compare.py "${RESULTS_FILE1}" "${RESULTS_FILE2}"
PATH=$VIRTUAL_ENV/bin:$PATH python3 "${SCRIPT_DIR}"/compare.py "${RESULTS_FILE1}" "${RESULTS_FILE2}"
else
echo "Note: Skipping ${RESULTS_FILE1} as ${RESULTS_FILE2} does not exist"
fi
done

}

setup_venv() {
python3 -m venv $VIRTUAL_ENV
PATH=$VIRTUAL_ENV/bin:$PATH python3 -m pip install -r requirements.txt
}

# And start the process up
main
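
Taken together, the bench.sh changes replace the dedicated `tpch_smj`/`tpch_smj10` targets with a `PREFER_HASH_JOIN` switch and add a `venv` command plus a `VIRTUAL_ENV` path used by `compare`. A sketch of the resulting workflow (the branch names and the exact run sequence below are illustrative, not part of this commit):

```shell
# One-time: create ./venv next to bench.sh and install compare.py's requirements
./bench.sh venv

# Generate TPC-H SF1 data, then run the benchmark with hash join (the default)
# and again with sort-merge join via the new switch
./bench.sh data tpch
./bench.sh run tpch
PREFER_HASH_JOIN=false ./bench.sh run tpch

# Compare two previously recorded result sets; compare.py is resolved via ./venv
./bench.sh compare main my-branch
```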
2 changes: 1 addition & 1 deletion benchmarks/compare.py
@@ -29,7 +29,7 @@
from rich.console import Console
from rich.table import Table
except ImportError:
print("Try `pip install rich` for using this script.")
print("Couldn't import modules -- run `./bench.sh venv` first")
raise
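
The only change to compare.py is the hint printed when its optional dependencies are missing. Roughly what a user would now see when running it outside the venv (paths are illustrative, and the traceback assumes `rich` is not installed in the active interpreter):

```shell
python3 benchmarks/compare.py results/main/tpch_sf1.json results/mybranch/tpch_sf1.json
# Couldn't import modules -- run `./bench.sh venv` first
# Traceback (most recent call last):
#   ...
# ModuleNotFoundError: No module named 'rich'
```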


12 changes: 1 addition & 11 deletions .github_changelog_generator → benchmarks/requirements.txt
@@ -1,5 +1,3 @@
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
@@ -16,13 +14,5 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

# some issues are just documentation
add-sections={"documentation":{"prefix":"**Documentation updates:**","labels":["documentation"]},"performance":{"prefix":"**Performance improvements:**","labels":["performance"]}}
# uncomment to not show PRs. TBD if we shown them or not.
#pull-requests=false
# so that the component is shown associated with the issue
issue-line-labels=sql
exclude-labels=development-process,invalid
breaking-labels=api change
rich