Merge branch 'apache:main' into dev

comphead · Nov 21, 2024 · 1616b99 · 1616b99
2 parents 59d9b43 + edbd93a
commit 1616b99
Show file tree

Hide file tree

Showing 128 changed files with 4,484 additions and 1,579 deletions.
diff --git a/.github/actions/setup-builder/action.yaml b/.github/actions/setup-builder/action.yaml
@@ -28,18 +28,18 @@ runs:
     - name: Install Build Dependencies
       shell: bash
       run: |
-        RETRY="ci/scripts/retry"
-        "${RETRY}" apt-get update
-        "${RETRY}" apt-get install -y protobuf-compiler
+        RETRY=("ci/scripts/retry" timeout 120)
+        "${RETRY[@]}" apt-get update
+        "${RETRY[@]}" apt-get install -y protobuf-compiler
     - name: Setup Rust toolchain
       shell: bash
       # rustfmt is needed for the substrait build script
       run: |
-        RETRY="ci/scripts/retry"
+        RETRY=("ci/scripts/retry" timeout 120)
         echo "Installing ${{ inputs.rust-version }}"
-        "${RETRY}" rustup toolchain install ${{ inputs.rust-version }}
-        "${RETRY}" rustup default ${{ inputs.rust-version }}
-        "${RETRY}" rustup component add rustfmt
+        "${RETRY[@]}" rustup toolchain install ${{ inputs.rust-version }}
+        "${RETRY[@]}" rustup default ${{ inputs.rust-version }}
+        "${RETRY[@]}" rustup component add rustfmt
     - name: Configure rust runtime env
       uses: ./.github/actions/setup-rust-runtime
     - name: Fixup git permissions

diff --git a/Cargo.toml b/Cargo.toml
@@ -74,22 +74,22 @@ version = "43.0.0"
 ahash = { version = "0.8", default-features = false, features = [
     "runtime-rng",
 ] }
-arrow = { version = "53.2.0", features = [
+arrow = { version = "53.3.0", features = [
     "prettyprint",
 ] }
-arrow-array = { version = "53.2.0", default-features = false, features = [
+arrow-array = { version = "53.3.0", default-features = false, features = [
     "chrono-tz",
 ] }
-arrow-buffer = { version = "53.2.0", default-features = false }
-arrow-flight = { version = "53.2.0", features = [
+arrow-buffer = { version = "53.3.0", default-features = false }
+arrow-flight = { version = "53.3.0", features = [
     "flight-sql-experimental",
 ] }
-arrow-ipc = { version = "53.2.0", default-features = false, features = [
+arrow-ipc = { version = "53.3.0", default-features = false, features = [
     "lz4",
 ] }
-arrow-ord = { version = "53.2.0", default-features = false }
-arrow-schema = { version = "53.2.0", default-features = false }
-arrow-string = { version = "53.2.0", default-features = false }
+arrow-ord = { version = "53.3.0", default-features = false }
+arrow-schema = { version = "53.3.0", default-features = false }
+arrow-string = { version = "53.3.0", default-features = false }
 async-trait = "0.1.73"
 bigdecimal = "=0.4.1"
 bytes = "1.4"
@@ -131,7 +131,7 @@ log = "^0.4"
 num_cpus = "1.13.0"
 object_store = { version = "0.11.0", default-features = false }
 parking_lot = "0.12"
-parquet = { version = "53.2.0", default-features = false, features = [
+parquet = { version = "53.3.0", default-features = false, features = [
     "arrow",
     "async",
     "object_store",

diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -330,6 +330,30 @@ steps.
 The tests sort the entire dataset using several different sort
 orders.
 
+## Sort TPCH
+
+Tests the performance of end-to-end sort SQL queries. (While the `Sort` benchmark focuses on a single sort executor, this benchmark tests how sorting is executed across multiple CPU cores by sorting an entire relational table.)
+
+The sort integration benchmark runs whole-table sort queries on the TPCH `lineitem` table, each with different characteristics: for example, a different number of sort keys, a different sort key cardinality, or a different number of payload columns.
+
+See [`sort_tpch.rs`](src/sort_tpch.rs) for more details.
+
+### Sort TPCH Benchmark Example Runs
+1. Run all queries with the default settings:
+```bash
+ cargo run --release --bin dfbench -- sort-tpch -p '....../datafusion/benchmarks/data/tpch_sf1' -o '/tmp/sort_tpch.json'
+```
+
+2. Run a specific query:
+```bash
+ cargo run --release --bin dfbench -- sort-tpch -p '....../datafusion/benchmarks/data/tpch_sf1' -o '/tmp/sort_tpch.json' --query 2
+```
+
+3. Run all queries with the `bench.sh` script:
+```bash
+./bench.sh run sort_tpch
+```
+
 ## IMDB
 
 Run Join Order Benchmark (JOB) on IMDB dataset.

diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
@@ -75,6 +75,7 @@ tpch10:                 TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB),
 tpch_mem10:             TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory
 parquet:                Benchmark of parquet reader's filtering speed
 sort:                   Benchmark of sorting speed
+sort_tpch:              Benchmark of sorting speed for end-to-end sort queries on TPCH dataset
 clickbench_1:           ClickBench queries against a single parquet file
 clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet
 clickbench_extended:    ClickBench \"inspired\" queries against a single parquet (DataFusion specific)
@@ -175,6 +176,10 @@ main() {
                     # same data as for tpch
                     data_tpch "1"
                     ;;
+                sort_tpch)
+                    # same data as for tpch
+                    data_tpch "1"
+                    ;;
                 *)
                     echo "Error: unknown benchmark '$BENCHMARK' for data generation"
                     usage
@@ -252,6 +257,9 @@ main() {
                 external_aggr)
                     run_external_aggr
                     ;;
+                sort_tpch)
+                    run_sort_tpch
+                    ;;
                 *)
                     echo "Error: unknown benchmark '$BENCHMARK' for run"
                     usage
@@ -549,6 +557,16 @@ run_external_aggr() {
     $CARGO_COMMAND --bin external_aggr -- benchmark --partitions 4 --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
 }
 
+# Runs the sort integration benchmark
+run_sort_tpch() {
+    TPCH_DIR="${DATA_DIR}/tpch_sf1"
+    RESULTS_FILE="${RESULTS_DIR}/sort_tpch.json"
+    echo "RESULTS_FILE: ${RESULTS_FILE}"
+    echo "Running sort tpch benchmark..."
+
+    $CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
+}
+
 
 compare_benchmarks() {
     BASE_RESULTS_DIR="${SCRIPT_DIR}/results"

diff --git a/benchmarks/src/bin/dfbench.rs b/benchmarks/src/bin/dfbench.rs
@@ -33,7 +33,7 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc;
 #[global_allocator]
 static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
 
-use datafusion_benchmarks::{clickbench, imdb, parquet_filter, sort, tpch};
+use datafusion_benchmarks::{clickbench, imdb, parquet_filter, sort, sort_tpch, tpch};
 
 #[derive(Debug, StructOpt)]
 #[structopt(about = "benchmark command")]
@@ -43,6 +43,7 @@ enum Options {
     Clickbench(clickbench::RunOpt),
     ParquetFilter(parquet_filter::RunOpt),
     Sort(sort::RunOpt),
+    SortTpch(sort_tpch::RunOpt),
     Imdb(imdb::RunOpt),
 }
 
@@ -57,6 +58,7 @@ pub async fn main() -> Result<()> {
         Options::Clickbench(opt) => opt.run().await,
         Options::ParquetFilter(opt) => opt.run().await,
         Options::Sort(opt) => opt.run().await,
+        Options::SortTpch(opt) => opt.run().await,
         Options::Imdb(opt) => opt.run().await,
     }
 }
diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs
@@ -20,5 +20,6 @@ pub mod clickbench;
 pub mod imdb;
 pub mod parquet_filter;
 pub mod sort;
+pub mod sort_tpch;
 pub mod tpch;
 pub mod util;