Skip to content

Commit

Permalink
Merge branch 'apache:main' into dev
Browse files Browse the repository at this point in the history
  • Loading branch information
comphead authored Nov 21, 2024
2 parents 59d9b43 + edbd93a commit 1616b99
Show file tree
Hide file tree
Showing 128 changed files with 4,484 additions and 1,579 deletions.
14 changes: 7 additions & 7 deletions .github/actions/setup-builder/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,18 @@ runs:
- name: Install Build Dependencies
shell: bash
run: |
RETRY="ci/scripts/retry"
"${RETRY}" apt-get update
"${RETRY}" apt-get install -y protobuf-compiler
RETRY=("ci/scripts/retry" timeout 120)
"${RETRY[@]}" apt-get update
"${RETRY[@]}" apt-get install -y protobuf-compiler
- name: Setup Rust toolchain
shell: bash
# rustfmt is needed for the substrait build script
run: |
RETRY="ci/scripts/retry"
RETRY=("ci/scripts/retry" timeout 120)
echo "Installing ${{ inputs.rust-version }}"
"${RETRY}" rustup toolchain install ${{ inputs.rust-version }}
"${RETRY}" rustup default ${{ inputs.rust-version }}
"${RETRY}" rustup component add rustfmt
"${RETRY[@]}" rustup toolchain install ${{ inputs.rust-version }}
"${RETRY[@]}" rustup default ${{ inputs.rust-version }}
"${RETRY[@]}" rustup component add rustfmt
- name: Configure rust runtime env
uses: ./.github/actions/setup-rust-runtime
- name: Fixup git permissions
Expand Down
18 changes: 9 additions & 9 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -74,22 +74,22 @@ version = "43.0.0"
ahash = { version = "0.8", default-features = false, features = [
"runtime-rng",
] }
arrow = { version = "53.2.0", features = [
arrow = { version = "53.3.0", features = [
"prettyprint",
] }
arrow-array = { version = "53.2.0", default-features = false, features = [
arrow-array = { version = "53.3.0", default-features = false, features = [
"chrono-tz",
] }
arrow-buffer = { version = "53.2.0", default-features = false }
arrow-flight = { version = "53.2.0", features = [
arrow-buffer = { version = "53.3.0", default-features = false }
arrow-flight = { version = "53.3.0", features = [
"flight-sql-experimental",
] }
arrow-ipc = { version = "53.2.0", default-features = false, features = [
arrow-ipc = { version = "53.3.0", default-features = false, features = [
"lz4",
] }
arrow-ord = { version = "53.2.0", default-features = false }
arrow-schema = { version = "53.2.0", default-features = false }
arrow-string = { version = "53.2.0", default-features = false }
arrow-ord = { version = "53.3.0", default-features = false }
arrow-schema = { version = "53.3.0", default-features = false }
arrow-string = { version = "53.3.0", default-features = false }
async-trait = "0.1.73"
bigdecimal = "=0.4.1"
bytes = "1.4"
Expand Down Expand Up @@ -131,7 +131,7 @@ log = "^0.4"
num_cpus = "1.13.0"
object_store = { version = "0.11.0", default-features = false }
parking_lot = "0.12"
parquet = { version = "53.2.0", default-features = false, features = [
parquet = { version = "53.3.0", default-features = false, features = [
"arrow",
"async",
"object_store",
Expand Down
24 changes: 24 additions & 0 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,30 @@ steps.
The tests sort the entire dataset using several different sort
orders.

## Sort TPCH

Tests the performance of end-to-end sort SQL queries. (While the `Sort` benchmark focuses on a single sort executor, this benchmark measures how sorting is executed across multiple CPU cores by sorting an entire relational table.)

The sort integration benchmark runs whole-table sort queries on the TPCH `lineitem` table with varying characteristics: for example, different numbers of sort keys, different sort key cardinalities, different numbers of payload columns, etc.

See [`sort_tpch.rs`](src/sort_tpch.rs) for more details.

### Sort TPCH Benchmark Example Runs
1. Run all queries with the default settings:
```bash
cargo run --release --bin dfbench -- sort-tpch -p '....../datafusion/benchmarks/data/tpch_sf1' -o '/tmp/sort_tpch.json'
```

2. Run a specific query:
```bash
cargo run --release --bin dfbench -- sort-tpch -p '....../datafusion/benchmarks/data/tpch_sf1' -o '/tmp/sort_tpch.json' --query 2
```

3. Run all queries with `bench.sh` script:
```bash
./bench.sh run sort_tpch
```

## IMDB

Run Join Order Benchmark (JOB) on IMDB dataset.
Expand Down
18 changes: 18 additions & 0 deletions benchmarks/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ tpch10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB),
tpch_mem10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory
parquet: Benchmark of parquet reader's filtering speed
sort: Benchmark of sorting speed
sort_tpch: Benchmark of sorting speed for end-to-end sort queries on TPCH dataset
clickbench_1: ClickBench queries against a single parquet file
clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet
clickbench_extended: ClickBench \"inspired\" queries against a single parquet (DataFusion specific)
Expand Down Expand Up @@ -175,6 +176,10 @@ main() {
# same data as for tpch
data_tpch "1"
;;
sort_tpch)
# same data as for tpch
data_tpch "1"
;;
*)
echo "Error: unknown benchmark '$BENCHMARK' for data generation"
usage
Expand Down Expand Up @@ -252,6 +257,9 @@ main() {
external_aggr)
run_external_aggr
;;
sort_tpch)
run_sort_tpch
;;
*)
echo "Error: unknown benchmark '$BENCHMARK' for run"
usage
Expand Down Expand Up @@ -549,6 +557,16 @@ run_external_aggr() {
$CARGO_COMMAND --bin external_aggr -- benchmark --partitions 4 --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
}

# Runs the sort integration benchmark: invokes the `sort-tpch` subcommand of
# the `dfbench` binary on the TPCH data under ${DATA_DIR}/tpch_sf1, with
# 5 iterations, passing ${RESULTS_DIR}/sort_tpch.json as the `-o` argument.
#
# TPCH_DIR and RESULTS_FILE are assigned without `local`, so they remain set
# in the calling shell after this function returns.
# NOTE(review): DATA_DIR, RESULTS_DIR and CARGO_COMMAND are not defined in this
# function -- presumably set earlier in this script; confirm.
run_sort_tpch() {
TPCH_DIR="${DATA_DIR}/tpch_sf1"
RESULTS_FILE="${RESULTS_DIR}/sort_tpch.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running sort tpch benchmark..."

# CARGO_COMMAND is expanded unquoted on purpose-looking grounds: the shell
# word-splits it into a command plus its arguments. Do not quote it without
# checking how it is defined.
$CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
}


compare_benchmarks() {
BASE_RESULTS_DIR="${SCRIPT_DIR}/results"
Expand Down
4 changes: 3 additions & 1 deletion benchmarks/src/bin/dfbench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc;
#[global_allocator]
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;

use datafusion_benchmarks::{clickbench, imdb, parquet_filter, sort, tpch};
use datafusion_benchmarks::{clickbench, imdb, parquet_filter, sort, sort_tpch, tpch};

#[derive(Debug, StructOpt)]
#[structopt(about = "benchmark command")]
Expand All @@ -43,6 +43,7 @@ enum Options {
Clickbench(clickbench::RunOpt),
ParquetFilter(parquet_filter::RunOpt),
Sort(sort::RunOpt),
SortTpch(sort_tpch::RunOpt),
Imdb(imdb::RunOpt),
}

Expand All @@ -57,6 +58,7 @@ pub async fn main() -> Result<()> {
Options::Clickbench(opt) => opt.run().await,
Options::ParquetFilter(opt) => opt.run().await,
Options::Sort(opt) => opt.run().await,
Options::SortTpch(opt) => opt.run().await,
Options::Imdb(opt) => opt.run().await,
}
}
1 change: 1 addition & 0 deletions benchmarks/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,6 @@ pub mod clickbench;
pub mod imdb;
pub mod parquet_filter;
pub mod sort;
pub mod sort_tpch;
pub mod tpch;
pub mod util;
Loading

0 comments on commit 1616b99

Please sign in to comment.