Merge branch 'apache:main' into dev
comphead authored Jun 18, 2024
2 parents 9fb05c6 + a2c9d1a commit 0a7e93f
Showing 346 changed files with 22,232 additions and 13,510 deletions.
78 changes: 45 additions & 33 deletions Cargo.toml
@@ -52,8 +52,8 @@ homepage = "https://datafusion.apache.org"
license = "Apache-2.0"
readme = "README.md"
repository = "https://github.com/apache/datafusion"
rust-version = "1.73"
version = "38.0.0"
rust-version = "1.75"
version = "39.0.0"

[workspace.dependencies]
# We turn off default-features for some dependencies here so the workspaces which inherit them can
@@ -64,54 +64,66 @@ version = "38.0.0"
ahash = { version = "0.8", default-features = false, features = [
"runtime-rng",
] }
arrow = { version = "51.0.0", features = ["prettyprint"] }
arrow-array = { version = "51.0.0", default-features = false, features = ["chrono-tz"] }
arrow-buffer = { version = "51.0.0", default-features = false }
arrow-flight = { version = "51.0.0", features = ["flight-sql-experimental"] }
arrow-ipc = { version = "51.0.0", default-features = false, features = ["lz4"] }
arrow-ord = { version = "51.0.0", default-features = false }
arrow-schema = { version = "51.0.0", default-features = false }
arrow-string = { version = "51.0.0", default-features = false }
arrow = { version = "52.0.0", features = [
"prettyprint",
] }
arrow-array = { version = "52.0.0", default-features = false, features = [
"chrono-tz",
] }
arrow-buffer = { version = "52.0.0", default-features = false }
arrow-flight = { version = "52.0.0", features = [
"flight-sql-experimental",
] }
arrow-ipc = { version = "52.0.0", default-features = false, features = [
"lz4",
] }
arrow-ord = { version = "52.0.0", default-features = false }
arrow-schema = { version = "52.0.0", default-features = false }
arrow-string = { version = "52.0.0", default-features = false }
async-trait = "0.1.73"
bigdecimal = "=0.4.1"
bytes = "1.4"
chrono = { version = "0.4.34", default-features = false }
ctor = "0.2.0"
dashmap = "5.4.0"
datafusion = { path = "datafusion/core", version = "38.0.0", default-features = false }
datafusion-common = { path = "datafusion/common", version = "38.0.0", default-features = false }
datafusion-common-runtime = { path = "datafusion/common-runtime", version = "38.0.0" }
datafusion-execution = { path = "datafusion/execution", version = "38.0.0" }
datafusion-expr = { path = "datafusion/expr", version = "38.0.0" }
datafusion-functions = { path = "datafusion/functions", version = "38.0.0" }
datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "38.0.0" }
datafusion-functions-array = { path = "datafusion/functions-array", version = "38.0.0" }
datafusion-optimizer = { path = "datafusion/optimizer", version = "38.0.0", default-features = false }
datafusion-physical-expr = { path = "datafusion/physical-expr", version = "38.0.0", default-features = false }
datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "38.0.0", default-features = false }
datafusion-physical-plan = { path = "datafusion/physical-plan", version = "38.0.0" }
datafusion-proto = { path = "datafusion/proto", version = "38.0.0" }
datafusion-proto-common = { path = "datafusion/proto-common", version = "38.0.0" }
datafusion-sql = { path = "datafusion/sql", version = "38.0.0" }
datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "38.0.0" }
datafusion-substrait = { path = "datafusion/substrait", version = "38.0.0" }
dashmap = "5.5.0"
datafusion = { path = "datafusion/core", version = "39.0.0", default-features = false }
datafusion-common = { path = "datafusion/common", version = "39.0.0", default-features = false }
datafusion-common-runtime = { path = "datafusion/common-runtime", version = "39.0.0" }
datafusion-execution = { path = "datafusion/execution", version = "39.0.0" }
datafusion-expr = { path = "datafusion/expr", version = "39.0.0" }
datafusion-functions = { path = "datafusion/functions", version = "39.0.0" }
datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "39.0.0" }
datafusion-functions-array = { path = "datafusion/functions-array", version = "39.0.0" }
datafusion-optimizer = { path = "datafusion/optimizer", version = "39.0.0", default-features = false }
datafusion-physical-expr = { path = "datafusion/physical-expr", version = "39.0.0", default-features = false }
datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "39.0.0", default-features = false }
datafusion-physical-plan = { path = "datafusion/physical-plan", version = "39.0.0" }
datafusion-proto = { path = "datafusion/proto", version = "39.0.0" }
datafusion-proto-common = { path = "datafusion/proto-common", version = "39.0.0" }
datafusion-sql = { path = "datafusion/sql", version = "39.0.0" }
datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "39.0.0" }
datafusion-substrait = { path = "datafusion/substrait", version = "39.0.0" }
doc-comment = "0.3"
env_logger = "0.11"
futures = "0.3"
half = { version = "2.2.1", default-features = false }
hashbrown = { version = "0.14", features = ["raw"] }
hashbrown = { version = "0.14.5", features = ["raw"] }
indexmap = "2.0.0"
itertools = "0.12"
log = "^0.4"
num_cpus = "1.13.0"
object_store = { version = "0.9.1", default-features = false }
object_store = { version = "0.10.1", default-features = false }
parking_lot = "0.12"
parquet = { version = "51.0.0", default-features = false, features = ["arrow", "async", "object_store"] }
parquet = { version = "52.0.0", default-features = false, features = [
"arrow",
"async",
"object_store",
] }
rand = "0.8"
regex = "1.8"
rstest = "0.19.0"
rstest = "0.21.0"
serde_json = "1"
sqlparser = { version = "0.45.0", features = ["visitor"] }
sqlparser = { version = "0.47", features = ["visitor"] }
tempfile = "3"
thiserror = "1.0.44"
tokio = { version = "1.36", features = ["macros", "rt", "sync"] }
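
For downstream projects, the substance of this hunk is a coordinated bump: the arrow/parquet crates move from 51.0.0 to 52.0.0, every DataFusion workspace crate from 38.0.0 to 39.0.0, and the MSRV from Rust 1.73 to 1.75. A minimal sketch of how a dependent project might check that its own pins follow the upgrade (illustrative only; it assumes `datafusion` and `arrow-array` are already in that project's dependency graph):

```shell
# Refresh the lockfile, then confirm no 38.x DataFusion or 51.x arrow crates
# are still being pulled in transitively.
cargo update
cargo tree -i datafusion | head -n 20
cargo tree -i arrow-array | head -n 20
```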
3 changes: 2 additions & 1 deletion benchmarks/.gitignore
@@ -1,2 +1,3 @@
data
results
results
venv
10 changes: 8 additions & 2 deletions benchmarks/README.md
@@ -67,6 +67,13 @@ Create / download a specific dataset (TPCH)

Data is placed in the `data` subdirectory.

## Select join algorithm
The benchmark runs with `prefer_hash_join == true` by default, which enforces the hash join algorithm.
To run the TPCH benchmarks with a join algorithm other than hash join:
```shell
PREFER_HASH_JOIN=false ./bench.sh run tpch
```

## Comparing performance of main and a branch

```shell
@@ -177,7 +184,6 @@ The benchmark program also supports CSV and Parquet input file formats and a uti
```bash
cargo run --release --bin tpch -- convert --input ./data --output /mnt/tpch-parquet --format parquet
```

Or if you want to verify and run all the queries in the benchmark, you can just run `cargo test`.

### Comparing results between runs
@@ -261,7 +267,7 @@ SUBCOMMANDS:

# Benchmarks

The output of `dfbench` help includes a descripion of each benchmark, which is reproducedd here for convenience
The output of `dfbench` help includes a description of each benchmark, which is reproduced here for convenience

## ClickBench

66 changes: 30 additions & 36 deletions benchmarks/bench.sh
@@ -36,6 +36,8 @@ DATAFUSION_DIR=${DATAFUSION_DIR:-$SCRIPT_DIR/..}
DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data}
#CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"}
CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --profile release-nonlto"} # for faster iterations
PREFER_HASH_JOIN=${PREFER_HASH_JOIN:-true}
VIRTUAL_ENV=${VIRTUAL_ENV:-$SCRIPT_DIR/venv}

usage() {
echo "
@@ -45,30 +47,32 @@ Usage:
$0 data [benchmark]
$0 run [benchmark]
$0 compare <branch1> <branch2>
$0 venv
**********
Examples:
**********
# Create the datasets for all benchmarks in $DATA_DIR
./bench.sh data
# Run the 'tpch' benchmark on the datafusion checkout in /source/arrow-datafusion
DATAFUSION_DIR=/source/arrow-datafusion ./bench.sh run tpch
# Run the 'tpch' benchmark on the datafusion checkout in /source/datafusion
DATAFUSION_DIR=/source/datafusion ./bench.sh run tpch
**********
* Commands
**********
data: Generates or downloads data needed for benchmarking
run: Runs the named benchmark
compare: Compares results from benchmark runs
venv: Creates new venv (unless already exists) and installs compare's requirements into it
**********
* Benchmarks
**********
all(default): Data/Run/Compare for all benchmarks
tpch: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table
tpch: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table, hash join
tpch_mem: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), query from memory
tpch10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single parquet file per table
tpch10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single parquet file per table, hash join
tpch_mem10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory
parquet: Benchmark of parquet reader's filtering speed
sort: Benchmark of sorting speed
@@ -79,10 +83,12 @@ clickbench_extended: ClickBench "inspired" queries against a single parquet (
**********
* Supported Configuration (Environment Variables)
**********
DATA_DIR directory to store datasets
CARGO_COMMAND command that runs the benchmark binary
DATAFUSION_DIR directory to use (default $DATAFUSION_DIR)
RESULTS_NAME folder where the benchmark files are stored
DATA_DIR directory to store datasets
CARGO_COMMAND command that runs the benchmark binary
DATAFUSION_DIR directory to use (default $DATAFUSION_DIR)
RESULTS_NAME folder where the benchmark files are stored
PREFER_HASH_JOIN Prefer hash join algorithm (default true)
VENV_PATH Python venv to use for compare and venv commands (default ./venv, override by <your-venv>/bin/activate)
"
exit 1
}
@@ -129,6 +135,7 @@ main() {
echo "BENCHMARK: ${BENCHMARK}"
echo "DATA_DIR: ${DATA_DIR}"
echo "CARGO_COMMAND: ${CARGO_COMMAND}"
echo "PREFER_HASH_JOIN: ${PREFER_HASH_JOIN}"
echo "***************************"
case "$BENCHMARK" in
all)
@@ -183,6 +190,7 @@ main() {
echo "DATA_DIR: ${DATA_DIR}"
echo "RESULTS_DIR: ${RESULTS_DIR}"
echo "CARGO_COMMAND: ${CARGO_COMMAND}"
echo "PREFER_HASH_JOIN": ${PREFER_HASH_JOIN}
echo "***************************"

# navigate to the appropriate directory
@@ -213,12 +221,6 @@ main() {
tpch_mem10)
run_tpch_mem "10"
;;
tpch_smj)
run_tpch_smj "1"
;;
tpch_smj10)
run_tpch_smj "10"
;;
parquet)
run_parquet
;;
@@ -245,6 +247,9 @@ main() {
compare)
compare_benchmarks "$ARG2" "$ARG3"
;;
venv)
setup_venv
;;
"")
usage
;;
@@ -321,22 +326,7 @@ run_tpch() {
RESULTS_FILE="${RESULTS_DIR}/tpch_sf${SCALE_FACTOR}.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running tpch benchmark..."
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --format parquet -o ${RESULTS_FILE}
}

# Runs the tpch benchmark with sort merge join
run_tpch_smj() {
SCALE_FACTOR=$1
if [ -z "$SCALE_FACTOR" ] ; then
echo "Internal error: Scale factor not specified"
exit 1
fi
TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}"

RESULTS_FILE="${RESULTS_DIR}/tpch_smj_sf${SCALE_FACTOR}.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running tpch SMJ benchmark..."
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join false --format parquet -o ${RESULTS_FILE}
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join ${PREFER_HASH_JOIN} --format parquet -o ${RESULTS_FILE}
}

# Runs the tpch in memory
@@ -352,23 +342,23 @@ run_tpch_mem() {
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running tpch_mem benchmark..."
# -m means in memory
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" -m --format parquet -o ${RESULTS_FILE}
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join ${PREFER_HASH_JOIN} -m --format parquet -o ${RESULTS_FILE}
}

# Runs the parquet filter benchmark
run_parquet() {
RESULTS_FILE="${RESULTS_DIR}/parquet.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running parquet filter benchmark..."
$CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o ${RESULTS_FILE}
$CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --prefer_hash_join ${PREFER_HASH_JOIN} --scale-factor 1.0 --iterations 5 -o ${RESULTS_FILE}
}

# Runs the sort benchmark
run_sort() {
RESULTS_FILE="${RESULTS_DIR}/sort.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running sort benchmark..."
$CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o ${RESULTS_FILE}
$CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --prefer_hash_join ${PREFER_HASH_JOIN} --scale-factor 1.0 --iterations 5 -o ${RESULTS_FILE}
}


@@ -422,7 +412,7 @@ run_clickbench_1() {
RESULTS_FILE="${RESULTS_DIR}/clickbench_1.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running clickbench (1 file) benchmark..."
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o ${RESULTS_FILE}
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o ${RESULTS_FILE}
}

# Runs the clickbench benchmark with the partitioned parquet files
@@ -441,7 +431,6 @@ run_clickbench_extended() {
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended.sql" -o ${RESULTS_FILE}
}


compare_benchmarks() {
BASE_RESULTS_DIR="${SCRIPT_DIR}/results"
BRANCH1="$1"
@@ -466,13 +455,18 @@ compare_benchmarks() {
echo "--------------------"
echo "Benchmark ${bench}"
echo "--------------------"
python3 "${SCRIPT_DIR}"/compare.py "${RESULTS_FILE1}" "${RESULTS_FILE2}"
PATH=$VIRTUAL_ENV/bin:$PATH python3 "${SCRIPT_DIR}"/compare.py "${RESULTS_FILE1}" "${RESULTS_FILE2}"
else
echo "Note: Skipping ${RESULTS_FILE1} as ${RESULTS_FILE2} does not exist"
fi
done

}

setup_venv() {
python3 -m venv $VIRTUAL_ENV
PATH=$VIRTUAL_ENV/bin:$PATH python3 -m pip install -r requirements.txt
}

# And start the process up
main
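
Taken together, the bench.sh changes replace the dedicated `tpch_smj`/`tpch_smj10` targets with a `PREFER_HASH_JOIN` switch and add a `venv` command plus a `VIRTUAL_ENV` path used by `compare`. A sketch of the resulting workflow (the branch names and the exact run sequence below are illustrative, not part of this commit):

```shell
# One-time: create ./venv next to bench.sh and install compare.py's requirements
./bench.sh venv

# Generate TPC-H SF1 data, then run the benchmark with hash join (the default)
# and again with sort-merge join via the new switch
./bench.sh data tpch
./bench.sh run tpch
PREFER_HASH_JOIN=false ./bench.sh run tpch

# Compare two previously recorded result sets; compare.py is resolved via ./venv
./bench.sh compare main my-branch
```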
2 changes: 1 addition & 1 deletion benchmarks/compare.py
@@ -29,7 +29,7 @@
from rich.console import Console
from rich.table import Table
except ImportError:
print("Try `pip install rich` for using this script.")
print("Couldn't import modules -- run `./bench.sh venv` first")
raise
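
The only change to compare.py is the hint printed when its optional dependencies are missing. Roughly what a user would now see when running it outside the venv (paths are illustrative, and the traceback assumes `rich` is not installed in the active interpreter):

```shell
python3 benchmarks/compare.py results/main/tpch_sf1.json results/mybranch/tpch_sf1.json
# Couldn't import modules -- run `./bench.sh venv` first
# Traceback (most recent call last):
#   ...
# ModuleNotFoundError: No module named 'rich'
```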


12 changes: 1 addition & 11 deletions .github_changelog_generator → benchmarks/requirements.txt
@@ -1,5 +1,3 @@
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
@@ -16,13 +14,5 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

# some issues are just documentation
add-sections={"documentation":{"prefix":"**Documentation updates:**","labels":["documentation"]},"performance":{"prefix":"**Performance improvements:**","labels":["performance"]}}
# uncomment to not show PRs. TBD if we shown them or not.
#pull-requests=false
# so that the component is shown associated with the issue
issue-line-labels=sql
exclude-labels=development-process,invalid
breaking-labels=api change
rich