rapidsai · rapids-bot · Aug 9, 2024 · Jul 29, 2024 · Jul 29, 2024 · Jul 29, 2024
diff --git a/.gitignore b/.gitignore
@@ -79,6 +79,8 @@ Debug
 build/
 cpp/build/
 cpp/examples/*/install/
+cpp/examples/*/build/
+cpp/examples/tpch/datagen/datafusion
 cpp/include/cudf/ipc_generated/*.h
 cpp/thirdparty/googletest/
 

@@ -1,38 +1,39 @@
-# TPC-H Inspired Examples
+# TPC-H Derived Examples
 
 Implements TPC-H queries using `libcudf`. We leverage the data generator (wrapper around official TPC-H datagen) from [Apache Datafusion](https://github.com/apache/datafusion) for generating data in Parquet format.
 
 ## Requirements
 
 - Rust
+- [libcudf](https://github.com/rapidsai/cudf/blob/branch-24.08/CONTRIBUTING.md#setting-up-your-build-environment)
 
-## Generating the Dataset
+## Running Queries
 
-1. Clone the datafusion repository.
+1. Build the `libcudf` examples.
 ```bash
-git clone [email protected]:apache/datafusion.git
+cd cudf/cpp/examples
+./build.sh
 ```
+The TPC-H query binaries are built inside `tpch/build`.
 
-2. Run the data generator. The data will be placed in a `data/` subdirectory.
+2. Generate the dataset.
 ```bash
-cd datafusion/benchmarks/
-./bench.sh data tpch
-
-# for scale factor 10,
-./bench.sh data tpch10
+cd tpch/datagen
+./datagen.sh [scale factor (1/10)]
 ```
 
-## Running Queries
+The parquet files will be generated in `tpch/datagen/datafusion/benchmarks/data/tpch_sf[scale factor]`.
 
-1. Build the examples.
+3. Set these environment variables for optimized runtimes.
 ```bash
-cd cpp/examples
-./build.sh
+export KVIKIO_COMPAT_MODE="on"
+export LIBCUDF_CUFILE_POLICY="KVIKIO"
+export CUDA_MODULE_LOADING="EAGER"
 ```
-The TPC-H query binaries would be built inside `examples/tpch/build`.
 
-2. Execute the queries.
+4. Execute the queries.
 ```bash
-./tpch/build/tpch_q1
+./tpch/build/tpch_q[query no] [path to dataset] [memory resource type (cuda/pool/managed/managed_pool)]
 ```
-A parquet file named `q1.parquet` would be generated holding the results of the query.
+
+A parquet file named `q[query no].parquet` is generated, containing the results of the query.
@@ -0,0 +1,60 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import os
+import sys
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+import pandas as pd
+
+# Script entry point: rewrite each TPC-H table's parquet file under the
+# dataset directory passed as the first command-line argument. Selected
+# columns are cast to int64 / float64 and every file is written back to the
+# same path with snappy compression.
+if __name__ == "__main__":
+    # sys.argv[1] is required; running without it raises IndexError.
+    dataset_path = str(sys.argv[1])
+    tables = ["lineitem", "part", "partsupp", "orders", "supplier", "customer", "nation", "region"]
+    for table in tables:
+        # Each table is expected as a single file named <table>.parquet.
+        filepath = os.path.join(dataset_path, f"{table}.parquet")
+        print("Reading file ", filepath)
+
+        # The branches below are selected by file name; each one loads the
+        # file into pandas, casts its columns, and overwrites the file.
+        if filepath.endswith("lineitem.parquet"):
+            df = pd.read_parquet(filepath)
+            df["l_linenumber"] = df["l_linenumber"].astype("int64")
+            # NOTE(review): presumably l_quantity holds whole numbers in the
+            # generated data, so the int64 cast is lossless - confirm.
+            df["l_quantity"] = df["l_quantity"].astype("int64")
+            df["l_extendedprice"] = df["l_extendedprice"].astype("float64")
+            df["l_discount"] = df["l_discount"].astype("float64")
+            df["l_tax"] = df["l_tax"].astype("float64")
+            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
+
+        # "partsupp.parquet" does not end with "part.parquet", so this
+        # branch only matches the part table.
+        elif filepath.endswith("part.parquet"):
+            df = pd.read_parquet(filepath)
+            df["p_size"] = df["p_size"].astype("int64")
+            df["p_retailprice"] = df["p_retailprice"].astype("float64")
+            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
+
+        elif filepath.endswith("partsupp.parquet"):
+            df = pd.read_parquet(filepath)
+            df["ps_availqty"] = df["ps_availqty"].astype("int64")
+            df["ps_supplycost"] = df["ps_supplycost"].astype("float64")
+            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
+
+        elif filepath.endswith("orders.parquet"):
+            df = pd.read_parquet(filepath)
+            df["o_totalprice"] = df["o_totalprice"].astype("float64")
+            df["o_shippriority"] = df["o_shippriority"].astype("int64")
+            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
+
+        elif filepath.endswith("supplier.parquet"):
+            df = pd.read_parquet(filepath)
+            df["s_acctbal"] = df["s_acctbal"].astype("float64")
+            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
+
+        elif filepath.endswith("customer.parquet"):
+            df = pd.read_parquet(filepath)
+            df["c_acctbal"] = df["c_acctbal"].astype("float64")
+            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
+
+        # nation and region get no column casts; they are only round-tripped
+        # through pandas and rewritten with snappy compression.
+        elif filepath.endswith("nation.parquet"):
+            df = pd.read_parquet(filepath)
+            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
+
+        elif filepath.endswith("region.parquet"):
+            df = pd.read_parquet(filepath)
+            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
@@ -0,0 +1,31 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+# Usage: ./datagen.sh <scale factor>
+#
+# Clones datafusion into the current directory, patches it, generates the
+# TPC-H parquet data for the requested scale factor, and finally runs
+# correct_datatypes.py on the generated files.
+
+# Abort on the first failing command
+set -e
+
+scale_factor=$1
+# tpch.patch and correct_datatypes.py are looked up in this directory, so
+# the script has to be invoked from the directory that contains them
+script_dir=$(pwd)
+
+# Clone the datafusion repository and apply a patch
+# for single threaded data generation so that a
+# single parquet file is generated for each table
+# (any existing datafusion checkout is deleted first)
+rm -rf datafusion
+git clone https://github.com/apache/datafusion.git datafusion
+cd datafusion/
+# Pinned commit; presumably the one tpch.patch was written against
+git checkout 679a85f
+git apply ${script_dir}/tpch.patch
+cd benchmarks/
+
+# Generate the data
+# Currently, we support only scale factor 1 and 10
+if [ ${scale_factor} -eq 1 ]; then
+    ./bench.sh data tpch
+elif [ ${scale_factor} -eq 10 ]; then
+    ./bench.sh data tpch10
+else
+    echo "Unsupported scale factor"
+    exit 1
+fi
+
+# Correct the datatypes of the parquet files
+# (the data path is relative to datafusion/benchmarks, the current directory)
+python3 ${script_dir}/correct_datatypes.py data/tpch_sf${scale_factor}
@@ -0,0 +1,33 @@
+diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
+index 3b854f6dc..f000f09c0 100755
+--- a/benchmarks/bench.sh
++++ b/benchmarks/bench.sh
+@@ -311,6 +311,15 @@ data_tpch() {
+         $CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}" --format parquet
+         popd > /dev/null
+     fi
++
++    cp ${TPCH_DIR}/lineitem/part-0.parquet ${TPCH_DIR}/lineitem.parquet
++    cp ${TPCH_DIR}/orders/part-0.parquet ${TPCH_DIR}/orders.parquet
++    cp ${TPCH_DIR}/part/part-0.parquet ${TPCH_DIR}/part.parquet
++    cp ${TPCH_DIR}/partsupp/part-0.parquet ${TPCH_DIR}/partsupp.parquet
++    cp ${TPCH_DIR}/customer/part-0.parquet ${TPCH_DIR}/customer.parquet
++    cp ${TPCH_DIR}/supplier/part-0.parquet ${TPCH_DIR}/supplier.parquet
++    cp ${TPCH_DIR}/nation/part-0.parquet ${TPCH_DIR}/nation.parquet
++    cp ${TPCH_DIR}/region/part-0.parquet ${TPCH_DIR}/region.parquet
+ }
+
+ # Runs the tpch benchmark
+diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
+index b5204b343..84fd2e78d 100644
+--- a/datafusion/common/src/config.rs
++++ b/datafusion/common/src/config.rs
+@@ -250,7 +250,7 @@ config_namespace! {
+         /// concurrency.
+         ///
+         /// Defaults to the number of CPU cores on the system
+-        pub target_partitions: usize, default = num_cpus::get()
++        pub target_partitions: usize, default = 1
+
+         /// The default time zone
+         ///