From 98c17d078abdb0b0df2a7d91cbfbe3082d13f3d0 Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 8 Jan 2024 11:13:10 -0800 Subject: [PATCH 1/7] write cudf version in parquet created_by metadata field --- cpp/CMakeLists.txt | 8 +++++++- cpp/src/io/parquet/writer_impl.cu | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a7c34ca489c..cb1fdb1f557 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -658,6 +658,12 @@ set_source_files_properties( PROPERTIES COMPILE_DEFINITIONS "_FILE_OFFSET_BITS=64" ) +set_property( + SOURCE src/io/parquet/writer_impl.cu + APPEND + PROPERTY COMPILE_DEFINITIONS "CUDF_VERSION=${PROJECT_VERSION}" +) + set_target_properties( cudf PROPERTIES BUILD_RPATH "\$ORIGIN" diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index c452f632cd6..bd6c5ec4b93 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -171,7 +171,7 @@ struct aggregate_writer_metadata { std::vector> column_indexes; }; std::vector files; - std::string created_by = ""; + std::string created_by = "CUDF Version " CUDF_STRINGIFY(CUDF_VERSION); thrust::optional> column_orders = thrust::nullopt; }; From b6dae259b2fb591bfcbafa8c29ae2a360c850809 Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 8 Jan 2024 11:18:29 -0800 Subject: [PATCH 2/7] lowercase cu --- cpp/src/io/parquet/writer_impl.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index bd6c5ec4b93..50532372b40 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -171,7 +171,7 @@ struct aggregate_writer_metadata { std::vector> column_indexes; }; std::vector files; - std::string created_by = "CUDF Version " CUDF_STRINGIFY(CUDF_VERSION); + std::string created_by = "cuDF Version " CUDF_STRINGIFY(CUDF_VERSION); thrust::optional> column_orders = thrust::nullopt; }; From 3be8bc1f2fb435cf993b80cdb3e13f2654f90b25 Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 8 Jan 2024 13:43:07 -0800 Subject: [PATCH 3/7] use lower case to match other parquet implementations --- cpp/src/io/parquet/writer_impl.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 50532372b40..ae1fe7b3594 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -171,7 +171,7 @@ struct aggregate_writer_metadata { std::vector> column_indexes; }; std::vector files; - std::string created_by = "cuDF Version " CUDF_STRINGIFY(CUDF_VERSION); + std::string created_by = "libcudf version " CUDF_STRINGIFY(CUDF_VERSION); thrust::optional> column_orders = thrust::nullopt; }; From 275adbcd2de86301b1443735b297c2ef7c1c4d7c Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 9 Jan 2024 09:23:47 -0800 Subject: [PATCH 4/7] address review comments --- cpp/src/io/parquet/writer_impl.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index ae1fe7b3594..ee6f2f39dff 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -171,7 +171,7 @@ struct aggregate_writer_metadata { std::vector> column_indexes; }; std::vector files; - std::string created_by = "libcudf version " CUDF_STRINGIFY(CUDF_VERSION); + std::string const created_by = "cudf version " CUDF_STRINGIFY(CUDF_VERSION); thrust::optional> column_orders = thrust::nullopt; }; From 79a2aba15dfc3e2d585af87ffca075193e089200 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 9 Jan 2024 10:19:52 -0800 Subject: [PATCH 5/7] remove created_by member from aggregate_metadata --- cpp/src/io/parquet/writer_impl.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index ee6f2f39dff..8ba148ff287 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -108,7 +108,7 @@ struct aggregate_writer_metadata { meta.num_rows = this->files[part].num_rows; meta.row_groups = this->files[part].row_groups; meta.key_value_metadata = this->files[part].key_value_metadata; - meta.created_by = this->created_by; + meta.created_by = "cudf version " CUDF_STRINGIFY(CUDF_VERSION); meta.column_orders = this->column_orders; return meta; } @@ -171,7 +171,6 @@ struct aggregate_writer_metadata { std::vector> column_indexes; }; std::vector files; - std::string const created_by = "cudf version " CUDF_STRINGIFY(CUDF_VERSION); thrust::optional> column_orders = thrust::nullopt; }; From c7a4ef43236f3b4dd8b5f76e5bd736d655a42ca8 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 9 Jan 2024 12:22:50 -0800 Subject: [PATCH 6/7] make sure CUDF_VERSION is defined --- cpp/src/io/parquet/writer_impl.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 8ba148ff287..efd7d711a5e 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -99,6 +99,10 @@ struct aggregate_writer_metadata { } } +#ifndef CUDF_VERSION +#error "CUDF_VERSION is not defined" +#endif + FileMetaData get_metadata(size_t part) { CUDF_EXPECTS(part < files.size(), "Invalid part index queried"); From 16caa90484f1bf6b27c3882a2082ef5bd581e5cb Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 9 Jan 2024 12:46:31 -0800 Subject: [PATCH 7/7] move check --- cpp/src/io/parquet/writer_impl.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index efd7d711a5e..279a814a4e1 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -55,6 +55,10 @@ #include #include +#ifndef CUDF_VERSION +#error "CUDF_VERSION is not defined" +#endif + namespace cudf::io::parquet::detail { using namespace cudf::io::detail; @@ -99,10 +103,6 @@ struct aggregate_writer_metadata { } } -#ifndef CUDF_VERSION -#error "CUDF_VERSION is not defined" -#endif - FileMetaData get_metadata(size_t part) { CUDF_EXPECTS(part < files.size(), "Invalid part index queried");