diff --git a/cpp/include/cudf/io/avro.hpp b/cpp/include/cudf/io/avro.hpp index 0e00d14291d..17c168f38d4 100644 --- a/cpp/include/cudf/io/avro.hpp +++ b/cpp/include/cudf/io/avro.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,9 +35,6 @@ namespace io { * @file */ -/** - * @brief Builder to build options for `read_avro()`. - */ class avro_reader_options_builder; /** @@ -57,7 +54,7 @@ class avro_reader_options { /** * @brief Constructor from source info. * - * @param src source information used to read avro file. + * @param src source information used to read avro file */ explicit avro_reader_options(source_info const& src) : _source(src) {} @@ -73,54 +70,65 @@ class avro_reader_options { /** * @brief Returns source info. + * + * @return Source info */ [[nodiscard]] source_info const& get_source() const { return _source; } /** * @brief Returns names of the columns to be read. + * + * @return Names of the columns to be read */ [[nodiscard]] std::vector get_columns() const { return _columns; } /** * @brief Returns number of rows to skip from the start. + * + * @return Number of rows to skip from the start */ [[nodiscard]] size_type get_skip_rows() const { return _skip_rows; } /** * @brief Returns number of rows to read. + * + * @return Number of rows to read */ [[nodiscard]] size_type get_num_rows() const { return _num_rows; } /** * @brief Set names of the column to be read. * - * @param col_names Vector of column names. + * @param col_names Vector of column names */ void set_columns(std::vector col_names) { _columns = std::move(col_names); } /** * @brief Sets number of rows to skip. * - * @param val Number of rows to skip from start. 
+ * @param val Number of rows to skip from start */ void set_skip_rows(size_type val) { _skip_rows = val; } /** * @brief Sets number of rows to read. * - * @param val Number of rows to read after skip. + * @param val Number of rows to read after skip */ void set_num_rows(size_type val) { _num_rows = val; } /** * @brief create avro_reader_options_builder which will build avro_reader_options. * - * @param src source information used to read avro file. - * @returns builder to build reader options. + * @param src source information used to read avro file + * @returns builder to build reader options */ static avro_reader_options_builder builder(source_info const& src); }; +/** + * @brief Builder to build options for `read_avro()`. + */ class avro_reader_options_builder { avro_reader_options options; @@ -135,15 +143,15 @@ class avro_reader_options_builder { /** * @brief Constructor from source info. * - * @param src The source information used to read avro file. + * @param src The source information used to read avro file */ explicit avro_reader_options_builder(source_info const& src) : options(src) {} /** * @brief Set names of the column to be read. * - * @param col_names Vector of column names. - * @return this for chaining. + * @param col_names Vector of column names + * @return this for chaining */ avro_reader_options_builder& columns(std::vector col_names) { @@ -154,8 +162,8 @@ class avro_reader_options_builder { /** * @brief Sets number of rows to skip. * - * @param val Number of rows to skip from start. - * @return this for chaining. + * @param val Number of rows to skip from start + * @return this for chaining */ avro_reader_options_builder& skip_rows(size_type val) { @@ -166,8 +174,8 @@ class avro_reader_options_builder { /** * @brief Sets number of rows to read. * - * @param val Number of rows to read after skip. - * @return this for chaining. 
+ * @param val Number of rows to read after skip + * @return this for chaining */ avro_reader_options_builder& num_rows(size_type val) { @@ -184,6 +192,8 @@ class avro_reader_options_builder { * @brief move avro_reader_options member once it's built. * * This has been added since Cython does not support overloading of conversion operators. + * + * @return Built `avro_reader_options` object's r-value reference */ avro_reader_options&& build() { return std::move(options); } }; @@ -198,11 +208,11 @@ class avro_reader_options_builder { * auto result = cudf::io::read_avro(options); * @endcode * - * @param options Settings for controlling reading behavior. - * @param mr Device memory resource used to allocate device memory of the table in the returned. + * @param options Settings for controlling reading behavior + * @param mr Device memory resource used to allocate device memory of the table in the returned * table_with_metadata * - * @return The set of columns along with metadata. + * @return The set of columns along with metadata */ table_with_metadata read_avro( avro_reader_options const& options, diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index 44ede9b0d63..f43952c7153 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -134,7 +134,7 @@ class csv_reader_options { /** * @brief Constructor from source info. * - * @param src source information used to read csv file. + * @param src source information used to read csv file */ explicit csv_reader_options(source_info const& src) : _source(src) {} @@ -151,33 +151,43 @@ class csv_reader_options { /** * @brief Creates a `csv_reader_options_builder` which will build `csv_reader_options`. 
* - * @param src Source information to read csv file. - * @return Builder to build reader options. + * @param src Source information to read csv file + * @return Builder to build reader options */ static csv_reader_options_builder builder(source_info const& src); /** * @brief Returns source info. + * + * @return Source info */ [[nodiscard]] source_info const& get_source() const { return _source; } /** * @brief Returns compression format of the source. + * + * @return Compression format of the source */ [[nodiscard]] compression_type get_compression() const { return _compression; } /** * @brief Returns number of bytes to skip from source start. + * + * @return Number of bytes to skip from source start */ [[nodiscard]] std::size_t get_byte_range_offset() const { return _byte_range_offset; } /** * @brief Returns number of bytes to read. + * + * @return Number of bytes to read */ [[nodiscard]] std::size_t get_byte_range_size() const { return _byte_range_size; } /** * @brief Returns number of bytes to read with padding. + * + * @return Number of bytes to read with padding */ [[nodiscard]] std::size_t get_byte_range_size_with_padding() const { @@ -190,6 +200,8 @@ class csv_reader_options { /** * @brief Returns number of bytes to pad when reading. + * + * @return Number of bytes to pad when reading */ [[nodiscard]] std::size_t get_byte_range_padding() const { @@ -212,21 +224,29 @@ class csv_reader_options { /** * @brief Returns names of the columns. + * + * @return Names of the columns */ [[nodiscard]] std::vector const& get_names() const { return _names; } /** * @brief Returns prefix to be used for column ID. + * + * @return Prefix to be used for column ID */ [[nodiscard]] std::string get_prefix() const { return _prefix; } /** * @brief Whether to rename duplicate column names. 
+ * + * @return `true` if duplicate column names are renamed */ [[nodiscard]] bool is_enabled_mangle_dupe_cols() const { return _mangle_dupe_cols; } /** * @brief Returns names of the columns to be read. + * + * @return Names of the columns to be read */ [[nodiscard]] std::vector const& get_use_cols_names() const { @@ -235,91 +255,127 @@ class csv_reader_options { /** * @brief Returns indexes of columns to read. + * + * @return Indexes of columns to read */ [[nodiscard]] std::vector const& get_use_cols_indexes() const { return _use_cols_indexes; } /** * @brief Returns number of rows to read. + * + * @return Number of rows to read */ [[nodiscard]] size_type get_nrows() const { return _nrows; } /** * @brief Returns number of rows to skip from start. + * + * @return Number of rows to skip from start */ [[nodiscard]] size_type get_skiprows() const { return _skiprows; } /** * @brief Returns number of rows to skip from end. + * + * @return Number of rows to skip from end */ [[nodiscard]] size_type get_skipfooter() const { return _skipfooter; } /** * @brief Returns header row index. + * + * @return Header row index */ [[nodiscard]] size_type get_header() const { return _header; } /** * @brief Returns line terminator. + * + * @return Line terminator */ [[nodiscard]] char get_lineterminator() const { return _lineterminator; } /** * @brief Returns field delimiter. + * + * @return Field delimiter */ [[nodiscard]] char get_delimiter() const { return _delimiter; } /** * @brief Returns numeric data thousands separator. + * + * @return Numeric data thousands separator */ [[nodiscard]] char get_thousands() const { return _thousands; } /** * @brief Returns decimal point character. + * + * @return Decimal point character */ [[nodiscard]] char get_decimal() const { return _decimal; } /** * @brief Returns comment line start character. 
+ * + * @return Comment line start character */ [[nodiscard]] char get_comment() const { return _comment; } /** * @brief Whether to treat `\r\n` as line terminator. + * + * @return `true` if `\r\n` is treated as line terminator */ [[nodiscard]] bool is_enabled_windowslinetermination() const { return _windowslinetermination; } /** * @brief Whether to treat whitespace as field delimiter. + * + * @return `true` if whitespace is treated as field delimiter */ [[nodiscard]] bool is_enabled_delim_whitespace() const { return _delim_whitespace; } /** * @brief Whether to skip whitespace after the delimiter. + * + * @return `true` if whitespace is skipped after the delimiter */ [[nodiscard]] bool is_enabled_skipinitialspace() const { return _skipinitialspace; } /** * @brief Whether to ignore empty lines or parse line values as invalid. + * + * @return `true` if empty lines are ignored, `false` if their line values are parsed as invalid */ [[nodiscard]] bool is_enabled_skip_blank_lines() const { return _skip_blank_lines; } /** * @brief Returns quoting style. + * + * @return Quoting style */ [[nodiscard]] quote_style get_quoting() const { return _quoting; } /** * @brief Returns quoting character. + * + * @return Quoting character */ [[nodiscard]] char get_quotechar() const { return _quotechar; } /** * @brief Whether a quote inside a value is double-quoted. + * + * @return `true` if a quote inside a value is double-quoted */ [[nodiscard]] bool is_enabled_doublequote() const { return _doublequote; } /** * @brief Returns names of columns to read as datetime. + * + * @return Names of columns to read as datetime */ [[nodiscard]] std::vector const& get_parse_dates_names() const { @@ -328,6 +384,8 @@ class csv_reader_options { /** * @brief Returns indexes of columns to read as datetime. 
+ * + * @return Indexes of columns to read as datetime */ [[nodiscard]] std::vector const& get_parse_dates_indexes() const { @@ -336,6 +394,8 @@ class csv_reader_options { /** * @brief Returns names of columns to read as hexadecimal. + * + * @return Names of columns to read as hexadecimal */ [[nodiscard]] std::vector const& get_parse_hex_names() const { @@ -344,11 +404,15 @@ class csv_reader_options { /** * @brief Returns indexes of columns to read as hexadecimal. + * + * @return Indexes of columns to read as hexadecimal */ [[nodiscard]] std::vector const& get_parse_hex_indexes() const { return _parse_hex_indexes; } /** * @brief Returns per-column types. + * + * @return Per-column types */ std::variant, std::map> const& get_dtypes() const { @@ -357,50 +421,64 @@ class csv_reader_options { /** * @brief Returns additional values to recognize as boolean true values. + * + * @return Additional values to recognize as boolean true values */ std::vector const& get_true_values() const { return _true_values; } /** * @brief Returns additional values to recognize as boolean false values. + * + * @return Additional values to recognize as boolean false values */ std::vector const& get_false_values() const { return _false_values; } /** * @brief Returns additional values to recognize as null values. + * + * @return Additional values to recognize as null values */ std::vector const& get_na_values() const { return _na_values; } /** * @brief Whether to keep the built-in default NA values. + * + * @return `true` if the built-in default NA values are kept */ bool is_enabled_keep_default_na() const { return _keep_default_na; } /** * @brief Whether to disable null filter. + * + * @return `true` if null filter is enabled */ bool is_enabled_na_filter() const { return _na_filter; } /** * @brief Whether to parse dates as DD/MM versus MM/DD. 
+ * + * @return `true` if dates are parsed as DD/MM, `false` if MM/DD */ bool is_enabled_dayfirst() const { return _dayfirst; } /** * @brief Returns timestamp_type to which all timestamp columns will be cast. + * + * @return timestamp_type to which all timestamp columns will be cast */ data_type get_timestamp_type() const { return _timestamp_type; } /** * @brief Sets compression format of the source. * - * @param comp Compression type. + * @param comp Compression type */ void set_compression(compression_type comp) { _compression = comp; } /** * @brief Sets number of bytes to skip from source start. * - * @param offset Number of bytes of offset. + * @param offset Number of bytes of offset */ void set_byte_range_offset(std::size_t offset) { @@ -415,7 +493,7 @@ class csv_reader_options { /** * @brief Sets number of bytes to read. * - * @param size Number of bytes to read. + * @param size Number of bytes to read */ void set_byte_range_size(std::size_t size) { @@ -430,28 +508,28 @@ class csv_reader_options { /** * @brief Sets names of the column. * - * @param col_names Vector of column names. + * @param col_names Vector of column names */ void set_names(std::vector col_names) { _names = std::move(col_names); } /** * @brief Sets prefix to be used for column ID. * - * @param pfx String used as prefix in for each column name. + * @param pfx String used as prefix for each column name */ void set_prefix(std::string pfx) { _prefix = pfx; } /** * @brief Sets whether to rename duplicate column names. * - * @param val Boolean value to enable/disable. + * @param val Boolean value to enable/disable */ void enable_mangle_dupe_cols(bool val) { _mangle_dupe_cols = val; } /** * @brief Sets names of the columns to be read. * - * @param col_names Vector of column names that are needed. + * @param col_names Vector of column names that are needed */ void set_use_cols_names(std::vector col_names) { @@ -461,7 +539,7 @@ class csv_reader_options { /** * @brief Sets indexes of columns to read. 
* - * @param col_indices Vector of column indices that are needed. + * @param col_indices Vector of column indices that are needed */ void set_use_cols_indexes(std::vector col_indices) { @@ -471,7 +549,7 @@ class csv_reader_options { /** * @brief Sets number of rows to read. * - * @param nrows Number of rows to read. + * @param nrows Number of rows to read */ void set_nrows(size_type nrows) { @@ -487,7 +565,7 @@ class csv_reader_options { /** * @brief Sets number of rows to skip from start. * - * @param skip Number of rows to skip. + * @param skip Number of rows to skip */ void set_skiprows(size_type skip) { @@ -501,7 +579,7 @@ class csv_reader_options { /** * @brief Sets number of rows to skip from end. * - * @param skip Number of rows to skip. + * @param skip Number of rows to skip */ void set_skipfooter(size_type skip) { @@ -517,98 +595,98 @@ class csv_reader_options { /** * @brief Sets header row index. * - * @param hdr Index where header row is located. + * @param hdr Index where header row is located */ void set_header(size_type hdr) { _header = hdr; } /** * @brief Sets line terminator * - * @param term A character to indicate line termination. + * @param term A character to indicate line termination */ void set_lineterminator(char term) { _lineterminator = term; } /** * @brief Sets field delimiter. * - * @param delim A character to indicate delimiter. + * @param delim A character to indicate delimiter */ void set_delimiter(char delim) { _delimiter = delim; } /** * @brief Sets numeric data thousands separator. * - * @param val A character that separates thousands. + * @param val A character that separates thousands */ void set_thousands(char val) { _thousands = val; } /** * @brief Sets decimal point character. * - * @param val A character that indicates decimal values. + * @param val A character that indicates decimal values */ void set_decimal(char val) { _decimal = val; } /** * @brief Sets comment line start character. 
* - * @param val A character that indicates comment. + * @param val A character that indicates comment */ void set_comment(char val) { _comment = val; } /** * @brief Sets whether to treat `\r\n` as line terminator. * - * @param val Boolean value to enable/disable. + * @param val Boolean value to enable/disable */ void enable_windowslinetermination(bool val) { _windowslinetermination = val; } /** * @brief Sets whether to treat whitespace as field delimiter. * - * @param val Boolean value to enable/disable. + * @param val Boolean value to enable/disable */ void enable_delim_whitespace(bool val) { _delim_whitespace = val; } /** * @brief Sets whether to skip whitespace after the delimiter. * - * @param val Boolean value to enable/disable. + * @param val Boolean value to enable/disable */ void enable_skipinitialspace(bool val) { _skipinitialspace = val; } /** * @brief Sets whether to ignore empty lines or parse line values as invalid. * - * @param val Boolean value to enable/disable. + * @param val Boolean value to enable/disable */ void enable_skip_blank_lines(bool val) { _skip_blank_lines = val; } /** * @brief Sets quoting style. * - * @param style Quoting style used. + * @param style Quoting style used */ void set_quoting(quote_style style) { _quoting = style; } /** * @brief Sets quoting character. * - * @param ch A character to indicate quoting. + * @param ch A character to indicate quoting */ void set_quotechar(char ch) { _quotechar = ch; } /** * @brief Sets a quote inside a value is double-quoted. * - * @param val Boolean value to enable/disable. + * @param val Boolean value to enable/disable */ void enable_doublequote(bool val) { _doublequote = val; } /** * @brief Sets names of columns to read as datetime. * - * @param col_names Vector of column names to infer as datetime. 
+ * @param col_names Vector of column names to infer as datetime */ void set_parse_dates(std::vector col_names) { @@ -618,7 +696,7 @@ class csv_reader_options { /** * @brief Sets indexes of columns to read as datetime. * - * @param col_indices Vector of column indices to infer as datetime. + * @param col_indices Vector of column indices to infer as datetime */ void set_parse_dates(std::vector col_indices) { @@ -652,14 +730,14 @@ class csv_reader_options { /** * @brief Sets per-column types * - * @param types Vector specifying the columns' target data types. + * @param types Vector specifying the columns' target data types */ void set_dtypes(std::vector types) { _dtypes = std::move(types); } /** * @brief Sets additional values to recognize as boolean true values. * - * @param vals Vector of values to be considered to be `true`. + * @param vals Vector of values to be considered to be `true` */ void set_true_values(std::vector vals) { @@ -669,7 +747,7 @@ class csv_reader_options { /** * @brief Sets additional values to recognize as boolean false values. * - * @param vals Vector of values to be considered to be `false`. + * @param vals Vector of values to be considered to be `false` */ void set_false_values(std::vector vals) { @@ -679,7 +757,7 @@ class csv_reader_options { /** * @brief Sets additional values to recognize as null values. * - * @param vals Vector of values to be considered to be null. + * @param vals Vector of values to be considered to be null */ void set_na_values(std::vector vals) { @@ -693,14 +771,14 @@ class csv_reader_options { /** * @brief Sets whether to keep the built-in default NA values. * - * @param val Boolean value to enable/disable. + * @param val Boolean value to enable/disable */ void enable_keep_default_na(bool val) { _keep_default_na = val; } /** * @brief Sets whether to disable null filter. * - * @param val Boolean value to enable/disable. 
+ * @param val Boolean value to enable/disable */ void enable_na_filter(bool val) { @@ -711,20 +789,24 @@ class csv_reader_options { /** * @brief Sets whether to parse dates as DD/MM versus MM/DD. * - * @param val Boolean value to enable/disable. + * @param val Boolean value to enable/disable */ void enable_dayfirst(bool val) { _dayfirst = val; } /** * @brief Sets timestamp_type to which all timestamp columns will be cast. * - * @param type Dtype to which all timestamp column will be cast. + * @param type Dtype to which all timestamp column will be cast */ void set_timestamp_type(data_type type) { _timestamp_type = type; } }; +/** + * @brief Builder to build options for `read_csv()`. + * + */ class csv_reader_options_builder { - csv_reader_options options; + csv_reader_options options; ///< Options to be built. public: /** @@ -737,15 +819,15 @@ class csv_reader_options_builder { /** * @brief Constructor from source info. * - * @param src The source information used to read csv file. + * @param src The source information used to read csv file */ csv_reader_options_builder(source_info const& src) : options(src) {} /** * @brief Sets compression format of the source. * - * @param comp Compression type. - * @return this for chaining. + * @param comp Compression type + * @return this for chaining */ csv_reader_options_builder& compression(compression_type comp) { @@ -756,8 +838,8 @@ class csv_reader_options_builder { /** * @brief Sets number of bytes to skip from source start. * - * @param offset Number of bytes of offset. - * @return this for chaining. + * @param offset Number of bytes of offset + * @return this for chaining */ csv_reader_options_builder& byte_range_offset(std::size_t offset) { @@ -768,8 +850,8 @@ class csv_reader_options_builder { /** * @brief Sets number of bytes to read. * - * @param size Number of bytes to read. - * @return this for chaining. 
+ * @param size Number of bytes to read + * @return this for chaining */ csv_reader_options_builder& byte_range_size(std::size_t size) { @@ -780,8 +862,8 @@ class csv_reader_options_builder { /** * @brief Sets names of the column. * - * @param col_names Vector of column names. - * @return this for chaining. + * @param col_names Vector of column names + * @return this for chaining */ csv_reader_options_builder& names(std::vector col_names) { @@ -792,8 +874,8 @@ class csv_reader_options_builder { /** * @brief Sets prefix to be used for column ID. * - * @param pfx String used as prefix in for each column name. - * @return this for chaining. + * @param pfx String used as prefix for each column name + * @return this for chaining */ csv_reader_options_builder& prefix(std::string pfx) { @@ -804,8 +886,8 @@ class csv_reader_options_builder { /** * @brief Sets whether to rename duplicate column names. * - * @param val Boolean value to enable/disable. - * @return this for chaining. + * @param val Boolean value to enable/disable + * @return this for chaining */ csv_reader_options_builder& mangle_dupe_cols(bool val) { @@ -816,8 +898,8 @@ class csv_reader_options_builder { /** * @brief Sets names of the columns to be read. * - * @param col_names Vector of column names that are needed. - * @return this for chaining. + * @param col_names Vector of column names that are needed + * @return this for chaining */ csv_reader_options_builder& use_cols_names(std::vector col_names) { @@ -828,8 +910,8 @@ class csv_reader_options_builder { /** * @brief Sets indexes of columns to read. * - * @param col_indices Vector of column indices that are needed. - * @return this for chaining. + * @param col_indices Vector of column indices that are needed + * @return this for chaining */ csv_reader_options_builder& use_cols_indexes(std::vector col_indices) { @@ -840,8 +922,8 @@ class csv_reader_options_builder { /** * @brief Sets number of rows to read. * - * @param rows Number of rows to read. 
- * @return this for chaining. + * @param rows Number of rows to read + * @return this for chaining */ csv_reader_options_builder& nrows(size_type rows) { @@ -852,8 +934,8 @@ class csv_reader_options_builder { /** * @brief Sets number of rows to skip from start. * - * @param skip Number of rows to skip. - * @return this for chaining. + * @param skip Number of rows to skip + * @return this for chaining */ csv_reader_options_builder& skiprows(size_type skip) { @@ -864,8 +946,8 @@ class csv_reader_options_builder { /** * @brief Sets number of rows to skip from end. * - * @param skip Number of rows to skip. - * @return this for chaining. + * @param skip Number of rows to skip + * @return this for chaining */ csv_reader_options_builder& skipfooter(size_type skip) { @@ -876,8 +958,8 @@ class csv_reader_options_builder { /** * @brief Sets header row index. * - * @param hdr Index where header row is located. - * @return this for chaining. + * @param hdr Index where header row is located + * @return this for chaining */ csv_reader_options_builder& header(size_type hdr) { @@ -888,8 +970,8 @@ class csv_reader_options_builder { /** * @brief Sets line terminator. * - * @param term A character to indicate line termination. - * @return this for chaining. + * @param term A character to indicate line termination + * @return this for chaining */ csv_reader_options_builder& lineterminator(char term) { @@ -900,8 +982,8 @@ class csv_reader_options_builder { /** * @brief Sets field delimiter * - * @param delim A character to indicate delimiter. - * @return this for chaining. + * @param delim A character to indicate delimiter + * @return this for chaining */ csv_reader_options_builder& delimiter(char delim) { @@ -912,8 +994,8 @@ class csv_reader_options_builder { /** * @brief Sets numeric data thousands separator. * - * @param val A character that separates thousands. - * @return this for chaining. 
+ * @param val A character that separates thousands + * @return this for chaining */ csv_reader_options_builder& thousands(char val) { @@ -924,8 +1006,8 @@ class csv_reader_options_builder { /** * @brief Sets decimal point character. * - * @param val A character that indicates decimal values. - * @return this for chaining. + * @param val A character that indicates decimal values + * @return this for chaining */ csv_reader_options_builder& decimal(char val) { @@ -936,8 +1018,8 @@ class csv_reader_options_builder { /** * @brief Sets comment line start character. * - * @param val A character that indicates comment. - * @return this for chaining. + * @param val A character that indicates comment + * @return this for chaining */ csv_reader_options_builder& comment(char val) { @@ -948,8 +1030,8 @@ class csv_reader_options_builder { /** * @brief Sets whether to treat `\r\n` as line terminator. * - * @param val Boolean value to enable/disable. - * @return this for chaining. + * @param val Boolean value to enable/disable + * @return this for chaining */ csv_reader_options_builder& windowslinetermination(bool val) { @@ -960,8 +1042,8 @@ class csv_reader_options_builder { /** * @brief Sets whether to treat whitespace as field delimiter. * - * @param val Boolean value to enable/disable. - * @return this for chaining. + * @param val Boolean value to enable/disable + * @return this for chaining */ csv_reader_options_builder& delim_whitespace(bool val) { @@ -972,8 +1054,8 @@ class csv_reader_options_builder { /** * @brief Sets whether to skip whitespace after the delimiter. * - * @param val Boolean value to enable/disable. - * @return this for chaining. + * @param val Boolean value to enable/disable + * @return this for chaining */ csv_reader_options_builder& skipinitialspace(bool val) { @@ -984,8 +1066,8 @@ class csv_reader_options_builder { /** * @brief Sets whether to ignore empty lines or parse line values as invalid. * - * @param val Boolean value to enable/disable. 
- * @return this for chaining. + * @param val Boolean value to enable/disable + * @return this for chaining */ csv_reader_options_builder& skip_blank_lines(bool val) { @@ -996,8 +1078,8 @@ class csv_reader_options_builder { /** * @brief Sets quoting style. * - * @param style Quoting style used. - * @return this for chaining. + * @param style Quoting style used + * @return this for chaining */ csv_reader_options_builder& quoting(quote_style style) { @@ -1008,8 +1090,8 @@ class csv_reader_options_builder { /** * @brief Sets quoting character. * - * @param ch A character to indicate quoting. - * @return this for chaining. + * @param ch A character to indicate quoting + * @return this for chaining */ csv_reader_options_builder& quotechar(char ch) { @@ -1020,8 +1102,8 @@ class csv_reader_options_builder { /** * @brief Sets a quote inside a value is double-quoted. * - * @param val Boolean value to enable/disable. - * @return this for chaining. + * @param val Boolean value to enable/disable + * @return this for chaining */ csv_reader_options_builder& doublequote(bool val) { @@ -1032,8 +1114,8 @@ class csv_reader_options_builder { /** * @brief Sets names of columns to read as datetime. * - * @param col_names Vector of column names to read as datetime. - * @return this for chaining. + * @param col_names Vector of column names to read as datetime + * @return this for chaining */ csv_reader_options_builder& parse_dates(std::vector col_names) { @@ -1045,7 +1127,7 @@ class csv_reader_options_builder { * @brief Sets indexes of columns to read as datetime. * * @param col_indices Vector of column indices to read as datetime - * @return this for chaining. + * @return this for chaining */ csv_reader_options_builder& parse_dates(std::vector col_indices) { @@ -1057,7 +1139,7 @@ class csv_reader_options_builder { * @brief Sets names of columns to parse as hexadecimal. * * @param col_names Vector of column names to parse as hexadecimal - * @return this for chaining. 
+ * @return this for chaining */ csv_reader_options_builder& parse_hex(std::vector col_names) { @@ -1069,7 +1151,7 @@ class csv_reader_options_builder { * @brief Sets indexes of columns to parse as hexadecimal. * * @param col_indices Vector of column indices to parse as hexadecimal - * @return this for chaining. + * @return this for chaining */ csv_reader_options_builder& parse_hex(std::vector col_indices) { @@ -1081,7 +1163,7 @@ class csv_reader_options_builder { * @brief Sets per-column types. * * @param types Column name -> data type map specifying the columns' target data types - * @return this for chaining. + * @return this for chaining */ csv_reader_options_builder& dtypes(std::map types) { @@ -1092,8 +1174,8 @@ class csv_reader_options_builder { /** * @brief Sets per-column types. * - * @param types Vector of data types in which the column needs to be read. - * @return this for chaining. + * @param types Vector of data types in which the column needs to be read + * @return this for chaining */ csv_reader_options_builder& dtypes(std::vector types) { @@ -1104,8 +1186,8 @@ class csv_reader_options_builder { /** * @brief Sets additional values to recognize as boolean true values. * - * @param vals Vector of values to be considered to be `true`. - * @return this for chaining. + * @param vals Vector of values to be considered to be `true` + * @return this for chaining */ csv_reader_options_builder& true_values(std::vector vals) { @@ -1116,8 +1198,8 @@ class csv_reader_options_builder { /** * @brief Sets additional values to recognize as boolean false values. * - * @param vals Vector of values to be considered to be `false`. - * @return this for chaining. + * @param vals Vector of values to be considered to be `false` + * @return this for chaining */ csv_reader_options_builder& false_values(std::vector vals) { @@ -1128,8 +1210,8 @@ class csv_reader_options_builder { /** * @brief Sets additional values to recognize as null values. 
* - * @param vals Vector of values to be considered to be null. - * @return this for chaining. + * @param vals Vector of values to be considered to be null + * @return this for chaining */ csv_reader_options_builder& na_values(std::vector vals) { @@ -1140,8 +1222,8 @@ class csv_reader_options_builder { /** * @brief Sets whether to keep the built-in default NA values. * - * @param val Boolean value to enable/disable. - * @return this for chaining. + * @param val Boolean value to enable/disable + * @return this for chaining */ csv_reader_options_builder& keep_default_na(bool val) { @@ -1152,8 +1234,8 @@ class csv_reader_options_builder { /** * @brief Sets whether to disable null filter. * - * @param val Boolean value to enable/disable. - * @return this for chaining. + * @param val Boolean value to enable/disable + * @return this for chaining */ csv_reader_options_builder& na_filter(bool val) { @@ -1164,8 +1246,8 @@ class csv_reader_options_builder { /** * @brief Sets whether to parse dates as DD/MM versus MM/DD. * - * @param val Boolean value to enable/disable. - * @return this for chaining. + * @param val Boolean value to enable/disable + * @return this for chaining */ csv_reader_options_builder& dayfirst(bool val) { @@ -1176,8 +1258,8 @@ class csv_reader_options_builder { /** * @brief Sets timestamp_type to which all timestamp columns will be cast. * - * @param type Dtype to which all timestamp column will be cast. - * @return this for chaining. + * @param type Dtype to which all timestamp column will be cast + * @return this for chaining */ csv_reader_options_builder& timestamp_type(data_type type) { @@ -1194,6 +1276,8 @@ class csv_reader_options_builder { * @brief move csv_reader_options member once it's built. * * This has been added since Cython does not support overloading of conversion operators. 
+ * + * @return Built `csv_reader_options` object's r-value reference */ csv_reader_options&& build() { return std::move(options); } }; @@ -1208,11 +1292,11 @@ class csv_reader_options_builder { * auto result = cudf::io::read_csv(options); * @endcode * - * @param options Settings for controlling reading behavior. - * @param mr Device memory resource used to allocate device memory of the table in the returned. + * @param options Settings for controlling reading behavior + * @param mr Device memory resource used to allocate device memory of the table in the returned * table_with_metadata * - * @return The set of columns along with metadata. + * @return The set of columns along with metadata */ table_with_metadata read_csv( csv_reader_options options, @@ -1258,8 +1342,8 @@ class csv_writer_options { /** * @brief Constructor from sink and table. * - * @param sink The sink used for writer output. - * @param table Table to be written to output. + * @param sink The sink used for writer output + * @param table Table to be written to output */ explicit csv_writer_options(sink_info const& sink, table_view const& table) : _sink(sink), _table(table), _rows_per_chunk(table.num_rows()) @@ -1279,60 +1363,80 @@ class csv_writer_options { /** * @brief Create builder to create `csv_writer_options`. * - * @param sink The sink used for writer output. - * @param table Table to be written to output. + * @param sink The sink used for writer output + * @param table Table to be written to output * - * @return Builder to build csv_writer_options. + * @return Builder to build csv_writer_options */ static csv_writer_options_builder builder(sink_info const& sink, table_view const& table); /** * @brief Returns sink used for writer output. + * + * @return sink used for writer output */ [[nodiscard]] sink_info const& get_sink() const { return _sink; } /** * @brief Returns table that would be written to output. 
+ * + * @return Table that would be written to output */ [[nodiscard]] table_view const& get_table() const { return _table; } /** * @brief Returns optional associated metadata. + * + * @return Optional associated metadata */ [[nodiscard]] table_metadata const* get_metadata() const { return _metadata; } /** * @brief Returns string to used for null entries. + * + * @return String to use for null entries */ [[nodiscard]] std::string get_na_rep() const { return _na_rep; } /** * @brief Whether to write headers to csv. + * + * @return `true` if writing headers to csv */ [[nodiscard]] bool is_enabled_include_header() const { return _include_header; } /** * @brief Returns maximum number of rows to process for each file write. + * + * @return Maximum number of rows to process for each file write */ [[nodiscard]] size_type get_rows_per_chunk() const { return _rows_per_chunk; } /** * @brief Returns character used for separating lines. + * + * @return Character used for separating lines */ [[nodiscard]] std::string get_line_terminator() const { return _line_terminator; } /** * @brief Returns character used for separating lines. + * + * @return Character used for separating columns */ [[nodiscard]] char get_inter_column_delimiter() const { return _inter_column_delimiter; } /** * @brief Returns string used for values != 0 in INT8 types. + * + * @return string used for values != 0 in INT8 types */ [[nodiscard]] std::string get_true_value() const { return _true_value; } /** * @brief Returns string used for values == 0 in INT8 types. + * + * @return string used for values == 0 in INT8 types */ [[nodiscard]] std::string get_false_value() const { return _false_value; } @@ -1340,62 +1444,65 @@ class csv_writer_options { /** * @brief Sets optional associated metadata. * - @param metadata Associated metadata. + @param metadata Associated metadata */ void set_metadata(table_metadata* metadata) { _metadata = metadata; } /** * @brief Sets string to used for null entries.
* - * @param val String to represent null value. + * @param val String to represent null value */ void set_na_rep(std::string val) { _na_rep = val; } /** * @brief Enables/Disables headers being written to csv. * - * @param val Boolean value to enable/disable. + * @param val Boolean value to enable/disable */ void enable_include_header(bool val) { _include_header = val; } /** * @brief Sets maximum number of rows to process for each file write. * - * @param val Number of rows per chunk. + * @param val Number of rows per chunk */ void set_rows_per_chunk(size_type val) { _rows_per_chunk = val; } /** * @brief Sets character used for separating lines. * - * @param term Character to represent line termination. + * @param term Character to represent line termination */ void set_line_terminator(std::string term) { _line_terminator = term; } /** * @brief Sets character used for separating lines. * - * @param delim Character to indicate delimiting. + * @param delim Character to indicate delimiting */ void set_inter_column_delimiter(char delim) { _inter_column_delimiter = delim; } /** * @brief Sets string used for values != 0 in INT8 types. * - * @param val String to represent values != 0 in INT8 types. + * @param val String to represent values != 0 in INT8 types */ void set_true_value(std::string val) { _true_value = val; } /** * @brief Sets string used for values == 0 in INT8 types. * - * @param val String to represent values == 0 in INT8 types. + * @param val String to represent values == 0 in INT8 types */ void set_false_value(std::string val) { _false_value = val; } }; +/** + * @brief Builder to build options for `write_csv()`. + */ class csv_writer_options_builder { - csv_writer_options options; + csv_writer_options options; ///< Options to be built. public: /** @@ -1408,8 +1515,8 @@ class csv_writer_options_builder { /** * @brief Constructor from sink and table. * - * @param sink The sink used for writer output. - * @param table Table to be written to output.
+ * @param sink The sink used for writer output + * @param table Table to be written to output */ explicit csv_writer_options_builder(sink_info const& sink, table_view const& table) : options{sink, table} @@ -1419,8 +1526,8 @@ class csv_writer_options_builder { /** * @brief Sets optional associated metadata. * - * @param metadata Associated metadata. - * @return this for chaining. + * @param metadata Associated metadata + * @return this for chaining */ csv_writer_options_builder& metadata(table_metadata* metadata) { @@ -1431,8 +1538,8 @@ class csv_writer_options_builder { /** * @brief Sets string to used for null entries. * - * @param val String to represent null value. - * @return this for chaining. + * @param val String to represent null value + * @return this for chaining */ csv_writer_options_builder& na_rep(std::string val) { @@ -1443,8 +1550,8 @@ class csv_writer_options_builder { /** * @brief Enables/Disables headers being written to csv. * - * @param val Boolean value to enable/disable. - * @return this for chaining. + * @param val Boolean value to enable/disable + * @return this for chaining */ csv_writer_options_builder& include_header(bool val) { @@ -1455,8 +1562,8 @@ class csv_writer_options_builder { /** * @brief Sets maximum number of rows to process for each file write. * - * @param val Number of rows per chunk. - * @return this for chaining. + * @param val Number of rows per chunk + * @return this for chaining */ csv_writer_options_builder& rows_per_chunk(int val) { @@ -1467,8 +1574,8 @@ class csv_writer_options_builder { /** * @brief Sets character used for separating lines. * - * @param term Character to represent line termination. - * @return this for chaining. + * @param term Character to represent line termination + * @return this for chaining */ csv_writer_options_builder& line_terminator(std::string term) { @@ -1479,8 +1586,8 @@ class csv_writer_options_builder { /** * @brief Sets character used for separating lines. 
* - * @param delim Character to indicate delimiting. - * @return this for chaining. + * @param delim Character to indicate delimiting + * @return this for chaining */ csv_writer_options_builder& inter_column_delimiter(char delim) { @@ -1491,8 +1598,8 @@ class csv_writer_options_builder { /** * @brief Sets string used for values != 0 in INT8 types. * - * @param val String to represent values != 0 in INT8 types. - * @return this for chaining. + * @param val String to represent values != 0 in INT8 types + * @return this for chaining */ csv_writer_options_builder& true_value(std::string val) { @@ -1503,8 +1610,8 @@ class csv_writer_options_builder { /** * @brief Sets string used for values == 0 in INT8 types. * - * @param val String to represent values == 0 in INT8 types. - * @return this for chaining. + * @param val String to represent values == 0 in INT8 types + * @return this for chaining */ csv_writer_options_builder& false_value(std::string val) { @@ -1521,6 +1628,8 @@ class csv_writer_options_builder { * @brief move `csv_writer_options` member once it's built. * * This has been added since Cython does not support overloading of conversion operators. + * + * @return Built `csv_writer_options` object's r-value reference */ csv_writer_options&& build() { return std::move(options); } }; @@ -1539,8 +1648,8 @@ class csv_writer_options_builder { * cudf::io::write_csv(options); * @endcode * - * @param options Settings for controlling writing behavior. - * @param mr Device memory resource to use for device memory allocation. 
+ * @param options Settings for controlling writing behavior + * @param mr Device memory resource to use for device memory allocation */ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index e2d4de83b49..9ccb5ec4d58 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -36,9 +36,6 @@ namespace io { * @file */ -/** - * @brief Builds settings to use for `read_json()`. - */ class json_reader_options_builder; /** @@ -86,7 +83,7 @@ class json_reader_options { /** * @brief Constructor from source info. * - * @param src source information used to read parquet file. + * @param src source information used to read json file */ explicit json_reader_options(const source_info& src) : _source(src) {} @@ -103,18 +100,22 @@ class json_reader_options { /** * @brief create json_reader_options_builder which will build json_reader_options. * - * @param src source information used to read json file. - * @returns builder to build the options. + * @param src source information used to read json file + * @returns builder to build the options */ static json_reader_options_builder builder(source_info const& src); /** * @brief Returns source info. + * + * @returns Source info */ [[nodiscard]] source_info const& get_source() const { return _source; } /** * @brief Returns data types of the columns. + * + * @returns Data types of the columns */ std::variant, std::map> const& get_dtypes() const { @@ -123,21 +124,29 @@ class json_reader_options { /** * @brief Returns compression format of the source. + * + * @return Compression format of the source */ compression_type get_compression() const { return _compression; } /** * @brief Returns number of bytes to skip from source start.
+ * + * @return Number of bytes to skip from source start */ size_t get_byte_range_offset() const { return _byte_range_offset; } /** * @brief Returns number of bytes to read. + * + * @return Number of bytes to read */ size_t get_byte_range_size() const { return _byte_range_size; } /** * @brief Returns number of bytes to read with padding. + * + * @return Number of bytes to read with padding */ size_t get_byte_range_size_with_padding() const { @@ -150,6 +159,8 @@ class json_reader_options { /** * @brief Returns number of bytes to pad when reading. + * + * @return Number of bytes to pad */ size_t get_byte_range_padding() const { @@ -170,11 +181,15 @@ class json_reader_options { /** * @brief Whether to read the file as a json object per line. + * + * @return `true` if reading the file as a json object per line */ bool is_enabled_lines() const { return _lines; } /** * @brief Whether to parse dates as DD/MM versus MM/DD. + * + * @returns true if dates are parsed as DD/MM, false if MM/DD */ bool is_enabled_dayfirst() const { return _dayfirst; } @@ -188,46 +203,49 @@ class json_reader_options { /** * @brief Set data types for columns to be read. * - * @param types Vector dtypes in string format. + * @param types Vector dtypes in string format */ void set_dtypes(std::map types) { _dtypes = std::move(types); } /** * @brief Set the compression type. * - * @param comp_type The compression type used. + * @param comp_type The compression type used */ void set_compression(compression_type comp_type) { _compression = comp_type; } /** * @brief Set number of bytes to skip from source start. * - * @param offset Number of bytes of offset. + * @param offset Number of bytes of offset */ void set_byte_range_offset(size_type offset) { _byte_range_offset = offset; } /** * @brief Set number of bytes to read. * - * @param size Number of bytes to read. 
+ * @param size Number of bytes to read */ void set_byte_range_size(size_type size) { _byte_range_size = size; } /** * @brief Set whether to read the file as a json object per line. * - * @param val Boolean value to enable/disable the option to read each line as a json object. + * @param val Boolean value to enable/disable the option to read each line as a json object */ void enable_lines(bool val) { _lines = val; } /** * @brief Set whether to parse dates as DD/MM versus MM/DD. * - * @param val Boolean value to enable/disable day first parsing format. + * @param val Boolean value to enable/disable day first parsing format */ void enable_dayfirst(bool val) { _dayfirst = val; } }; +/** + * @brief Builds settings to use for `read_json()`. + */ class json_reader_options_builder { json_reader_options options; @@ -242,7 +260,7 @@ class json_reader_options_builder { /** * @brief Constructor from source info. * - * @param src The source information used to read avro file. + * @param src The source information used to read json file */ explicit json_reader_options_builder(source_info const& src) : options(src) {} @@ -261,7 +279,7 @@ class json_reader_options_builder { /** * @brief Set data types for columns to be read. * - * @param types Column name -> dtype map. + * @param types Column name -> dtype map * @return this for chaining */ json_reader_options_builder& dtypes(std::map types) @@ -273,8 +291,8 @@ class json_reader_options_builder { /** * @brief Set the compression type. * - * @param comp_type The compression type used. - * @return this for chaining. + * @param comp_type The compression type used + * @return this for chaining */ json_reader_options_builder& compression(compression_type comp_type) { @@ -285,8 +303,8 @@ class json_reader_options_builder { /** * @brief Set number of bytes to skip from source start. * - * @param offset Number of bytes of offset. - * @return this for chaining.
+ * @param offset Number of bytes of offset + * @return this for chaining */ json_reader_options_builder& byte_range_offset(size_type offset) { @@ -297,7 +315,7 @@ class json_reader_options_builder { /** * @brief Set number of bytes to read. * - * @param size Number of bytes to read. + * @param size Number of bytes to read * @return this for chaining */ json_reader_options_builder& byte_range_size(size_type size) @@ -309,8 +327,8 @@ class json_reader_options_builder { /** * @brief Set whether to read the file as a json object per line. * - * @param val Boolean value to enable/disable the option to read each line as a json object. - * @return this for chaining. + * @param val Boolean value to enable/disable the option to read each line as a json object + * @return this for chaining */ json_reader_options_builder& lines(bool val) { @@ -321,8 +339,8 @@ class json_reader_options_builder { /** * @brief Set whether to parse dates as DD/MM versus MM/DD. * - * @param val Boolean value to enable/disable day first parsing format. - * @return this for chaining. + * @param val Boolean value to enable/disable day first parsing format + * @return this for chaining */ json_reader_options_builder& dayfirst(bool val) { @@ -339,6 +357,8 @@ class json_reader_options_builder { * @brief move json_reader_options member once it's built. * * This has been added since Cython does not support overloading of conversion operators. + * + * @return Built `json_reader_options` object r-value reference */ json_reader_options&& build() { return std::move(options); } }; @@ -353,11 +373,11 @@ class json_reader_options_builder { * auto result = cudf::io::read_json(options); * @endcode * - * @param options Settings for controlling reading behavior. + * @param options Settings for controlling reading behavior * @param mr Device memory resource used to allocate device memory of the table in the returned * table_with_metadata. * - * @return The set of columns along with metadata. 
+ * @return The set of columns along with metadata */ table_with_metadata read_json( json_reader_options options, diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 9e8fd1244d0..e9b6818099e 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -34,9 +34,9 @@ namespace io { * @file */ -constexpr size_t default_stripe_size_bytes = 64 * 1024 * 1024; -constexpr size_type default_stripe_size_rows = 1000000; -constexpr size_type default_row_index_stride = 10000; +constexpr size_t default_stripe_size_bytes = 64 * 1024 * 1024; ///< 64MB default orc stripe size +constexpr size_type default_stripe_size_rows = 1000000; ///< 1M rows default orc stripe rows +constexpr size_type default_row_index_stride = 10000; ///< 10K rows default orc row index stride /** * @brief Builds settings to use for `read_orc()`. @@ -75,7 +75,7 @@ class orc_reader_options { /** * @brief Constructor from source info. * - * @param src source information used to read orc file. + * @param src source information used to read orc file */ explicit orc_reader_options(source_info const& src) : _source(src) {} @@ -90,53 +90,71 @@ class orc_reader_options { /** * @brief Creates `orc_reader_options_builder` which will build `orc_reader_options`. * - * @param src Source information to read orc file. - * @return Builder to build reader options. + * @param src Source information to read orc file + * @return Builder to build reader options */ static orc_reader_options_builder builder(source_info const& src); /** * @brief Returns source info. + * + * @return Source info */ [[nodiscard]] source_info const& get_source() const { return _source; } /** * @brief Returns names of the columns to read. 
+ * + * @return Names of the columns to read */ [[nodiscard]] std::vector const& get_columns() const { return _columns; } /** * @brief Returns vector of vectors, stripes to read for each input source + * + * @return Vector of vectors, stripes to read for each input source */ std::vector> const& get_stripes() const { return _stripes; } /** * @brief Returns number of rows to skip from the start. + * + * @return Number of rows to skip from the start */ size_type get_skip_rows() const { return _skip_rows; } /** * @brief Returns number of row to read. + * + * @return Number of rows to read */ size_type get_num_rows() const { return _num_rows; } /** * @brief Whether to use row index to speed-up reading. + * + * @return `true` if row index is used to speed-up reading */ bool is_enabled_use_index() const { return _use_index; } /** * @brief Whether to use numpy-compatible dtypes. + * + * @return `true` if numpy-compatible dtypes are used */ bool is_enabled_use_np_dtypes() const { return _use_np_dtypes; } /** * @brief Returns timestamp type to which timestamp column will be cast. + * + * @return Timestamp type to which timestamp column will be cast */ data_type get_timestamp_type() const { return _timestamp_type; } /** - * @brief Fully qualified names of columns that should be read as 128-bit Decimal. + * @brief Returns fully qualified names of columns that should be read as 128-bit Decimal. + * + * @return Fully qualified names of columns that should be read as 128-bit Decimal */ std::vector const& get_decimal128_columns() const { return _decimal128_columns; } @@ -145,7 +163,7 @@ class orc_reader_options { /** * @brief Sets names of the column to read. * - * @param col_names Vector of column names. + * @param col_names Vector of column names */ void set_columns(std::vector col_names) { _columns = std::move(col_names); } @@ -164,7 +182,7 @@ class orc_reader_options { /** * @brief Sets number of rows to skip from the start. * - * @param rows Number of rows.
+ * @param rows Number of rows */ void set_skip_rows(size_type rows) { @@ -175,7 +193,7 @@ class orc_reader_options { /** * @brief Sets number of row to read. * - * @param nrows Number of rows. + * @param nrows Number of rows */ void set_num_rows(size_type nrows) { @@ -186,28 +204,28 @@ class orc_reader_options { /** * @brief Enable/Disable use of row index to speed-up reading. * - * @param use Boolean value to enable/disable row index use. + * @param use Boolean value to enable/disable row index use */ void enable_use_index(bool use) { _use_index = use; } /** * @brief Enable/Disable use of numpy-compatible dtypes * - * @param use Boolean value to enable/disable. + * @param use Boolean value to enable/disable */ void enable_use_np_dtypes(bool use) { _use_np_dtypes = use; } /** * @brief Sets timestamp type to which timestamp column will be cast. * - * @param type Type of timestamp. + * @param type Type of timestamp */ void set_timestamp_type(data_type type) { _timestamp_type = type; } /** * @brief Set columns that should be read as 128-bit Decimal * - * @param val Vector of fully qualified column names. + * @param val Vector of fully qualified column names */ void set_decimal128_columns(std::vector val) { @@ -215,6 +233,9 @@ class orc_reader_options { } }; +/** + * @brief Builds settings to use for `read_orc()`. + */ class orc_reader_options_builder { orc_reader_options options; @@ -229,15 +250,15 @@ class orc_reader_options_builder { /** * @brief Constructor from source info. * - * @param src The source information used to read orc file. + * @param src The source information used to read orc file */ explicit orc_reader_options_builder(source_info const& src) : options{src} {}; /** * @brief Sets names of the column to read. * - * @param col_names Vector of column names. - * @return this for chaining. 
+ * @param col_names Vector of column names + * @return this for chaining */ orc_reader_options_builder& columns(std::vector col_names) { @@ -249,7 +270,7 @@ class orc_reader_options_builder { * @brief Sets list of individual stripes to read per source * * @param stripes Vector of vectors, mapping stripes to read to input sources - * @return this for chaining. + * @return this for chaining */ orc_reader_options_builder& stripes(std::vector> stripes) { @@ -260,8 +281,8 @@ class orc_reader_options_builder { /** * @brief Sets number of rows to skip from the start. * - * @param rows Number of rows. - * @return this for chaining. + * @param rows Number of rows + * @return this for chaining */ orc_reader_options_builder& skip_rows(size_type rows) { @@ -272,8 +293,8 @@ class orc_reader_options_builder { /** * @brief Sets number of row to read. * - * @param nrows Number of rows. - * @return this for chaining. + * @param nrows Number of rows + * @return this for chaining */ orc_reader_options_builder& num_rows(size_type nrows) { @@ -284,8 +305,8 @@ class orc_reader_options_builder { /** * @brief Enable/Disable use of row index to speed-up reading. * - * @param use Boolean value to enable/disable row index use. - * @return this for chaining. + * @param use Boolean value to enable/disable row index use + * @return this for chaining */ orc_reader_options_builder& use_index(bool use) { @@ -296,8 +317,8 @@ class orc_reader_options_builder { /** * @brief Enable/Disable use of numpy-compatible dtypes. * - * @param use Boolean value to enable/disable. - * @return this for chaining. + * @param use Boolean value to enable/disable + * @return this for chaining */ orc_reader_options_builder& use_np_dtypes(bool use) { @@ -308,8 +329,8 @@ class orc_reader_options_builder { /** * @brief Sets timestamp type to which timestamp column will be cast. * - * @param type Type of timestamp. - * @return this for chaining. 
+ * @param type Type of timestamp + * @return this for chaining */ orc_reader_options_builder& timestamp_type(data_type type) { @@ -320,8 +341,8 @@ class orc_reader_options_builder { /** * @brief Columns that should be read as 128-bit Decimal * - * @param val Vector of column names. - * @return this for chaining. + * @param val Vector of column names + * @return this for chaining */ orc_reader_options_builder& decimal128_columns(std::vector val) { @@ -338,6 +359,8 @@ class orc_reader_options_builder { * @brief move orc_reader_options member once it's built. * * This has been added since Cython does not support overloading of conversion operators. + * + * @return Built `orc_reader_options` object's r-value reference */ orc_reader_options&& build() { return std::move(options); } }; @@ -355,11 +378,11 @@ class orc_reader_options_builder { * Note: Support for reading files with struct columns is currently experimental, the output may not * be as reliable as reading for other datatypes. * - * @param options Settings for controlling reading behavior. + * @param options Settings for controlling reading behavior * @param mr Device memory resource used to allocate device memory of the table in the returned * table_with_metadata. * - * @return The set of columns. + * @return The set of columns */ table_with_metadata read_orc( orc_reader_options const& options, @@ -417,8 +440,8 @@ class orc_writer_options { /** * @brief Constructor from sink and table. * - * @param sink The sink used for writer output. - * @param table Table to be written to output. + * @param sink The sink used for writer output + * @param table Table to be written to output */ explicit orc_writer_options(sink_info const& sink, table_view const& table) : _sink(sink), _table(table) @@ -436,25 +459,31 @@ class orc_writer_options { /** * @brief Create builder to create `orc_writer_options`. * - * @param sink The sink used for writer output. - * @param table Table to be written to output. 
+ * @param sink The sink used for writer output + * @param table Table to be written to output * - * @return Builder to build `orc_writer_options`. + * @return Builder to build `orc_writer_options` */ static orc_writer_options_builder builder(sink_info const& sink, table_view const& table); /** * @brief Returns sink info. + * + * @return Sink info */ [[nodiscard]] sink_info const& get_sink() const { return _sink; } /** * @brief Returns compression type. + * + * @return Compression type */ [[nodiscard]] compression_type get_compression() const { return _compression; } /** * @brief Whether writing column statistics is enabled/disabled. + * + * @return `true` if writing column statistics is enabled */ [[nodiscard]] bool is_enabled_statistics() const { @@ -463,21 +492,29 @@ class orc_writer_options { /** * @brief Returns frequency of statistics collection. + * + * @return Frequency of statistics collection */ [[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; } /** * @brief Returns maximum stripe size, in bytes. + * + * @return Maximum stripe size, in bytes */ [[nodiscard]] auto get_stripe_size_bytes() const { return _stripe_size_bytes; } /** * @brief Returns maximum stripe size, in rows. + * + * @return Maximum stripe size, in rows */ [[nodiscard]] auto get_stripe_size_rows() const { return _stripe_size_rows; } /** * @brief Returns the row index stride. + * + * @return Row index stride */ auto get_row_index_stride() const { @@ -487,16 +524,22 @@ class orc_writer_options { /** * @brief Returns table to be written to output. + * + * @return Table to be written to output */ [[nodiscard]] table_view get_table() const { return _table; } /** * @brief Returns associated metadata. + * + * @return Associated metadata */ [[nodiscard]] table_input_metadata const* get_metadata() const { return _metadata; } /** * @brief Returns Key-Value footer metadata information. 
+ * + * @return Key-Value footer metadata information */ [[nodiscard]] std::map const& get_key_value_metadata() const { @@ -508,7 +551,7 @@ class orc_writer_options { /** * @brief Sets compression type. * - * @param comp Compression type. + * @param comp Compression type */ void set_compression(compression_type comp) { _compression = comp; } @@ -520,12 +563,14 @@ class orc_writer_options { * - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe. * - cudf::io::ORC_STATISTICS_ROWGROUP: Statistics are collected for each ORC row group. * - * @param val Frequency of statistics collection. + * @param val Frequency of statistics collection */ void enable_statistics(statistics_freq val) { _stats_freq = val; } /** * @brief Sets the maximum stripe size, in bytes. + * + * @param size_bytes Maximum stripe size, in bytes to be set */ void set_stripe_size_bytes(size_t size_bytes) { @@ -538,6 +583,8 @@ class orc_writer_options { * * If the stripe size is smaller that the row group size, row group size will be reduced to math * the stripe size. + * + * @param size_rows Maximum stripe size, in rows to be set */ void set_stripe_size_rows(size_type size_rows) { @@ -549,6 +596,8 @@ class orc_writer_options { * @brief Sets the row index stride. * * Rounded down to a multiple of 8. + * + * @param stride Row index stride to be set */ void set_row_index_stride(size_type stride) { @@ -559,14 +608,14 @@ class orc_writer_options { /** * @brief Sets table to be written to output. * - * @param tbl Table for the output. + * @param tbl Table for the output */ void set_table(table_view tbl) { _table = tbl; } /** * @brief Sets associated metadata * - * @param meta Associated metadata. + * @param meta Associated metadata */ void set_metadata(table_input_metadata const* meta) { _metadata = meta; } @@ -581,6 +630,9 @@ class orc_writer_options { } }; +/** + * @brief Builds settings to use for `write_orc()`. 
+ */ class orc_writer_options_builder { orc_writer_options options; @@ -595,8 +647,8 @@ class orc_writer_options_builder { /** * @brief Constructor from sink and table. * - * @param sink The sink used for writer output. - * @param table Table to be written to output. + * @param sink The sink used for writer output + * @param table Table to be written to output */ orc_writer_options_builder(sink_info const& sink, table_view const& table) : options{sink, table} { @@ -605,8 +657,8 @@ class orc_writer_options_builder { /** * @brief Sets compression type. * - * @param comp The compression type to use. - * @return this for chaining. + * @param comp The compression type to use + * @return this for chaining */ orc_writer_options_builder& compression(compression_type comp) { @@ -622,8 +674,8 @@ class orc_writer_options_builder { * - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe. * - cudf::io::ORC_STATISTICS_ROWGROUP: Statistics are collected for each ORC row group. * - * @param val Level of statistics collection. - * @return this for chaining. + * @param val Level of statistics collection + * @return this for chaining */ orc_writer_options_builder& enable_statistics(statistics_freq val) { @@ -635,7 +687,7 @@ class orc_writer_options_builder { * @brief Sets the maximum stripe size, in bytes. * * @param val maximum stripe size - * @return this for chaining. + * @return this for chaining */ orc_writer_options_builder& stripe_size_bytes(size_t val) { @@ -647,7 +699,7 @@ class orc_writer_options_builder { * @brief Sets the maximum number of rows in output stripes. * * @param val maximum number or rows - * @return this for chaining. + * @return this for chaining */ orc_writer_options_builder& stripe_size_rows(size_type val) { @@ -659,7 +711,7 @@ class orc_writer_options_builder { * @brief Sets the row index stride. * * @param val new row index stride - * @return this for chaining. 
+ * @return this for chaining */ orc_writer_options_builder& row_index_stride(size_type val) { @@ -670,8 +722,8 @@ class orc_writer_options_builder { /** * @brief Sets table to be written to output. * - * @param tbl Table for the output. - * @return this for chaining. + * @param tbl Table for the output + * @return this for chaining */ orc_writer_options_builder& table(table_view tbl) { @@ -682,8 +734,8 @@ class orc_writer_options_builder { /** * @brief Sets associated metadata. * - * @param meta Associated metadata. - * @return this for chaining. + * @param meta Associated metadata + * @return this for chaining */ orc_writer_options_builder& metadata(table_input_metadata const* meta) { @@ -695,7 +747,7 @@ class orc_writer_options_builder { * @brief Sets Key-Value footer metadata. * * @param metadata Key-Value footer metadata - * @return this for chaining. + * @return this for chaining */ orc_writer_options_builder& key_value_metadata(std::map metadata) { @@ -712,6 +764,8 @@ class orc_writer_options_builder { * @brief move orc_writer_options member once it's built. * * This has been added since Cython does not support overloading of conversion operators. + * + * @return Built `orc_writer_options` object's r-value reference */ orc_writer_options&& build() { return std::move(options); } }; @@ -729,8 +783,8 @@ class orc_writer_options_builder { * Note: Support for writing tables with struct columns is currently experimental, the output may * not be as reliable as writing for other datatypes. * - * @param options Settings for controlling reading behavior. - * @param mr Device memory resource to use for device memory allocation. 
+ * @param options Settings for controlling writing behavior + * @param mr Device memory resource to use for device memory allocation */ void write_orc(orc_writer_options const& options, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -766,7 +820,7 @@ class chunked_orc_writer_options { /** * @brief Constructor from sink and table. * - * @param sink The sink used for writer output. + * @param sink The sink used for writer output */ chunked_orc_writer_options(sink_info const& sink) : _sink(sink) {} @@ -781,39 +835,51 @@ class chunked_orc_writer_options { /** * @brief Create builder to create `chunked_orc_writer_options`. * - * @param sink The sink used for writer output. + * @param sink The sink used for writer output * - * @return Builder to build chunked_orc_writer_options. + * @return Builder to build chunked_orc_writer_options */ static chunked_orc_writer_options_builder builder(sink_info const& sink); /** * @brief Returns sink info. + * + * @return Sink info */ [[nodiscard]] sink_info const& get_sink() const { return _sink; } /** * @brief Returns compression type. + * + * @return Compression type */ [[nodiscard]] compression_type get_compression() const { return _compression; } /** * @brief Returns granularity of statistics collection. + * + * @return Granularity of statistics collection */ [[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; } /** * @brief Returns maximum stripe size, in bytes. + * + * @return Maximum stripe size, in bytes */ [[nodiscard]] auto get_stripe_size_bytes() const { return _stripe_size_bytes; } /** * @brief Returns maximum stripe size, in rows. + * + * @return Maximum stripe size, in rows */ [[nodiscard]] auto get_stripe_size_rows() const { return _stripe_size_rows; } /** * @brief Returns the row index stride.
+ * + * @return Row index stride */ auto get_row_index_stride() const { @@ -823,11 +889,15 @@ class chunked_orc_writer_options { /** * @brief Returns associated metadata. + * + * @return Associated metadata */ [[nodiscard]] table_input_metadata const* get_metadata() const { return _metadata; } /** * @brief Returns Key-Value footer metadata information. + * + * @return Key-Value footer metadata information */ [[nodiscard]] std::map const& get_key_value_metadata() const { @@ -839,7 +909,7 @@ class chunked_orc_writer_options { /** * @brief Sets compression type. * - * @param comp The compression type to use. + * @param comp The compression type to use */ void set_compression(compression_type comp) { _compression = comp; } @@ -851,12 +921,14 @@ class chunked_orc_writer_options { * - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe. * - cudf::io::ORC_STATISTICS_ROWGROUP: Statistics are collected for each ORC row group. * - * @param val Frequency of statistics collection. + * @param val Frequency of statistics collection */ void enable_statistics(statistics_freq val) { _stats_freq = val; } /** * @brief Sets the maximum stripe size, in bytes. + * + * @param size_bytes Maximum stripe size, in bytes to be set */ void set_stripe_size_bytes(size_t size_bytes) { @@ -869,6 +941,8 @@ class chunked_orc_writer_options { * * If the stripe size is smaller that the row group size, row group size will be reduced to math * the stripe size. + * + * @param size_rows Maximum stripe size, in rows to be set */ void set_stripe_size_rows(size_type size_rows) { @@ -880,6 +954,8 @@ class chunked_orc_writer_options { * @brief Sets the row index stride. * * Rounded down to a multiple of 8. + * + * @param stride Row index stride to be set */ void set_row_index_stride(size_type stride) { @@ -890,7 +966,7 @@ class chunked_orc_writer_options { /** * @brief Sets associated metadata. * - * @param meta Associated metadata. 
+ * @param meta Associated metadata */ void metadata(table_input_metadata const* meta) { _metadata = meta; } @@ -905,6 +981,9 @@ class chunked_orc_writer_options { } }; +/** + * @brief Builds settings to use for `write_orc_chunked()`. + */ class chunked_orc_writer_options_builder { chunked_orc_writer_options options; @@ -919,15 +998,15 @@ class chunked_orc_writer_options_builder { /** * @brief Constructor from sink and table. * - * @param sink The sink used for writer output. + * @param sink The sink used for writer output */ explicit chunked_orc_writer_options_builder(sink_info const& sink) : options{sink} {} /** * @brief Sets compression type. * - * @param comp The compression type to use. - * @return this for chaining. + * @param comp The compression type to use + * @return this for chaining */ chunked_orc_writer_options_builder& compression(compression_type comp) { @@ -943,8 +1022,8 @@ class chunked_orc_writer_options_builder { * - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe. * - cudf::io::ORC_STATISTICS_ROWGROUP: Statistics are collected for each ORC row group. * - * @param val Frequency of statistics collection. - * @return this for chaining. + * @param val Frequency of statistics collection + * @return this for chaining */ chunked_orc_writer_options_builder& enable_statistics(statistics_freq val) { @@ -956,7 +1035,7 @@ class chunked_orc_writer_options_builder { * @brief Sets the maximum stripe size, in bytes. * * @param val maximum stripe size - * @return this for chaining. + * @return this for chaining */ chunked_orc_writer_options_builder& stripe_size_bytes(size_t val) { @@ -968,7 +1047,7 @@ class chunked_orc_writer_options_builder { * @brief Sets the maximum number of rows in output stripes. * * @param val maximum number or rows - * @return this for chaining. 
+ * @return this for chaining */ chunked_orc_writer_options_builder& stripe_size_rows(size_type val) { @@ -980,7 +1059,7 @@ class chunked_orc_writer_options_builder { * @brief Sets the row index stride. * * @param val new row index stride - * @return this for chaining. + * @return this for chaining */ chunked_orc_writer_options_builder& row_index_stride(size_type val) { @@ -991,8 +1070,8 @@ class chunked_orc_writer_options_builder { /** * @brief Sets associated metadata. * - * @param meta Associated metadata. - * @return this for chaining. + * @param meta Associated metadata + * @return this for chaining */ chunked_orc_writer_options_builder& metadata(table_input_metadata const* meta) { @@ -1004,7 +1083,7 @@ class chunked_orc_writer_options_builder { * @brief Sets Key-Value footer metadata. * * @param metadata Key-Value footer metadata - * @return this for chaining. + * @return this for chaining */ chunked_orc_writer_options_builder& key_value_metadata( std::map metadata) @@ -1022,6 +1101,8 @@ class chunked_orc_writer_options_builder { * @brief move chunked_orc_writer_options member once it's built. * * This has been added since Cython does not support overloading of conversion operators. + * + * @return Built `chunked_orc_writer_options` object's r-value reference */ chunked_orc_writer_options&& build() { return std::move(options); } }; @@ -1077,7 +1158,7 @@ class orc_chunked_writer { */ void close(); - // Unique pointer to impl writer class + /// Unique pointer to impl writer class std::unique_ptr writer; }; diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index 807fab2e85c..e5b89cc0f91 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,9 +40,9 @@ namespace io { * contains one element per stripe, where each element contains column statistics for each column. */ struct raw_orc_statistics { - std::vector column_names; - std::vector file_stats; - std::vector> stripes_stats; + std::vector column_names; ///< Column names + std::vector file_stats; ///< File-level statistics for each column + std::vector> stripes_stats; ///< Stripe-level statistics for each column }; /** @@ -74,8 +74,8 @@ using no_statistics = std::monostate; */ template struct minmax_statistics { - std::optional minimum; - std::optional maximum; + std::optional minimum; ///< Minimum value + std::optional maximum; ///< Maximum value }; /** @@ -85,7 +85,7 @@ struct minmax_statistics { */ template struct sum_statistics { - std::optional sum; + std::optional sum; ///< Sum of values in column }; /** @@ -116,7 +116,7 @@ struct string_statistics : minmax_statistics, sum_statistics count; + std::vector count; ///< Count of `false` and `true` values }; /** @@ -144,8 +144,8 @@ using binary_statistics = sum_statistics; * the UNIX epoch. The `minimum_utc` and `maximum_utc` are the same values adjusted to UTC. */ struct timestamp_statistics : minmax_statistics { - std::optional minimum_utc; - std::optional maximum_utc; + std::optional minimum_utc; ///< minimum in milliseconds + std::optional maximum_utc; ///< maximum in milliseconds }; namespace orc { @@ -162,7 +162,7 @@ struct column_statistics; * have additional statistics, accessible through `type_specific_stats` accessor. 
*/ struct column_statistics { - std::optional number_of_values; + std::optional number_of_values; ///< number of statistics std::variant - type_specific_stats; + type_specific_stats; ///< type-specific statistics + /** + * @brief Construct a new column statistics object + * + * @param detail_statistics The statistics to initialize the object with + */ column_statistics(cudf::io::orc::column_statistics&& detail_statistics); }; @@ -185,9 +190,9 @@ struct column_statistics { * column. */ struct parsed_orc_statistics { - std::vector column_names; - std::vector file_stats; - std::vector> stripes_stats; + std::vector column_names; ///< column names + std::vector file_stats; ///< file-level statistics + std::vector> stripes_stats; ///< stripe-level statistics }; /** diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index d6812559e38..27821fe5526 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -37,14 +37,11 @@ namespace io { * @file */ -constexpr size_t default_row_group_size_bytes = 128 * 1024 * 1024; // 128MB -constexpr size_type default_row_group_size_rows = 1000000; -constexpr size_t default_max_page_size_bytes = 512 * 1024; -constexpr size_type default_max_page_size_rows = 20000; +constexpr size_t default_row_group_size_bytes = 128 * 1024 * 1024; ///< 128MB per row group +constexpr size_type default_row_group_size_rows = 1000000; ///< 1 million rows per row group +constexpr size_t default_max_page_size_bytes = 512 * 1024; ///< 512KB per page +constexpr size_type default_max_page_size_rows = 20000; ///< 20k rows per page -/** - * @brief Builds parquet_reader_options to use for `read_parquet()`. - */ class parquet_reader_options_builder; /** @@ -73,7 +70,7 @@ class parquet_reader_options { /** * @brief Constructor from source info. * - * @param src source information used to read parquet file. 
+ * @param src source information used to read parquet file */ explicit parquet_reader_options(source_info const& src) : _source(src) {} @@ -90,19 +87,23 @@ class parquet_reader_options { /** * @brief Creates a parquet_reader_options_builder which will build parquet_reader_options. * - * @param src Source information to read parquet file. - * @return Builder to build reader options. + * @param src Source information to read parquet file + * @return Builder to build reader options */ static parquet_reader_options_builder builder(source_info const& src); /** * @brief Returns source info. + * + * @return Source info */ [[nodiscard]] source_info const& get_source() const { return _source; } /** * @brief Returns true/false depending on whether strings should be converted to categories or * not. + * + * @return `true` if strings should be converted to categories */ [[nodiscard]] bool is_enabled_convert_strings_to_categories() const { @@ -111,45 +112,57 @@ class parquet_reader_options { /** * @brief Returns true/false depending whether to use pandas metadata or not while reading. + * + * @return `true` if pandas metadata is used while reading */ [[nodiscard]] bool is_enabled_use_pandas_metadata() const { return _use_pandas_metadata; } /** * @brief Returns number of rows to skip from the start. + * + * @return Number of rows to skip from the start */ [[nodiscard]] size_type get_skip_rows() const { return _skip_rows; } /** * @brief Returns number of rows to read. + * + * @return Number of rows to read */ [[nodiscard]] size_type get_num_rows() const { return _num_rows; } /** * @brief Returns names of column to be read. + * + * @return Names of column to be read */ [[nodiscard]] std::vector const& get_columns() const { return _columns; } /** * @brief Returns list of individual row groups to be read. 
+ * + * @return List of individual row groups to be read */ std::vector> const& get_row_groups() const { return _row_groups; } /** * @brief Returns timestamp type used to cast timestamp columns. + * + * @return Timestamp type used to cast timestamp columns */ data_type get_timestamp_type() const { return _timestamp_type; } /** * @brief Sets names of the columns to be read. * - * @param col_names Vector of column names. + * @param col_names Vector of column names */ void set_columns(std::vector col_names) { _columns = std::move(col_names); } /** * @brief Sets vector of individual row groups to read. * - * @param row_groups Vector of row groups to read. + * @param row_groups Vector of row groups to read */ void set_row_groups(std::vector> row_groups) { @@ -163,21 +176,21 @@ class parquet_reader_options { /** * @brief Sets to enable/disable conversion of strings to categories. * - * @param val Boolean value to enable/disable conversion of string columns to categories. + * @param val Boolean value to enable/disable conversion of string columns to categories */ void enable_convert_strings_to_categories(bool val) { _convert_strings_to_categories = val; } /** * @brief Sets to enable/disable use of pandas metadata to read. * - * @param val Boolean value whether to use pandas metadata. + * @param val Boolean value whether to use pandas metadata */ void enable_use_pandas_metadata(bool val) { _use_pandas_metadata = val; } /** * @brief Sets number of rows to skip. * - * @param val Number of rows to skip from start. + * @param val Number of rows to skip from start */ void set_skip_rows(size_type val) { @@ -191,7 +204,7 @@ class parquet_reader_options { /** * @brief Sets number of rows to read. * - * @param val Number of rows to read after skip. + * @param val Number of rows to read after skip */ void set_num_rows(size_type val) { @@ -205,11 +218,14 @@ class parquet_reader_options { /** * @brief Sets timestamp_type used to cast timestamp columns. 
* - * @param type The timestamp data_type to which all timestamp columns need to be cast. + * @param type The timestamp data_type to which all timestamp columns need to be cast */ void set_timestamp_type(data_type type) { _timestamp_type = type; } }; +/** + * @brief Builds parquet_reader_options to use for `read_parquet()`. + */ class parquet_reader_options_builder { parquet_reader_options options; @@ -224,15 +240,15 @@ class parquet_reader_options_builder { /** * @brief Constructor from source info. * - * @param src The source information used to read parquet file. + * @param src The source information used to read parquet file */ explicit parquet_reader_options_builder(source_info const& src) : options(src) {} /** * @brief Sets names of the columns to be read. * - * @param col_names Vector of column names. - * @return this for chaining. + * @param col_names Vector of column names + * @return this for chaining */ parquet_reader_options_builder& columns(std::vector col_names) { @@ -243,8 +259,8 @@ class parquet_reader_options_builder { /** * @brief Sets vector of individual row groups to read. * - * @param row_groups Vector of row groups to read. - * @return this for chaining. + * @param row_groups Vector of row groups to read + * @return this for chaining */ parquet_reader_options_builder& row_groups(std::vector> row_groups) { @@ -255,8 +271,8 @@ class parquet_reader_options_builder { /** * @brief Sets enable/disable conversion of strings to categories. * - * @param val Boolean value to enable/disable conversion of string columns to categories. - * @return this for chaining. + * @param val Boolean value to enable/disable conversion of string columns to categories + * @return this for chaining */ parquet_reader_options_builder& convert_strings_to_categories(bool val) { @@ -267,8 +283,8 @@ class parquet_reader_options_builder { /** * @brief Sets to enable/disable use of pandas metadata to read. * - * @param val Boolean value whether to use pandas metadata. 
- * @return this for chaining. + * @param val Boolean value whether to use pandas metadata + * @return this for chaining */ parquet_reader_options_builder& use_pandas_metadata(bool val) { @@ -279,8 +295,8 @@ class parquet_reader_options_builder { /** * @brief Sets number of rows to skip. * - * @param val Number of rows to skip from start. - * @return this for chaining. + * @param val Number of rows to skip from start + * @return this for chaining */ parquet_reader_options_builder& skip_rows(size_type val) { @@ -291,8 +307,8 @@ class parquet_reader_options_builder { /** * @brief Sets number of rows to read. * - * @param val Number of rows to read after skip. - * @return this for chaining. + * @param val Number of rows to read after skip + * @return this for chaining */ parquet_reader_options_builder& num_rows(size_type val) { @@ -303,8 +319,8 @@ class parquet_reader_options_builder { /** * @brief timestamp_type used to cast timestamp columns. * - * @param type The timestamp data_type to which all timestamp columns need to be cast. - * @return this for chaining. + * @param type The timestamp data_type to which all timestamp columns need to be cast + * @return this for chaining */ parquet_reader_options_builder& timestamp_type(data_type type) { @@ -321,6 +337,8 @@ class parquet_reader_options_builder { * @brief move parquet_reader_options member once it's built. * * This has been added since Cython does not support overloading of conversion operators. + * + * @return Built `parquet_reader_options` object's r-value reference */ parquet_reader_options&& build() { return std::move(options); } }; @@ -352,9 +370,6 @@ table_with_metadata read_parquet( * @file */ -/** - * @brief Class to build `parquet_writer_options`. - */ class parquet_writer_options_builder; /** @@ -392,15 +407,15 @@ class parquet_writer_options { /** * @brief Constructor from sink and table. * - * @param sink The sink used for writer output. - * @param table Table to be written to output. 
+ * @param sink The sink used for writer output + * @param table Table to be written to output */ explicit parquet_writer_options(sink_info const& sink, table_view const& table) : _sink(sink), _table(table) { } - friend class parquet_writer_options_builder; + friend parquet_writer_options_builder; public: /** @@ -413,52 +428,66 @@ class parquet_writer_options { /** * @brief Create builder to create `parquet_writer_options`. * - * @param sink The sink used for writer output. - * @param table Table to be written to output. + * @param sink The sink used for writer output + * @param table Table to be written to output * - * @return Builder to build parquet_writer_options. + * @return Builder to build parquet_writer_options */ static parquet_writer_options_builder builder(sink_info const& sink, table_view const& table); /** * @brief Create builder to create `parquet_writer_options`. * - * @return parquet_writer_options_builder. + * @return parquet_writer_options_builder */ static parquet_writer_options_builder builder(); /** * @brief Returns sink info. + * + * @return Sink info */ [[nodiscard]] sink_info const& get_sink() const { return _sink; } /** * @brief Returns compression format used. + * + * @return Compression format */ [[nodiscard]] compression_type get_compression() const { return _compression; } /** * @brief Returns level of statistics requested in output file. + * + * @return level of statistics requested in output file */ [[nodiscard]] statistics_freq get_stats_level() const { return _stats_level; } /** * @brief Returns table_view. + * + * @return Table view */ [[nodiscard]] table_view get_table() const { return _table; } /** * @brief Returns partitions. + * + * @return Partitions */ [[nodiscard]] std::vector const& get_partitions() const { return _partitions; } /** * @brief Returns associated metadata. 
+ * + * @return Associated metadata */ [[nodiscard]] table_input_metadata const* get_metadata() const { return _metadata; } /** * @brief Returns Key-Value footer metadata information. + * + * @return Key-Value footer metadata information */ std::vector> const& get_key_value_metadata() const { @@ -467,11 +496,15 @@ class parquet_writer_options { /** * @brief Returns `true` if timestamps will be written as INT96 + * + * @return `true` if timestamps will be written as INT96 */ bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } /** * @brief Returns Column chunks file paths to be set in the raw output metadata. + * + * @return Column chunks file paths to be set in the raw output metadata */ std::vector const& get_column_chunks_file_paths() const { @@ -480,11 +513,15 @@ class parquet_writer_options { /** * @brief Returns maximum row group size, in bytes. + * + * @return Maximum row group size, in bytes */ auto get_row_group_size_bytes() const { return _row_group_size_bytes; } /** * @brief Returns maximum row group size, in rows. + * + * @return Maximum row group size, in rows */ auto get_row_group_size_rows() const { return _row_group_size_rows; } @@ -522,7 +559,7 @@ class parquet_writer_options { /** * @brief Sets metadata. * - * @param metadata Associated metadata. + * @param metadata Associated metadata */ void set_metadata(table_input_metadata const* metadata) { _metadata = metadata; } @@ -541,14 +578,14 @@ class parquet_writer_options { /** * @brief Sets the level of statistics. * - * @param sf Level of statistics requested in the output file. + * @param sf Level of statistics requested in the output file */ void set_stats_level(statistics_freq sf) { _stats_level = sf; } /** * @brief Sets compression type. * - * @param compression The compression type to use. 
+ * @param compression The compression type to use */ void set_compression(compression_type compression) { _compression = compression; } @@ -575,6 +612,8 @@ class parquet_writer_options { /** * @brief Sets the maximum row group size, in bytes. + * + * @param size_bytes Maximum row group size, in bytes to set */ void set_row_group_size_bytes(size_t size_bytes) { @@ -586,6 +625,8 @@ class parquet_writer_options { /** * @brief Sets the maximum row group size, in rows. + * + * @param size_rows Maximum row group size, in rows to set */ void set_row_group_size_rows(size_type size_rows) { @@ -616,6 +657,9 @@ class parquet_writer_options { } }; +/** + * @brief Class to build `parquet_writer_options`. + */ class parquet_writer_options_builder { parquet_writer_options options; @@ -630,8 +674,8 @@ class parquet_writer_options_builder { /** * @brief Constructor from sink and table. * - * @param sink The sink used for writer output. - * @param table Table to be written to output. + * @param sink The sink used for writer output + * @param table Table to be written to output */ explicit parquet_writer_options_builder(sink_info const& sink, table_view const& table) : options(sink, table) @@ -643,7 +687,7 @@ class parquet_writer_options_builder { * * @param partitions Partitions of input table in {start_row, num_rows} pairs. If specified, must * be same size as number of sinks in sink_info - * @return this for chaining. + * @return this for chaining */ parquet_writer_options_builder& partitions(std::vector partitions) { @@ -656,8 +700,8 @@ class parquet_writer_options_builder { /** * @brief Sets metadata in parquet_writer_options. * - * @param metadata Associated metadata. - * @return this for chaining. + * @param metadata Associated metadata + * @return this for chaining */ parquet_writer_options_builder& metadata(table_input_metadata const* metadata) { @@ -669,7 +713,7 @@ class parquet_writer_options_builder { * @brief Sets Key-Value footer metadata in parquet_writer_options. 
* * @param metadata Key-Value footer metadata - * @return this for chaining. + * @return this for chaining */ parquet_writer_options_builder& key_value_metadata( std::vector> metadata) @@ -683,8 +727,8 @@ class parquet_writer_options_builder { /** * @brief Sets the level of statistics in parquet_writer_options. * - * @param sf Level of statistics requested in the output file. - * @return this for chaining. + * @param sf Level of statistics requested in the output file + * @return this for chaining */ parquet_writer_options_builder& stats_level(statistics_freq sf) { @@ -695,8 +739,8 @@ class parquet_writer_options_builder { /** * @brief Sets compression type in parquet_writer_options. * - * @param compression The compression type to use. - * @return this for chaining. + * @param compression The compression type to use + * @return this for chaining */ parquet_writer_options_builder& compression(compression_type compression) { @@ -709,7 +753,7 @@ class parquet_writer_options_builder { * * @param file_paths Vector of Strings which indicates file path. Must be same size as number of * data sinks - * @return this for chaining. + * @return this for chaining */ parquet_writer_options_builder& column_chunks_file_paths(std::vector file_paths) { @@ -723,7 +767,7 @@ class parquet_writer_options_builder { * @brief Sets the maximum row group size, in bytes. * * @param val maximum row group size - * @return this for chaining. + * @return this for chaining */ parquet_writer_options_builder& row_group_size_bytes(size_t val) { @@ -734,8 +778,8 @@ class parquet_writer_options_builder { /** * @brief Sets the maximum number of rows in output row groups. * - * @param val maximum number of rows - * @return this for chaining. + * @param val maximum number of rows + * @return this for chaining */ parquet_writer_options_builder& row_group_size_rows(size_type val) { @@ -749,7 +793,7 @@ class parquet_writer_options_builder { * bytes, and will be adjusted to match if it is.
* * @param val maximum page size - * @return this for chaining. + * @return this for chaining */ parquet_writer_options_builder& max_page_size_bytes(size_t val) { @@ -762,7 +806,7 @@ class parquet_writer_options_builder { * Cannot be larger than the row group size in rows, and will be adjusted to match if it is. * * @param val maximum rows per page - * @return this for chaining. + * @return this for chaining */ parquet_writer_options_builder& max_page_size_rows(size_type val) { @@ -773,8 +817,8 @@ class parquet_writer_options_builder { /** * @brief Sets whether int96 timestamps are written or not in parquet_writer_options. * - * @param enabled Boolean value to enable/disable int96 timestamps. - * @return this for chaining. + * @param enabled Boolean value to enable/disable int96 timestamps + * @return this for chaining */ parquet_writer_options_builder& int96_timestamps(bool enabled) { @@ -791,6 +835,8 @@ class parquet_writer_options_builder { * @brief move parquet_writer_options member once it's built. * * This has been added since Cython does not support overloading of conversion operators. + * + * @return Built `parquet_writer_options` object's r-value reference */ parquet_writer_options&& build() { return std::move(options); } }; @@ -805,8 +851,8 @@ class parquet_writer_options_builder { * cudf::io::write_parquet(options); * @endcode * - * @param options Settings for controlling writing behavior. - * @param mr Device memory resource to use for device memory allocation. + * @param options Settings for controlling writing behavior + * @param mr Device memory resource to use for device memory allocation * * @return A blob that contains the file metadata (parquet FileMetadata thrift message) if * requested in parquet_writer_options (empty blob otherwise). @@ -822,15 +868,12 @@ std::unique_ptr> write_parquet( * * @ingroup io_writers * - * @param[in] metadata_list List of input file metadata. 
- * @return A parquet-compatible blob that contains the data for all row groups in the list. + * @param[in] metadata_list List of input file metadata + * @return A parquet-compatible blob that contains the data for all row groups in the list */ std::unique_ptr> merge_row_group_metadata( const std::vector>>& metadata_list); -/** - * @brief Builds options for chunked_parquet_writer_options. - */ class chunked_parquet_writer_options_builder; /** @@ -862,7 +905,7 @@ class chunked_parquet_writer_options { /** * @brief Constructor from sink. * - * @param sink Sink used for writer output. + * @param sink Sink used for writer output */ explicit chunked_parquet_writer_options(sink_info const& sink) : _sink(sink) {} @@ -878,26 +921,36 @@ class chunked_parquet_writer_options { /** * @brief Returns sink info. + * + * @return Sink info */ [[nodiscard]] sink_info const& get_sink() const { return _sink; } /** * @brief Returns compression format used. + * + * @return Compression format */ [[nodiscard]] compression_type get_compression() const { return _compression; } /** * @brief Returns level of statistics requested in output file. + * + * @return Level of statistics requested in output file */ [[nodiscard]] statistics_freq get_stats_level() const { return _stats_level; } /** * @brief Returns metadata information. + * + * @return Metadata information */ [[nodiscard]] table_input_metadata const* get_metadata() const { return _metadata; } /** * @brief Returns Key-Value footer metadata information. + * + * @return Key-Value footer metadata information */ std::vector> const& get_key_value_metadata() const { @@ -906,16 +959,22 @@ class chunked_parquet_writer_options { /** * @brief Returns `true` if timestamps will be written as INT96 + * + * @return `true` if timestamps will be written as INT96 */ bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } /** * @brief Returns maximum row group size, in bytes. 
+ * + * @return Maximum row group size, in bytes */ auto get_row_group_size_bytes() const { return _row_group_size_bytes; } /** * @brief Returns maximum row group size, in rows. + * + * @return Maximum row group size, in rows */ auto get_row_group_size_rows() const { return _row_group_size_rows; } @@ -940,7 +999,7 @@ class chunked_parquet_writer_options { /** * @brief Sets metadata. * - * @param metadata Associated metadata. + * @param metadata Associated metadata */ void set_metadata(table_input_metadata const* metadata) { _metadata = metadata; } @@ -959,14 +1018,14 @@ class chunked_parquet_writer_options { /** * @brief Sets the level of statistics in parquet_writer_options. * - * @param sf Level of statistics requested in the output file. + * @param sf Level of statistics requested in the output file */ void set_stats_level(statistics_freq sf) { _stats_level = sf; } /** * @brief Sets compression type. * - * @param compression The compression type to use. + * @param compression The compression type to use */ void set_compression(compression_type compression) { _compression = compression; } @@ -980,6 +1039,8 @@ class chunked_parquet_writer_options { /** * @brief Sets the maximum row group size, in bytes. + * + * @param size_bytes Maximum row group size, in bytes to set */ void set_row_group_size_bytes(size_t size_bytes) { @@ -991,6 +1052,8 @@ class chunked_parquet_writer_options { /** * @brief Sets the maximum row group size, in rows. + * + * @param size_rows The maximum row group size, in rows to set */ void set_row_group_size_rows(size_type size_rows) { @@ -1023,13 +1086,16 @@ class chunked_parquet_writer_options { /** * @brief creates builder to build chunked_parquet_writer_options. * - * @param sink sink to use for writer output. + * @param sink sink to use for writer output * - * @return Builder to build `chunked_parquet_writer_options`. 
+ * @return Builder to build `chunked_parquet_writer_options` */ static chunked_parquet_writer_options_builder builder(sink_info const& sink); }; +/** + * @brief Builds options for chunked_parquet_writer_options. + */ class chunked_parquet_writer_options_builder { chunked_parquet_writer_options options; @@ -1044,15 +1110,15 @@ class chunked_parquet_writer_options_builder { /** * @brief Constructor from sink. * - * @param sink The sink used for writer output. + * @param sink The sink used for writer output */ chunked_parquet_writer_options_builder(sink_info const& sink) : options(sink){}; /** * @brief Sets metadata to chunked_parquet_writer_options. * - * @param metadata Associated metadata. - * @return this for chaining. + * @param metadata Associated metadata + * @return this for chaining */ chunked_parquet_writer_options_builder& metadata(table_input_metadata const* metadata) { @@ -1064,7 +1130,7 @@ class chunked_parquet_writer_options_builder { * @brief Sets Key-Value footer metadata in parquet_writer_options. * * @param metadata Key-Value footer metadata - * @return this for chaining. + * @return this for chaining */ chunked_parquet_writer_options_builder& key_value_metadata( std::vector> metadata) @@ -1078,8 +1144,8 @@ class chunked_parquet_writer_options_builder { /** * @brief Sets Sets the level of statistics in chunked_parquet_writer_options. * - * @param sf Level of statistics requested in the output file. - * @return this for chaining. + * @param sf Level of statistics requested in the output file + * @return this for chaining */ chunked_parquet_writer_options_builder& stats_level(statistics_freq sf) { @@ -1090,8 +1156,8 @@ class chunked_parquet_writer_options_builder { /** * @brief Sets compression type to chunked_parquet_writer_options. * - * compression The compression type to use. - * @return this for chaining. 
+ * @param compression The compression type to use + * @return this for chaining */ chunked_parquet_writer_options_builder& compression(compression_type compression) { @@ -1105,8 +1171,8 @@ class chunked_parquet_writer_options_builder { * not an internal type for cudf, it needs to be written for backwards * compatibility reasons. * - * @param enabled Boolean value to enable/disable int96 timestamps. - * @return this for chaining. + * @param enabled Boolean value to enable/disable int96 timestamps + * @return this for chaining */ chunked_parquet_writer_options_builder& int96_timestamps(bool enabled) { @@ -1118,7 +1184,7 @@ class chunked_parquet_writer_options_builder { * @brief Sets the maximum row group size, in bytes. * * @param val maximum row group size - * @return this for chaining. + * @return this for chaining */ chunked_parquet_writer_options_builder& row_group_size_bytes(size_t val) { @@ -1129,8 +1195,8 @@ class chunked_parquet_writer_options_builder { /** * @brief Sets the maximum number of rows in output row groups. * - * @param val maximum number of rows - * @return this for chaining. + * @param val maximum number of rows + * @return this for chaining */ chunked_parquet_writer_options_builder& row_group_size_rows(size_type val) { @@ -1144,7 +1210,7 @@ class chunked_parquet_writer_options_builder { * bytes, and will be adjusted to match if it is. * * @param val maximum page size - * @return this for chaining. + * @return this for chaining */ chunked_parquet_writer_options_builder& max_page_size_bytes(size_t val) { @@ -1157,7 +1223,7 @@ class chunked_parquet_writer_options_builder { * Cannot be larger than the row group size in rows, and will be adjusted to match if it is. * * @param val maximum rows per page - * @return this for chaining. 
+ * @return this for chaining */ chunked_parquet_writer_options_builder& max_page_size_rows(size_type val) { @@ -1174,6 +1240,8 @@ class chunked_parquet_writer_options_builder { * @brief move chunked_parquet_writer_options member once it's is built. * * This has been added since Cython does not support overloading of conversion operators. + * + * @return Built `chunked_parquet_writer_options` object's r-value reference */ chunked_parquet_writer_options&& build() { return std::move(options); } }; @@ -1239,7 +1307,7 @@ class parquet_chunked_writer { std::unique_ptr> close( std::vector const& column_chunks_file_paths = {}); - // Unique pointer to impl writer class + /// Unique pointer to impl writer class std::unique_ptr writer; };