From c87073737b6ffef9715549a199499b92630e8e5f Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Mon, 29 Apr 2024 11:32:01 -0400 Subject: [PATCH 001/105] MINOR: [R] refactor arrow_mask to include aggregations list (#41414) ### Rationale for this change Keeping the `..aggregations` list in parent.frame felt a little wrong. As we're starting to use this in more places (like mutate in #41350, and potentially more places), I wanted to try to improve this. I tried a bunch of things before to put it somewhere better (like in the mask) but failed. Finally I found one that worked. ### What changes are included in this PR? Just a refactor ### Are these changes tested? Existing tests pass. ### Are there any user-facing changes? Nope. --- r/R/dplyr-eval.R | 8 +++----- r/R/dplyr-funcs-agg.R | 23 ++++++++++++----------- r/R/dplyr-summarize.R | 41 ++++++++++++++++++----------------------- 3 files changed, 33 insertions(+), 39 deletions(-) diff --git a/r/R/dplyr-eval.R b/r/R/dplyr-eval.R index 3aaa29696b8c8..ff1619ce944d0 100644 --- a/r/R/dplyr-eval.R +++ b/r/R/dplyr-eval.R @@ -125,13 +125,9 @@ arrow_mask <- function(.data, aggregation = FALSE) { f_env <- new_environment(.cache$functions) if (aggregation) { - # Add the aggregation functions to the environment, and set the enclosing - # environment to the parent frame so that, when called from summarize_eval(), - # they can reference and assign into `..aggregations` defined there. - pf <- parent.frame() + # Add the aggregation functions to the environment. for (f in names(agg_funcs)) { f_env[[f]] <- agg_funcs[[f]] - environment(f_env[[f]]) <- pf } } else { # Add functions that need to error hard and clear. @@ -156,6 +152,8 @@ arrow_mask <- function(.data, aggregation = FALSE) { # TODO: figure out what rlang::as_data_pronoun does/why we should use it # (because if we do we get `Error: Can't modify the data pronoun` in mutate()) out$.data <- .data$selected_columns + # Add the aggregations list to collect any that get pulled out when evaluating + out$.aggregations <- empty_named_list() out } diff --git a/r/R/dplyr-funcs-agg.R b/r/R/dplyr-funcs-agg.R index ab1df1d2f15a5..d84f8f28f0dff 100644 --- a/r/R/dplyr-funcs-agg.R +++ b/r/R/dplyr-funcs-agg.R @@ -17,7 +17,7 @@ # Aggregation functions # -# These all insert into an ..aggregations list (in a parent frame) a list containing: +# These all insert into an .aggregations list in the mask, a list containing: # @param fun string function name # @param data list of 0 or more Expressions # @param options list of function options, as passed to call_function @@ -154,11 +154,11 @@ register_bindings_aggregate <- function() { set_agg <- function(...) { agg_data <- list2(...) - # Find the environment where ..aggregations is stored + # Find the environment where .aggregations is stored target <- find_aggregations_env() - aggs <- get("..aggregations", target) + aggs <- get(".aggregations", target) lapply(agg_data[["data"]], function(expr) { - # If any of the fields referenced in the expression are in ..aggregations, + # If any of the fields referenced in the expression are in .aggregations, # then we can't aggregate over them. # This is mainly for combinations of dataset columns and aggregations, # like sum(x - mean(x)), i.e. window functions. @@ -169,23 +169,24 @@ set_agg <- function(...) 
{ } }) - # Record the (fun, data, options) in ..aggregations + # Record the (fun, data, options) in .aggregations # and return a FieldRef pointing to it tmpname <- paste0("..temp", length(aggs)) aggs[[tmpname]] <- agg_data - assign("..aggregations", aggs, envir = target) + assign(".aggregations", aggs, envir = target) Expression$field_ref(tmpname) } find_aggregations_env <- function() { - # Find the environment where ..aggregations is stored, + # Find the environment where .aggregations is stored, # it's in parent.env of something in the call stack - for (f in sys.frames()) { - if (exists("..aggregations", envir = f)) { - return(f) + n <- 1 + while (TRUE) { + if (exists(".aggregations", envir = caller_env(n))) { + return(caller_env(n)) } + n <- n + 1 } - stop("Could not find ..aggregations") } ensure_one_arg <- function(args, fun) { diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 5bb81dc2b34fc..56de14db6dd44 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -80,34 +80,32 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { # ExecNode), and in the expressions, replace them with FieldRefs so that # further operations can happen (in what will become a ProjectNode that works # on the result of the Aggregate). - # To do this, we create a list in this function scope, and in arrow_mask(), - # and we make sure this environment here is the parent env of the binding - # functions, so that when they receive an expression, they can pull out - # aggregations and insert them into the list, which they can find because it - # is in the parent env. + # To do this, arrow_mask() includes a list called .aggregations, + # and the aggregation functions will pull out those terms and insert into + # that list. # nolint end - ..aggregations <- empty_named_list() - - # We'll collect any transformations after the aggregation here - ..post_mutate <- empty_named_list() mask <- arrow_mask(.data, aggregation = TRUE) + # We'll collect any transformations after the aggregation here. + # summarize_eval() returns NULL when the outer expression is an aggregation, + # i.e. there is no projection to do after + post_mutate <- empty_named_list() for (i in seq_along(exprs)) { # Iterate over the indices and not the names because names may be repeated # (which overwrites the previous name) name <- names(exprs)[i] - ..post_mutate[[name]] <- summarize_eval(name, exprs[[i]], mask) + post_mutate[[name]] <- summarize_eval(name, exprs[[i]], mask) } # Apply the results to the .data object. # First, the aggregations - .data$aggregations <- ..aggregations + .data$aggregations <- mask$.aggregations # Then collapse the query so that the resulting query object can have # additional operations applied to it out <- collapse.arrow_dplyr_query(.data) - # Now, add the projections in ..post_mutate (if any) - for (post in names(..post_mutate)) { + # Now, add the projections in post_mutate (if any) + for (post in names(post_mutate)) { # One last check: it's possible that an expression like y - mean(y) would # successfully evaluate, but it's not supported. It gets transformed to: # nolint start @@ -121,7 +119,7 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { # We can tell the expression is invalid if it references fields not in # the schema of the data after summarize(). Evaulating its type will # throw an error if it's invalid. 
- tryCatch(..post_mutate[[post]]$type(out$.data$schema), error = function(e) { + tryCatch(post_mutate[[post]]$type(out$.data$schema), error = function(e) { msg <- paste( "Expression", as_label(exprs[[post]]), "is not a valid aggregation expression or is" @@ -129,7 +127,7 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { arrow_not_supported(msg) }) # If it's valid, add it to the .data object - out$selected_columns[[post]] <- ..post_mutate[[post]] + out$selected_columns[[post]] <- post_mutate[[post]] } # Make sure column order is correct (and also drop ..temp columns) @@ -266,10 +264,10 @@ format_aggregation <- function(x) { # This function evaluates an expression and returns the post-summarize # projection that results, or NULL if there is none because the top-level # expression was an aggregation. Any aggregations are pulled out and collected -# in the ..aggregations list outside this function. +# in the .aggregations list outside this function. summarize_eval <- function(name, quosure, mask) { # Add previous aggregations to the mask, so they can be referenced - for (n in names(get("..aggregations", parent.frame()))) { + for (n in names(mask$.aggregations)) { mask[[n]] <- mask$.data[[n]] <- Expression$field_ref(n) } # Evaluate: @@ -286,14 +284,11 @@ summarize_eval <- function(name, quosure, mask) { # Handle case where outer expr is ..temp field ref. This came from an # aggregation at the top level. So the resulting name should be `name`. # not `..tempN`. Rename the corresponding aggregation. - post_aggs <- get("..aggregations", parent.frame()) result_field_name <- value$field_name - if (result_field_name %in% names(post_aggs)) { + if (result_field_name %in% names(mask$.aggregations)) { # Do this by assigning over `name` in case something else was in `name` - post_aggs[[name]] <- post_aggs[[result_field_name]] - post_aggs[[result_field_name]] <- NULL - # Assign back into the parent environment - assign("..aggregations", post_aggs, parent.frame()) + mask$.aggregations[[name]] <- mask$.aggregations[[result_field_name]] + mask$.aggregations[[result_field_name]] <- NULL # Return NULL because there is no post-mutate projection, it's just # the aggregation return(NULL) From e3db586eb343e80dce58d8cbf6eef91aba14dfff Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Apr 2024 11:33:30 -0400 Subject: [PATCH 002/105] MINOR: [Go] Bump github.com/apache/thrift from 0.19.0 to 0.20.0 in /go (#40777) Bumps [github.com/apache/thrift](https://github.com/apache/thrift) from 0.19.0 to 0.20.0.
Release notes (sourced from github.com/apache/thrift's releases):

**Version 0.20.0**

Please head over to the official release download source: http://thrift.apache.org/download

The assets listed below are added by GitHub based on the release tag and they will therefore not match the checksums published on the Thrift project website.

Changelog (sourced from github.com/apache/thrift's changelog):

**0.20.0**

Known Open Issues (Blocker or Critical)

- THRIFT-3877 - C++ library don't work with HTTP (csharp server, cpp client; need cross test enhancement)
- THRIFT-5468 - Swift service generator doesn't support oneway
- THRIFT-5654 - LNK4042 and LNK2019 in go_validator_generator.cc

Per-component changes (Build Process, C++, Compiler (General), Delphi, Documentation, Erlang, Go, Haxe, Java, netstd) ... (truncated)
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=github.com/apache/thrift&package-manager=go_modules&previous-version=0.19.0&new-version=0.20.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options

You can trigger Dependabot actions by commenting on this PR:

- `@ dependabot rebase` will rebase this PR
- `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@ dependabot merge` will merge this PR after your CI passes on it
- `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@ dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@ dependabot reopen` will reopen this PR if it is closed
- `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
- `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Matt Topol --- go/go.mod | 2 +- go/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/go.mod b/go/go.mod index 972940ee3c299..79c3cc3981231 100644 --- a/go/go.mod +++ b/go/go.mod @@ -21,7 +21,7 @@ go 1.21 require ( github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c github.com/andybalholm/brotli v1.1.0 - github.com/apache/thrift v0.19.0 + github.com/apache/thrift v0.20.0 github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815 github.com/goccy/go-json v0.10.2 github.com/golang/snappy v0.0.4 diff --git a/go/go.sum b/go/go.sum index 0a45cb751f77e..e8c2fde15181a 100644 --- a/go/go.sum +++ b/go/go.sum @@ -8,8 +8,8 @@ github.com/alecthomas/repr v0.2.0 h1:HAzS41CIzNW5syS8Mf9UwXhNH1J9aix/BvDRf1Ml2Yk github.com/alecthomas/repr v0.2.0/go.mod h1:Fr0507jx4eOXV7AlPV6AVZLYrLIuIeSOWtW57eE/O/4= github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= -github.com/apache/thrift v0.19.0 h1:sOqkWPzMj7w6XaYbJQG7m4sGqVolaW/0D28Ln7yPzMk= -github.com/apache/thrift v0.19.0/go.mod h1:SUALL216IiaOw2Oy+5Vs9lboJ/t9g40C+G07Dc0QC1I= +github.com/apache/thrift v0.20.0 h1:631+KvYbsBZxmuJjYwhezVsrfc/TbqtZV4QcxOX1fOI= +github.com/apache/thrift v0.20.0/go.mod h1:hOk1BQqcp2OLzGsyVXdfMk7YFlMxK3aoEVhjD06QhB8= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= From 00df70c6dca6b7cf9a274e131ea88ed588133aec Mon Sep 17 00:00:00 2001 From: Jacob Wujciak-Jens Date: Mon, 29 Apr 2024 19:25:35 +0100 Subject: [PATCH 003/105] GH-41398: [R][CI] Windows job failing after R 4.4 release (#41409) ### Rationale for this change We can't throw warnings on cran. ### What changes are included in this PR? Update function to match changes in libarrow added in GH-39864 ### Are these changes tested? CI ### Are there any user-facing changes? 
No * GitHub Issue: #41398 Authored-by: Jacob Wujciak-Jens Signed-off-by: Jacob Wujciak-Jens --- r/src/extension-impl.cpp | 8 +++++++- r/src/extension.h | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/r/src/extension-impl.cpp b/r/src/extension-impl.cpp index a13b252b2832f..14c771cc98e4f 100644 --- a/r/src/extension-impl.cpp +++ b/r/src/extension-impl.cpp @@ -87,7 +87,9 @@ arrow::Result> RExtensionType::Deserialize( return std::shared_ptr(cloned.release()); } -std::string RExtensionType::ToString() const { +std::string RExtensionType::ToString() const { return ToString(false); } + +std::string RExtensionType::ToString(bool show_metadata) const { arrow::Result result = SafeCallIntoR([&]() { cpp11::environment instance = r6_instance(); cpp11::function instance_ToString(instance["ToString"]); @@ -98,7 +100,11 @@ std::string RExtensionType::ToString() const { // In the event of an error (e.g., we are not on the main thread // and we are not inside RunWithCapturedR()), just call the default method if (!result.ok()) { +#if ARROW_VERSION_MAJOR >= 16 + return ExtensionType::ToString(show_metadata); +#else return ExtensionType::ToString(); +#endif } else { return result.ValueUnsafe(); } diff --git a/r/src/extension.h b/r/src/extension.h index fbd3ad484691a..6e6c6f7c29761 100644 --- a/r/src/extension.h +++ b/r/src/extension.h @@ -52,6 +52,8 @@ class RExtensionType : public arrow::ExtensionType { std::string Serialize() const { return extension_metadata_; } + std::string ToString(bool show_metadata = false) const; + // wrapper for libarrow < 16 std::string ToString() const; cpp11::sexp Convert(const std::shared_ptr& array) const; From 2ef4059566eb3dfc5cceb85d8ea8fa83e33234bb Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Mon, 29 Apr 2024 17:19:41 -0400 Subject: [PATCH 004/105] GH-29537: [R] Support mutate/summarize with implicit join (#41350) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Since it doesn't look like Acero will be getting window functions any time soon, implement support in `mutate()` for transformations that involve aggregations, like `x - mean(x)`, via left_join. ### What changes are included in this PR? Following #41223, I realized I could reuse that evaluation path in `mutate()`. Evaluating expressions accumulates `..aggregations` and `mutate_stuff`; in summarize() we apply aggregations and then mutate on the result. If expressions in the `mutate_stuff` reference columns in the original data and not just the result of aggregations, we reject it. Here, if there are aggregations, we apply them on a copy of the query up to that point, and join the result back onto the query, then apply the mutations on that. It's not a problem for those mutate expressions to reference both columns in the original data and the results of the aggregations because both are present. There are ~three~ two caveats: * Join has non-deterministic order, so while `mutate()` doesn't generally affect row order, if this code path is activated, row order may not be stable. With datasets, it's not guaranteed anyway. * ~Acero's join seems to have a limitation currently where missing values are not joined to each other. If your join key has NA in it, and you do a left_join, your new columns will all be NA, even if there is a corresponding value in the right dataset. 
I made https://github.com/apache/arrow/issues/41358 to address that, and in the meantime, I've added a workaround (https://github.com/apache/arrow/pull/41350/commits/b9de50452e926fe5f39aeb3887a04e203302b960) that's not awesome but has the right behavior.~ Fixed and rebased. * I believe it is possible in dplyr to get this behavior in other verbs: filter, arrange, even summarize. I've only done this for mutate. Are we ok with that? ### Are these changes tested? Yes ### Are there any user-facing changes? This works now: ``` r library(arrow) library(dplyr) mtcars |> arrow_table() |> select(cyl, mpg, hp) |> group_by(cyl) |> mutate(stdize_mpg = (mpg - mean(mpg)) / sd(mpg)) |> collect() #> # A tibble: 32 × 4 #> # Groups: cyl [3] #> cyl mpg hp stdize_mpg #> #> 1 6 21 110 0.865 #> 2 6 21 110 0.865 #> 3 4 22.8 93 -0.857 #> 4 6 21.4 110 1.14 #> 5 8 18.7 175 1.41 #> 6 6 18.1 105 -1.13 #> 7 8 14.3 245 -0.312 #> 8 4 24.4 62 -0.502 #> 9 4 22.8 95 -0.857 #> 10 6 19.2 123 -0.373 #> # ℹ 22 more rows ``` Created on 2024-04-23 with [reprex v2.1.0](https://reprex.tidyverse.org) * GitHub Issue: #29537 --- r/R/arrow-package.R | 5 +-- r/R/dplyr-funcs-agg.R | 1 - r/R/dplyr-funcs-doc.R | 2 +- r/R/dplyr-mutate.R | 39 ++++++++++++------ r/man/acero.Rd | 2 +- r/tests/testthat/test-dataset-dplyr.R | 11 ------ r/tests/testthat/test-dplyr-mutate.R | 57 ++++++++++++--------------- r/vignettes/data_wrangling.Rmd | 28 +------------ 8 files changed, 58 insertions(+), 87 deletions(-) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 7087a40c4903a..44dfbbcd5c7e7 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -48,10 +48,7 @@ supported_dplyr_methods <- list( group_vars = NULL, group_by_drop_default = NULL, ungroup = NULL, - mutate = c( - "window functions (e.g. things that require aggregation within groups)", - "not currently supported" - ), + mutate = NULL, transmute = NULL, arrange = NULL, rename = NULL, diff --git a/r/R/dplyr-funcs-agg.R b/r/R/dplyr-funcs-agg.R index d84f8f28f0dff..9411ce5ce6faf 100644 --- a/r/R/dplyr-funcs-agg.R +++ b/r/R/dplyr-funcs-agg.R @@ -164,7 +164,6 @@ set_agg <- function(...) { # like sum(x - mean(x)), i.e. window functions. # This will reject (sum(sum(x)) as well, but that's not a useful operation. if (any(expr$field_names_in_expression() %in% names(aggs))) { - # TODO: support in ARROW-13926 arrow_not_supported("aggregate within aggregate expression") } }) diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R index fda77bca83fc2..7f0627c33d010 100644 --- a/r/R/dplyr-funcs-doc.R +++ b/r/R/dplyr-funcs-doc.R @@ -53,7 +53,7 @@ #' * [`groups()`][dplyr::groups()] #' * [`inner_join()`][dplyr::inner_join()]: the `copy` argument is ignored #' * [`left_join()`][dplyr::left_join()]: the `copy` argument is ignored -#' * [`mutate()`][dplyr::mutate()]: window functions (e.g. things that require aggregation within groups) not currently supported +#' * [`mutate()`][dplyr::mutate()] #' * [`pull()`][dplyr::pull()]: the `name` argument is not supported; returns an R vector by default but this behavior is deprecated and will return an Arrow [ChunkedArray] in a future release. Provide `as_vector = TRUE/FALSE` to control this behavior, or set `options(arrow.pull_as_vector)` globally. 
#' * [`relocate()`][dplyr::relocate()] #' * [`rename()`][dplyr::rename()] diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 287532dee08a9..880f7799e6316 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -45,17 +45,11 @@ mutate.arrow_dplyr_query <- function(.data, return(out) } - # Restrict the cases we support for now - has_aggregations <- any(unlist(lapply(exprs, all_funs)) %in% names(agg_funcs)) - if (has_aggregations) { - # ARROW-13926 - # mutate() on a grouped dataset does calculations within groups - # This doesn't matter on scalar ops (arithmetic etc.) but it does - # for things with aggregations (e.g. subtracting the mean) - return(abandon_ship(call, .data, "window functions not currently supported in Arrow")) - } - - mask <- arrow_mask(out) + # Create a mask with aggregation functions in it + # If there are any aggregations, we will need to compute them and + # and join the results back in, for "window functions" like x - mean(x) + mask <- arrow_mask(out, aggregation = TRUE) + # Evaluate the mutate expressions results <- list() for (i in seq_along(exprs)) { # Iterate over the indices and not the names because names may be repeated @@ -81,6 +75,24 @@ mutate.arrow_dplyr_query <- function(.data, mask[[new_var]] <- mask$.data[[new_var]] <- results[[new_var]] } + if (length(mask$.aggregations)) { + # Make a copy of .data, do the aggregations on it, and then left_join on + # the group_by variables. + agg_query <- as_adq(.data) + # These may be computed by .by, make sure they're set + agg_query$group_by_vars <- grv + agg_query$aggregations <- mask$.aggregations + agg_query <- collapse.arrow_dplyr_query(agg_query) + if (length(grv)) { + out <- left_join(out, agg_query, by = grv) + } else { + # If there are no group_by vars, add a scalar column to both and join on that + agg_query$selected_columns[["..tempjoin"]] <- Expression$scalar(1L) + out$selected_columns[["..tempjoin"]] <- Expression$scalar(1L) + out <- left_join(out, agg_query, by = "..tempjoin") + } + } + old_vars <- names(out$selected_columns) # Note that this is names(exprs) not names(results): # if results$new_var is NULL, that means we are supposed to remove it @@ -91,6 +103,11 @@ mutate.arrow_dplyr_query <- function(.data, out$selected_columns[[new_var]] <- results[[new_var]] } + # Prune any ..temp columns from the result, which would have come from + # .aggregations + temps <- grepl("^\\.\\.temp", names(out$selected_columns)) + out$selected_columns <- out$selected_columns[!temps] + # Deduplicate new_vars and remove NULL columns from new_vars new_vars <- intersect(union(new_vars, grv), names(out$selected_columns)) diff --git a/r/man/acero.Rd b/r/man/acero.Rd index ca51ef56334eb..9ef9cd7dda6fb 100644 --- a/r/man/acero.Rd +++ b/r/man/acero.Rd @@ -40,7 +40,7 @@ Table into an R \code{tibble}. \item \code{\link[dplyr:group_data]{groups()}} \item \code{\link[dplyr:mutate-joins]{inner_join()}}: the \code{copy} argument is ignored \item \code{\link[dplyr:mutate-joins]{left_join()}}: the \code{copy} argument is ignored -\item \code{\link[dplyr:mutate]{mutate()}}: window functions (e.g. things that require aggregation within groups) not currently supported +\item \code{\link[dplyr:mutate]{mutate()}} \item \code{\link[dplyr:pull]{pull()}}: the \code{name} argument is not supported; returns an R vector by default but this behavior is deprecated and will return an Arrow \link{ChunkedArray} in a future release. 
Provide \code{as_vector = TRUE/FALSE} to control this behavior, or set \code{options(arrow.pull_as_vector)} globally. \item \code{\link[dplyr:relocate]{relocate()}} \item \code{\link[dplyr:rename]{rename()}} diff --git a/r/tests/testthat/test-dataset-dplyr.R b/r/tests/testthat/test-dataset-dplyr.R index b8d93841921d7..1e36ea8bd4966 100644 --- a/r/tests/testthat/test-dataset-dplyr.R +++ b/r/tests/testthat/test-dataset-dplyr.R @@ -163,17 +163,6 @@ See $.data for the source Arrow object", ) }) -test_that("mutate() features not yet implemented", { - ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) - expect_error( - ds %>% - group_by(int) %>% - mutate(avg = mean(int)), - "window functions not currently supported in Arrow\nCall collect() first to pull data into R.", - fixed = TRUE - ) -}) - test_that("filter scalar validation doesn't crash (ARROW-7772)", { ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) expect_error( diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index 0889fffedd508..71c1e52d33c1d 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -378,18 +378,16 @@ test_that("dplyr::mutate's examples", { # The mutate operation may yield different results on grouped # tibbles because the expressions are computed within groups. # The following normalises `mass` by the global average: - # TODO(ARROW-13926): support window functions compare_dplyr_binding( .input %>% select(name, mass, species) %>% mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) %>% collect(), - starwars, - warning = "window function" + starwars ) }) -test_that("Can mutate after group_by as long as there are no aggregations", { +test_that("Can mutate after group_by, including with some aggregations", { compare_dplyr_binding( .input %>% select(int, chr) %>% @@ -417,31 +415,31 @@ test_that("Can mutate after group_by as long as there are no aggregations", { collect(), tbl ) - expect_warning( - tbl %>% - Table$create() %>% + compare_dplyr_binding( + .input %>% select(int, chr) %>% group_by(chr) %>% mutate(avg_int = mean(int)) %>% + # Because this silently does a join, the rows can get unsorted + arrange(chr) %>% collect(), - "window functions not currently supported in Arrow; pulling data into R", - fixed = TRUE + tbl ) - expect_warning( - tbl %>% - Table$create() %>% + compare_dplyr_binding( + .input %>% select(mean = int, chr) %>% # rename `int` to `mean` and use `mean(mean)` in `mutate()` to test that # `all_funs()` detects `mean()` despite the collision with a column name group_by(chr) %>% mutate(avg_int = mean(mean)) %>% + # Because this silently does a join, the rows can get unsorted + arrange(chr) %>% collect(), - "window functions not currently supported in Arrow; pulling data into R", - fixed = TRUE + tbl ) }) -test_that("Can mutate with .by argument as long as there are no aggregations", { +test_that("Can mutate with .by argument, even with some aggregations", { compare_dplyr_binding( .input %>% select(int, chr) %>% @@ -479,25 +477,25 @@ test_that("Can mutate with .by argument as long as there are no aggregations", { collect(), tbl ) - expect_warning( - tbl %>% - Table$create() %>% + compare_dplyr_binding( + .input %>% select(int, chr) %>% mutate(avg_int = mean(int), .by = chr) %>% + # Because this silently does a join, the rows can get unsorted + arrange(chr) %>% collect(), - "window functions not currently supported in Arrow; pulling data into R", - fixed = TRUE + tbl ) - expect_warning( - 
tbl %>% - Table$create() %>% + compare_dplyr_binding( + .input %>% select(mean = int, chr) %>% # rename `int` to `mean` and use `mean(mean)` in `mutate()` to test that # `all_funs()` detects `mean()` despite the collision with a column name mutate(avg_int = mean(mean), .by = chr) %>% + # Because this silently does a join, the rows can get unsorted + arrange(chr) %>% collect(), - "window functions not currently supported in Arrow; pulling data into R", - fixed = TRUE + tbl ) }) @@ -682,7 +680,6 @@ test_that("mutate() and transmute() with namespaced functions", { }) test_that("Can use across() within mutate()", { - # expressions work in the right order compare_dplyr_binding( .input %>% @@ -717,17 +714,15 @@ test_that("Can use across() within mutate()", { example_data ) - # gives the right error with window functions - expect_warning( - arrow_table(example_data) %>% + compare_dplyr_binding( + .input %>% mutate( x = int + 2, across(c("int", "dbl"), list(mean = mean, sd = sd, round)), exp(dbl2) ) %>% collect(), - "window functions not currently supported in Arrow; pulling data into R", - fixed = TRUE + example_data ) }) diff --git a/r/vignettes/data_wrangling.Rmd b/r/vignettes/data_wrangling.Rmd index 305a91c156eb1..1d074ef0cfedb 100644 --- a/r/vignettes/data_wrangling.Rmd +++ b/r/vignettes/data_wrangling.Rmd @@ -165,33 +165,7 @@ sw2 %>% transmute(name, height, mass, res = residuals(lm(mass ~ height))) ``` -Because window functions are not supported, computing an aggregation like `mean()` on a grouped table or within a rowwise operation like `filter()` is not supported: - -```{r} -sw %>% - select(1:4) %>% - filter(!is.na(hair_color)) %>% - group_by(hair_color) %>% - filter(height < mean(height, na.rm = TRUE)) -``` - -This operation is sometimes referred to as a windowed aggregate and can be accomplished in Arrow by computing the aggregation separately, for example within a join operation: - -```{r} -sw %>% - select(1:4) %>% - filter(!is.na(hair_color)) %>% - left_join( - sw %>% - group_by(hair_color) %>% - summarize(mean_height = mean(height, na.rm = TRUE)) - ) %>% - filter(height < mean_height) %>% - select(!mean_height) %>% - collect() -``` - -Alternatively, [DuckDB](https:\www.duckdb.org) supports Arrow natively, so you can pass the `Table` object to DuckDB without paying a performance penalty using the helper function `to_duckdb()` and pass the object back to Arrow with `to_arrow()`: +For some operations, you can use [DuckDB](https://www.duckdb.org). It supports Arrow natively, so you can pass the `Dataset` or query object to DuckDB without paying a performance penalty using the helper function `to_duckdb()` and pass the object back to Arrow with `to_arrow()`: ```{r} sw %>% From d60ff53394788aef9a6070dfdf46a2bcade128ad Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 30 Apr 2024 08:46:26 +0900 Subject: [PATCH 005/105] GH-41427: [Go] Fix stateless prepared statements (#41428) ### Rationale for this change Stateless prepared statements didn't actually work ### What changes are included in this PR? Update the handle after binding parameters ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * GitHub Issue: #41427 Authored-by: David Li Signed-off-by: David Li --- go/arrow/flight/flightsql/client.go | 93 ++++++++++-------------- go/arrow/flight/flightsql/client_test.go | 10 +-- 2 files changed, 45 insertions(+), 58 deletions(-) diff --git a/go/arrow/flight/flightsql/client.go b/go/arrow/flight/flightsql/client.go index e594191c35fdf..c6794820dc172 100644 --- a/go/arrow/flight/flightsql/client.go +++ b/go/arrow/flight/flightsql/client.go @@ -1119,24 +1119,10 @@ func (p *PreparedStatement) Execute(ctx context.Context, opts ...grpc.CallOption return nil, err } - if p.hasBindParameters() { - pstream, err := p.client.Client.DoPut(ctx, opts...) - if err != nil { - return nil, err - } - wr, err := p.writeBindParameters(pstream, desc) - if err != nil { - return nil, err - } - if err = wr.Close(); err != nil { - return nil, err - } - pstream.CloseSend() - if err = p.captureDoPutPreparedStatementHandle(pstream); err != nil { - return nil, err - } + desc, err = p.bindParameters(ctx, desc, opts...) + if err != nil { + return nil, err } - return p.client.getFlightInfo(ctx, desc, opts...) } @@ -1156,23 +1142,9 @@ func (p *PreparedStatement) ExecutePut(ctx context.Context, opts ...grpc.CallOpt return err } - if p.hasBindParameters() { - pstream, err := p.client.Client.DoPut(ctx, opts...) - if err != nil { - return err - } - - wr, err := p.writeBindParameters(pstream, desc) - if err != nil { - return err - } - if err = wr.Close(); err != nil { - return err - } - pstream.CloseSend() - if err = p.captureDoPutPreparedStatementHandle(pstream); err != nil { - return err - } + _, err = p.bindParameters(ctx, desc, opts...) + if err != nil { + return err } return nil @@ -1200,23 +1172,9 @@ func (p *PreparedStatement) ExecutePoll(ctx context.Context, retryDescriptor *fl } if retryDescriptor == nil { - if p.hasBindParameters() { - pstream, err := p.client.Client.DoPut(ctx, opts...) - if err != nil { - return nil, err - } - - wr, err := p.writeBindParameters(pstream, desc) - if err != nil { - return nil, err - } - if err = wr.Close(); err != nil { - return nil, err - } - pstream.CloseSend() - if err = p.captureDoPutPreparedStatementHandle(pstream); err != nil { - return nil, err - } + desc, err = p.bindParameters(ctx, desc, opts...) + if err != nil { + return nil, err } } return p.client.Client.PollFlightInfo(ctx, desc, opts...) @@ -1248,7 +1206,7 @@ func (p *PreparedStatement) ExecuteUpdate(ctx context.Context, opts ...grpc.Call return } if p.hasBindParameters() { - wr, err = p.writeBindParameters(pstream, desc) + wr, err = p.writeBindParametersToStream(pstream, desc) if err != nil { return } @@ -1283,7 +1241,36 @@ func (p *PreparedStatement) hasBindParameters() bool { return (p.paramBinding != nil && p.paramBinding.NumRows() > 0) || (p.streamBinding != nil) } -func (p *PreparedStatement) writeBindParameters(pstream pb.FlightService_DoPutClient, desc *pb.FlightDescriptor) (*flight.Writer, error) { +func (p *PreparedStatement) bindParameters(ctx context.Context, desc *pb.FlightDescriptor, opts ...grpc.CallOption) (*flight.FlightDescriptor, error) { + if p.hasBindParameters() { + pstream, err := p.client.Client.DoPut(ctx, opts...) 
+ if err != nil { + return nil, err + } + wr, err := p.writeBindParametersToStream(pstream, desc) + if err != nil { + return nil, err + } + if err = wr.Close(); err != nil { + return nil, err + } + pstream.CloseSend() + if err = p.captureDoPutPreparedStatementHandle(pstream); err != nil { + return nil, err + } + + cmd := pb.CommandPreparedStatementQuery{PreparedStatementHandle: p.handle} + desc, err = descForCommand(&cmd) + if err != nil { + return nil, err + } + return desc, nil + } + return desc, nil +} + +// XXX: this does not capture the updated handle. Prefer bindParameters. +func (p *PreparedStatement) writeBindParametersToStream(pstream pb.FlightService_DoPutClient, desc *pb.FlightDescriptor) (*flight.Writer, error) { if p.paramBinding != nil { wr := flight.NewRecordWriter(pstream, ipc.WithSchema(p.paramBinding.Schema())) wr.SetFlightDescriptor(desc) diff --git a/go/arrow/flight/flightsql/client_test.go b/go/arrow/flight/flightsql/client_test.go index 727fe02aa7063..33da79167c4ae 100644 --- a/go/arrow/flight/flightsql/client_test.go +++ b/go/arrow/flight/flightsql/client_test.go @@ -448,9 +448,9 @@ func (s *FlightSqlClientSuite) TestPreparedStatementExecuteParamBinding() { expectedDesc := getDesc(&pb.CommandPreparedStatementQuery{PreparedStatementHandle: []byte(handle)}) // mocked DoPut result - doPutPreparedStatementResult := &pb.DoPutPreparedStatementResult{PreparedStatementHandle: []byte(updatedHandle)} + doPutPreparedStatementResult := &pb.DoPutPreparedStatementResult{PreparedStatementHandle: []byte(updatedHandle)} resdata, _ := proto.Marshal(doPutPreparedStatementResult) - putResult := &pb.PutResult{ AppMetadata: resdata } + putResult := &pb.PutResult{AppMetadata: resdata} // mocked client stream for DoPut mockedPut := &mockDoPutClient{} @@ -461,7 +461,7 @@ func (s *FlightSqlClientSuite) TestPreparedStatementExecuteParamBinding() { mockedPut.On("CloseSend").Return(nil) mockedPut.On("Recv").Return(putResult, nil) - infoCmd := &pb.CommandPreparedStatementQuery{PreparedStatementHandle: []byte(handle)} + infoCmd := &pb.CommandPreparedStatementQuery{PreparedStatementHandle: []byte(updatedHandle)} desc := getDesc(infoCmd) s.mockClient.On("GetFlightInfo", desc.Type, desc.Cmd, s.callOpts).Return(&emptyFlightInfo, nil) @@ -525,9 +525,9 @@ func (s *FlightSqlClientSuite) TestPreparedStatementExecuteReaderBinding() { expectedDesc := getDesc(&pb.CommandPreparedStatementQuery{PreparedStatementHandle: []byte(query)}) // mocked DoPut result - doPutPreparedStatementResult := &pb.DoPutPreparedStatementResult{PreparedStatementHandle: []byte(query)} + doPutPreparedStatementResult := &pb.DoPutPreparedStatementResult{PreparedStatementHandle: []byte(query)} resdata, _ := proto.Marshal(doPutPreparedStatementResult) - putResult := &pb.PutResult{ AppMetadata: resdata } + putResult := &pb.PutResult{AppMetadata: resdata} // mocked client stream for DoPut mockedPut := &mockDoPutClient{} From 6dc662324c2c46a7b5066b91cd0ace93a275ecf7 Mon Sep 17 00:00:00 2001 From: DenisTarasyuk <131180287+DenisTarasyuk@users.noreply.github.com> Date: Tue, 30 Apr 2024 03:59:51 +0300 Subject: [PATCH 006/105] GH-41433: [C++][Gandiva] Fix ascii_utf8 function to return same result on x86 and Arm (#41434) ### Rationale for this change Fixing ascii_utf8 function that has different return result on x86 and Arm due to default char type sign difference on those platforms. Added tests to cover existing x86 behavior for ascii symbols with code >127. ### What changes are included in this PR? 1. 
Added type cast to signed char to save existing x86 behavior on Arm platform. 2. Added tests cases for negative results. ### Are these changes tested? UT included. ### Are there any user-facing changes? None * GitHub Issue: #41433 Authored-by: DenisTarasyuk Signed-off-by: Sutou Kouhei --- cpp/src/gandiva/precompiled/string_ops.cc | 2 +- cpp/src/gandiva/precompiled/string_ops_test.cc | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 5aa0eb38eafd7..3849cf7bdf9a5 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1377,7 +1377,7 @@ gdv_int32 ascii_utf8(const char* data, gdv_int32 data_len) { if (data_len == 0) { return 0; } - return static_cast(data[0]); + return static_cast(static_cast(data[0])); } // Returns the ASCII character having the binary equivalent to A. diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index 89213592e7ea2..aaa25db0a9f8d 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -51,6 +51,8 @@ TEST(TestStringOps, TestAscii) { EXPECT_EQ(ascii_utf8("", 0), 0); EXPECT_EQ(ascii_utf8("123", 3), 49); EXPECT_EQ(ascii_utf8("999", 3), 57); + EXPECT_EQ(ascii_utf8("\x80", 1), -128); + EXPECT_EQ(ascii_utf8("\xFF", 1), -1); } TEST(TestStringOps, TestChrBigInt) { From 747c8a28306f1e14439cf374b04cb8ed68e08cd2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Apr 2024 19:29:36 -0700 Subject: [PATCH 007/105] MINOR: [C#] Bump xunit.runner.visualstudio from 2.5.8 to 2.8.0 in /csharp (#41441) Bumps [xunit.runner.visualstudio](https://github.com/xunit/visualstudio.xunit) from 2.5.8 to 2.8.0.
Commits

- 6438bb8 v2.8.0
- 2afd4cd Pick up latest dependencies
- b8be108 Add multiplier format support to RunSettings
- 3c2e493 Update to 2.7.2-pre.17 and support Xunit.ParallelAlgorithm in RunSetttings
- 144931e Missing height on version
- 4315921 Fix concurrency bug in AssemblyHelper (#407)
- 8617393 Bump up to 2.5.9-pre
- See full diff in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=xunit.runner.visualstudio&package-manager=nuget&previous-version=2.5.8&new-version=2.8.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../Apache.Arrow.Compression.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Sql.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj | 2 +- csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index b386ccf79c12c..df53da2098509 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -9,7 +9,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index ae6f9f1e69667..65b4ac027e29f 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -8,7 +8,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index ed158ca8656d3..cde2004e8e48d 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -8,7 +8,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index 06fb44e0a0e88..491a0c087b1cd 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -18,7 +18,7 @@ - + all runtime; build; native; contentfiles; analyzers From 131dbd60b52d595583aae3c883fbddce26199d68 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Apr 2024 19:37:41 -0700 Subject: [PATCH 008/105] MINOR: [C#] Bump ZstdSharp.Port and System.Runtime.CompilerServices.Unsafe in /csharp (#41440) Bumps [ZstdSharp.Port](https://github.com/oleg-st/ZstdSharp) and [System.Runtime.CompilerServices.Unsafe](https://github.com/dotnet/runtime). These dependencies needed to be updated together. Updates `ZstdSharp.Port` from 0.7.6 to 0.8.0
Release notes (sourced from ZstdSharp.Port's releases):

**0.8.0**

- Ported zstd v1.5.6
- Workaround for .NET Native

Updates `System.Runtime.CompilerServices.Unsafe` from 4.7.1 to 6.0.0

Release notes (sourced from System.Runtime.CompilerServices.Unsafe's releases): the notes list the .NET 6.0 releases (GA, RC 1-2, Previews 1-7) and the .NET 5.0.11-5.0.17 servicing releases (.NET 5 is now out of support; .NET 6 is recommended). ... (truncated)

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../Apache.Arrow.Compression/Apache.Arrow.Compression.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj index 0ce8c89bb1d1b..c34d880f90060 100644 --- a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj +++ b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj @@ -13,7 +13,7 @@ - + From de37ee88690fc2ca8e48341d59e7dba327d8fe2c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Apr 2024 20:12:54 -0700 Subject: [PATCH 009/105] MINOR: [C#] Bump xunit from 2.7.1 to 2.8.0 in /csharp (#41439) Bumps [xunit](https://github.com/xunit/xunit) from 2.7.1 to 2.8.0.
Commits

- be260b3 v2.8.0
- a8ceb66 #783: Add -useansicolor flag to console runner (v2)
- 7b0ff93 Don't show /aggressive with unlimited threads
- 46cdf06 Support parallel algorithm in MSBuild runner
- b4aa876 Support multipler syntax in MSBuild runner
- 6790b48 Add aggressive display to TestFrameworkEnvironment reported by XunitTestAssem...
- 3dd7e91 Update mocks to make CollectionBehaviorAttribute property values optional
- 4c82dea Asking for default threads should set 0, not null
- d73cdef Should not try to use a semaphore when we've been asked for unlimited threads
- 3722e54 Enable multiplier style max threads support
- Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=xunit&package-manager=nuget&previous-version=2.7.1&new-version=2.8.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../Apache.Arrow.Compression.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Sql.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj | 2 +- csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index df53da2098509..2b1720561004e 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -8,7 +8,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index 65b4ac027e29f..c8fb40f2d6702 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -7,7 +7,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index cde2004e8e48d..ba60451f25f68 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -7,7 +7,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index 491a0c087b1cd..90b498d4e9b03 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -17,7 +17,7 @@ - + all runtime; build; native; contentfiles; analyzers From e4f31462dbd668c3bcb6ce96442f3c1632c4d8c8 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Tue, 30 Apr 2024 06:38:40 +0200 Subject: [PATCH 010/105] GH-41317: [C++] Fix crash on invalid Parquet file (#41366) ### Rationale for this change Fixes the crash detailed in #41317 in TableBatchReader::ReadNext() on a corrupted Parquet file ### What changes are included in this PR? Add a validation that all read columns have the same size ### Are these changes tested? I've tested on the reproducer I provided in #41317 that it now triggers a clean error: ``` Traceback (most recent call last): File "test.py", line 3, in [_ for _ in parquet_file.iter_batches()] File "test.py", line 3, in [_ for _ in parquet_file.iter_batches()] File "pyarrow/_parquet.pyx", line 1587, in iter_batches File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status pyarrow.lib.ArrowInvalid: columns do not have the same size ``` I'm not sure if/how unit tests for corrupted datasets should be added ### Are there any user-facing changes? 
No **This PR contains a "Critical Fix".** * GitHub Issue: #41317 Authored-by: Even Rouault Signed-off-by: mwish --- cpp/src/arrow/table.cc | 2 ++ cpp/src/arrow/table.h | 2 ++ cpp/src/parquet/arrow/reader.cc | 10 ++++++++++ 3 files changed, 14 insertions(+) diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 967e78f6b4db1..5dc5e4c1a9a8c 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -619,6 +619,7 @@ TableBatchReader::TableBatchReader(const Table& table) for (int i = 0; i < table.num_columns(); ++i) { column_data_[i] = table.column(i).get(); } + DCHECK(table_.Validate().ok()); } TableBatchReader::TableBatchReader(std::shared_ptr table) @@ -632,6 +633,7 @@ TableBatchReader::TableBatchReader(std::shared_ptr
table) for (int i = 0; i < owned_table_->num_columns(); ++i) { column_data_[i] = owned_table_->column(i).get(); } + DCHECK(table_.Validate().ok()); } std::shared_ptr TableBatchReader::schema() const { return table_.schema(); } diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index a7508430c132b..79675fa92b1f3 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -241,6 +241,8 @@ class ARROW_EXPORT Table { /// /// The conversion is zero-copy: each record batch is a view over a slice /// of the table's columns. +/// +/// The table is expected to be valid prior to using it with the batch reader. class ARROW_EXPORT TableBatchReader : public RecordBatchReader { public: /// \brief Construct a TableBatchReader for the given table diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index d6ad7c25bc7c1..285e2a597389d 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -1043,6 +1043,16 @@ Status FileReaderImpl::GetRecordBatchReader(const std::vector& row_groups, } } + // Check all columns has same row-size + if (!columns.empty()) { + int64_t row_size = columns[0]->length(); + for (size_t i = 1; i < columns.size(); ++i) { + if (columns[i]->length() != row_size) { + return ::arrow::Status::Invalid("columns do not have the same size"); + } + } + } + auto table = ::arrow::Table::Make(batch_schema, std::move(columns)); auto table_reader = std::make_shared<::arrow::TableBatchReader>(*table); From 97e169a115bcf4e18fffd6c788f6fde648969664 Mon Sep 17 00:00:00 2001 From: Calvin Kirs Date: Tue, 30 Apr 2024 17:00:02 +0800 Subject: [PATCH 011/105] MINOR: [Java] Upgrade bcpkix-jdkon15 dependency to bcpkix-jdkon18 (#41458) ### Rationale for this change Since bcpkix-jdk15on is no longer being maintained and bcpkix-jdkon18 is fully compatible with it, we can replace bcpkix-jdk15on with bcpkix-jdkon18. This will ensure continued support and security for our applications. FYI: https://www.bouncycastle.org/latest_releases.html ### What changes are included in this PR? - Upgrade bcpkix-jdkon15 dependency to bcpkix-jdkon18 ### Are these changes tested? - yes We used the JcaPEMWriter class to convert certificates stored within a KeyStore object into PEM format and subsequently write them to a designated JcaPEMWriter object. Existing test suites provide comprehensive coverage for this functionality. 
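For illustration, here is a minimal sketch of the kind of usage being exercised: exporting the certificates held in a `java.security.KeyStore` to PEM text with BouncyCastle's `JcaPEMWriter`. The helper class and method names below are hypothetical (this is not the driver's actual code), but `KeyStore` and `JcaPEMWriter` are the real APIs involved.

```java
import java.io.StringWriter;
import java.security.KeyStore;
import java.security.cert.Certificate;
import java.util.Enumeration;

import org.bouncycastle.openssl.jcajce.JcaPEMWriter;

// Hypothetical example class, not part of the Arrow codebase.
final class KeyStorePemExample {
  // Serialize every certificate in the KeyStore into a single PEM-formatted string.
  static String certificatesToPem(KeyStore keyStore) throws Exception {
    StringWriter out = new StringWriter();
    try (JcaPEMWriter pemWriter = new JcaPEMWriter(out)) {
      Enumeration<String> aliases = keyStore.aliases();
      while (aliases.hasMoreElements()) {
        // KeyStore entries are typically X.509 certificates, which JcaPEMWriter
        // encodes with the usual BEGIN/END CERTIFICATE markers.
        Certificate certificate = keyStore.getCertificate(aliases.nextElement());
        if (certificate != null) {
          pemWriter.writeObject(certificate);
        }
      }
    }
    return out.toString();
  }
}
```

This usage relies only on the `org.bouncycastle.openssl` classes shipped in bcpkix, which is consistent with the note above that `bcpkix-jdk18on` is fully compatible with `bcpkix-jdk15on` for this purpose.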
Authored-by: Calvin Kirs Signed-off-by: David Li --- java/flight/flight-sql-jdbc-core/pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/flight/flight-sql-jdbc-core/pom.xml b/java/flight/flight-sql-jdbc-core/pom.xml index 2e0de90fcf8bc..ef3f2469b73dd 100644 --- a/java/flight/flight-sql-jdbc-core/pom.xml +++ b/java/flight/flight-sql-jdbc-core/pom.xml @@ -126,8 +126,8 @@ org.bouncycastle - bcpkix-jdk15on - 1.70 + bcpkix-jdk18on + 1.78.1 From b609de374c7c00e1537eb8092e1ff2db718d2b61 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 30 Apr 2024 13:42:31 +0200 Subject: [PATCH 012/105] GH-40342: [Python] Fix pickling of LocalFileSystem for cython 2 (#41459) Small follow-up fix for the failure introduced by https://github.com/apache/arrow/pull/40356 * GitHub Issue: #40342 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/_fs.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 0e635b2c8a28a..dbfb6ed114553 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -18,6 +18,7 @@ # cython: language_level = 3 from cpython.datetime cimport datetime, PyDateTime_DateTime +from cython cimport binding from pyarrow.includes.common cimport * from pyarrow.includes.libarrow_python cimport PyDateTime_to_TimePoint @@ -421,6 +422,7 @@ cdef class FileSystem(_Weakrefable): "SubTreeFileSystem") @staticmethod + @binding(True) # Required for cython < 3 def _from_uri(uri): fs, _path = FileSystem.from_uri(uri) return fs From e22197f39e41446789dcc52e931995fe20a784a4 Mon Sep 17 00:00:00 2001 From: Sarah Gilmore <74676073+sgilmore10@users.noreply.github.com> Date: Tue, 30 Apr 2024 09:41:44 -0400 Subject: [PATCH 013/105] GH-41400: [MATLAB] Bump `libmexclass` version to commit `ca3cea6` (#41436) ### Rationale for this change @ kevingurney and I recently resolved multiple issues related to `mathworks/libmexclass` not supporting ARM-based macOS builds (i.e. builds on `macos-14`): - mathworks/libmexclass#76 - mathworks/libmexclass#77 We should bump the version of mathworks/libmexclass used by the MATLAB interface to the latest available commit ([ca3cea6](https://github.com/mathworks/libmexclass/commit/ca3cea6bf1ba5e9d86210bd207d643493e8d45f6) as of now) in order to enable building the MATLAB interface to Arrow on `macos-14` (which is ARM-based). ### What changes are included in this PR? - Bumped version of `mathworks/libmexclass` used by the MATLAB interface to [ca3cea6](https://github.com/mathworks/libmexclass/commit/ca3cea6bf1ba5e9d86210bd207d643493e8d45f6) ### Are these changes tested? - Yes. The existing test points verify verify upgrading `mathworks/libmexclass` does not break the MATLAB interface. ### Are there any user-facing changes? - No. ### Future Directions - #41435 - #41385 * GitHub Issue: #41400 Authored-by: Sarah Gilmore Signed-off-by: Sarah Gilmore --- matlab/tools/cmake/BuildMatlabArrowInterface.cmake | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake index cb746e08b1f8e..e1641842ca8b9 100644 --- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake +++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake @@ -24,8 +24,7 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_FETCH_CONTENT_NAME libmexclass) # libmexclass is accessible for CI without permission issues. 
set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_FETCH_CONTENT_GIT_REPOSITORY "https://github.com/mathworks/libmexclass.git") # Use a specific Git commit hash to avoid libmexclass version changing unexpectedly. -set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_FETCH_CONTENT_GIT_TAG "d04f88d") - +set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_FETCH_CONTENT_GIT_TAG "ca3cea6") set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_FETCH_CONTENT_SOURCE_SUBDIR "libmexclass/cpp") # ------------------------------------------ From 0ef7351986ee8b967e210d0f9c7a9c8e4d4038fd Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Wed, 1 May 2024 02:01:39 +0800 Subject: [PATCH 014/105] GH-41407: [C++] Use static method to fill scalar scratch space to prevent ub (#41421) ### Rationale for this change In #40237, I introduced scalar scratch space filling in concrete scalar sub-class constructor, in which there is a static down-casting of `this` to sub-class pointer. Though this is common in CRTP, it happens in base cast constructor. And this is reported in #41407 to be UB by UBSAN's "vptr" sanitizing. I'm not a language lawyer to tell if this is a true/false-positive. So I proposed two approaches: 1. The easy way: add suppression in [1], like we already did for `shared_ptr`. But apparently this won't be feasible if this is a true-positive (need some language lawyer's help to confirm). 2. The hard way: totally avoid this so-to-speak UB but may introduce more boilerplate code. This PR is the hard way. [1] https://github.com/apache/arrow/blob/main/r/tools/ubsan.supp ### What changes are included in this PR? Make `FillScratchSpace` static. ### Are these changes tested? The existing UT should cover it well. ### Are there any user-facing changes? None. * GitHub Issue: #41407 Lead-authored-by: Ruoxi Sun Co-authored-by: Rossi Sun Co-authored-by: Benjamin Kietzman Signed-off-by: Benjamin Kietzman --- cpp/src/arrow/scalar.cc | 73 +++++++++++++++----------- cpp/src/arrow/scalar.h | 112 +++++++++++++++++++++++++++++++--------- 2 files changed, 130 insertions(+), 55 deletions(-) diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 8e8d3903663e4..7d8084e17c279 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -563,15 +563,17 @@ Status Scalar::ValidateFull() const { BaseBinaryScalar::BaseBinaryScalar(std::string s, std::shared_ptr type) : BaseBinaryScalar(Buffer::FromString(std::move(s)), std::move(type)) {} -void BinaryScalar::FillScratchSpace() { +void BinaryScalar::FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value) { FillScalarScratchSpace( - scratch_space_, + scratch_space, {int32_t(0), value ? static_cast(value->size()) : int32_t(0)}); } -void BinaryViewScalar::FillScratchSpace() { +void BinaryViewScalar::FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value) { static_assert(sizeof(BinaryViewType::c_type) <= internal::kScalarScratchSpaceSize); - auto* view = new (&scratch_space_) BinaryViewType::c_type; + auto* view = new (scratch_space) BinaryViewType::c_type; if (value) { *view = util::ToBinaryView(std::string_view{*value}, 0, 0); } else { @@ -579,9 +581,10 @@ void BinaryViewScalar::FillScratchSpace() { } } -void LargeBinaryScalar::FillScratchSpace() { +void LargeBinaryScalar::FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value) { FillScalarScratchSpace( - scratch_space_, + scratch_space, {int64_t(0), value ? 
static_cast(value->size()) : int64_t(0)}); } @@ -612,36 +615,40 @@ BaseListScalar::BaseListScalar(std::shared_ptr value, } ListScalar::ListScalar(std::shared_ptr value, bool is_valid) - : BaseListScalar(value, list(value->type()), is_valid) {} + : ListScalar(value, list(value->type()), is_valid) {} -void ListScalar::FillScratchSpace() { +void ListScalar::FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value) { FillScalarScratchSpace( - scratch_space_, + scratch_space, {int32_t(0), value ? static_cast(value->length()) : int32_t(0)}); } LargeListScalar::LargeListScalar(std::shared_ptr value, bool is_valid) - : BaseListScalar(value, large_list(value->type()), is_valid) {} + : LargeListScalar(value, large_list(value->type()), is_valid) {} -void LargeListScalar::FillScratchSpace() { - FillScalarScratchSpace(scratch_space_, +void LargeListScalar::FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value) { + FillScalarScratchSpace(scratch_space, {int64_t(0), value ? value->length() : int64_t(0)}); } ListViewScalar::ListViewScalar(std::shared_ptr value, bool is_valid) - : BaseListScalar(value, list_view(value->type()), is_valid) {} + : ListViewScalar(value, list_view(value->type()), is_valid) {} -void ListViewScalar::FillScratchSpace() { +void ListViewScalar::FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value) { FillScalarScratchSpace( - scratch_space_, + scratch_space, {int32_t(0), value ? static_cast(value->length()) : int32_t(0)}); } LargeListViewScalar::LargeListViewScalar(std::shared_ptr value, bool is_valid) - : BaseListScalar(value, large_list_view(value->type()), is_valid) {} + : LargeListViewScalar(value, large_list_view(value->type()), is_valid) {} -void LargeListViewScalar::FillScratchSpace() { - FillScalarScratchSpace(scratch_space_, +void LargeListViewScalar::FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value) { + FillScalarScratchSpace(scratch_space, {int64_t(0), value ? value->length() : int64_t(0)}); } @@ -652,11 +659,12 @@ inline std::shared_ptr MakeMapType(const std::shared_ptr& pa } MapScalar::MapScalar(std::shared_ptr value, bool is_valid) - : BaseListScalar(value, MakeMapType(value->type()), is_valid) {} + : MapScalar(value, MakeMapType(value->type()), is_valid) {} -void MapScalar::FillScratchSpace() { +void MapScalar::FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value) { FillScalarScratchSpace( - scratch_space_, + scratch_space, {int32_t(0), value ? 
static_cast(value->length()) : int32_t(0)}); } @@ -705,7 +713,9 @@ Result> StructScalar::field(FieldRef ref) const { RunEndEncodedScalar::RunEndEncodedScalar(std::shared_ptr value, std::shared_ptr type) - : Scalar{std::move(type), value->is_valid}, value{std::move(value)} { + : Scalar{std::move(type), value->is_valid}, + ArraySpanFillFromScalarScratchSpace(*this->type), + value{std::move(value)} { ARROW_CHECK_EQ(this->type->id(), Type::RUN_END_ENCODED); } @@ -716,18 +726,18 @@ RunEndEncodedScalar::RunEndEncodedScalar(const std::shared_ptr& type) RunEndEncodedScalar::~RunEndEncodedScalar() = default; -void RunEndEncodedScalar::FillScratchSpace() { - auto run_end = run_end_type()->id(); +void RunEndEncodedScalar::FillScratchSpace(uint8_t* scratch_space, const DataType& type) { + Type::type run_end = checked_cast(type).run_end_type()->id(); switch (run_end) { case Type::INT16: - FillScalarScratchSpace(scratch_space_, {int16_t(1)}); + FillScalarScratchSpace(scratch_space, {int16_t(1)}); break; case Type::INT32: - FillScalarScratchSpace(scratch_space_, {int32_t(1)}); + FillScalarScratchSpace(scratch_space, {int32_t(1)}); break; default: DCHECK_EQ(run_end, Type::INT64); - FillScalarScratchSpace(scratch_space_, {int64_t(1)}); + FillScalarScratchSpace(scratch_space, {int64_t(1)}); } } @@ -806,6 +816,7 @@ Result TimestampScalar::FromISO8601(std::string_view iso8601, SparseUnionScalar::SparseUnionScalar(ValueType value, int8_t type_code, std::shared_ptr type) : UnionScalar(std::move(type), type_code, /*is_valid=*/true), + ArraySpanFillFromScalarScratchSpace(type_code), value(std::move(value)) { const auto child_ids = checked_cast(*this->type).child_ids(); if (type_code >= 0 && static_cast(type_code) < child_ids.size() && @@ -833,13 +844,13 @@ std::shared_ptr SparseUnionScalar::FromValue(std::shared_ptr val return std::make_shared(field_values, type_code, std::move(type)); } -void SparseUnionScalar::FillScratchSpace() { - auto* union_scratch_space = reinterpret_cast(&scratch_space_); +void SparseUnionScalar::FillScratchSpace(uint8_t* scratch_space, int8_t type_code) { + auto* union_scratch_space = reinterpret_cast(scratch_space); union_scratch_space->type_code = type_code; } -void DenseUnionScalar::FillScratchSpace() { - auto* union_scratch_space = reinterpret_cast(&scratch_space_); +void DenseUnionScalar::FillScratchSpace(uint8_t* scratch_space, int8_t type_code) { + auto* union_scratch_space = reinterpret_cast(scratch_space); union_scratch_space->type_code = type_code; FillScalarScratchSpace(union_scratch_space->offsets, {int32_t(0), int32_t(1)}); } diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index a7ee6a417d9a1..982a4c5113c92 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -141,7 +141,12 @@ struct ARROW_EXPORT ArraySpanFillFromScalarScratchSpace { alignas(int64_t) mutable uint8_t scratch_space_[kScalarScratchSpaceSize]; private: - ArraySpanFillFromScalarScratchSpace() { static_cast(this)->FillScratchSpace(); } + template + explicit ArraySpanFillFromScalarScratchSpace(Args&&... 
args) { + Impl::FillScratchSpace(scratch_space_, std::forward(args)...); + } + + ArraySpanFillFromScalarScratchSpace() = delete; friend Impl; }; @@ -278,20 +283,32 @@ struct ARROW_EXPORT BaseBinaryScalar : public internal::PrimitiveScalarBase { struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar, private internal::ArraySpanFillFromScalarScratchSpace { - using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = BinaryType; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + explicit BinaryScalar(std::shared_ptr type) + : BaseBinaryScalar(std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + + BinaryScalar(std::shared_ptr value, std::shared_ptr type) + : BaseBinaryScalar(std::move(value), std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + + BinaryScalar(std::string s, std::shared_ptr type) + : BaseBinaryScalar(std::move(s), std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + explicit BinaryScalar(std::shared_ptr value) : BinaryScalar(std::move(value), binary()) {} - explicit BinaryScalar(std::string s) : BaseBinaryScalar(std::move(s), binary()) {} + explicit BinaryScalar(std::string s) : BinaryScalar(std::move(s), binary()) {} BinaryScalar() : BinaryScalar(binary()) {} private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -312,23 +329,35 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar { struct ARROW_EXPORT BinaryViewScalar : public BaseBinaryScalar, private internal::ArraySpanFillFromScalarScratchSpace { - using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = BinaryViewType; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + explicit BinaryViewScalar(std::shared_ptr type) + : BaseBinaryScalar(std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + + BinaryViewScalar(std::shared_ptr value, std::shared_ptr type) + : BaseBinaryScalar(std::move(value), std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + + BinaryViewScalar(std::string s, std::shared_ptr type) + : BaseBinaryScalar(std::move(s), std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + explicit BinaryViewScalar(std::shared_ptr value) : BinaryViewScalar(std::move(value), binary_view()) {} explicit BinaryViewScalar(std::string s) - : BaseBinaryScalar(std::move(s), binary_view()) {} + : BinaryViewScalar(std::move(s), binary_view()) {} BinaryViewScalar() : BinaryViewScalar(binary_view()) {} std::string_view view() const override { return std::string_view(*this->value); } private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -350,24 +379,33 @@ struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar { struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar, private internal::ArraySpanFillFromScalarScratchSpace { - using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = LargeBinaryType; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + explicit LargeBinaryScalar(std::shared_ptr type) + : BaseBinaryScalar(std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + LargeBinaryScalar(std::shared_ptr value, std::shared_ptr type) - : BaseBinaryScalar(std::move(value), 
std::move(type)) {} + : BaseBinaryScalar(std::move(value), std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + + LargeBinaryScalar(std::string s, std::shared_ptr type) + : BaseBinaryScalar(std::move(s), std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} explicit LargeBinaryScalar(std::shared_ptr value) : LargeBinaryScalar(std::move(value), large_binary()) {} explicit LargeBinaryScalar(std::string s) - : BaseBinaryScalar(std::move(s), large_binary()) {} + : LargeBinaryScalar(std::move(s), large_binary()) {} LargeBinaryScalar() : LargeBinaryScalar(large_binary()) {} private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -550,14 +588,19 @@ struct ARROW_EXPORT ListScalar : public BaseListScalar, private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = ListType; - using BaseListScalar::BaseListScalar; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + ListScalar(std::shared_ptr value, std::shared_ptr type, + bool is_valid = true) + : BaseListScalar(std::move(value), std::move(type), is_valid), + ArraySpanFillFromScalarScratchSpace(this->value) {} + explicit ListScalar(std::shared_ptr value, bool is_valid = true); private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -567,14 +610,19 @@ struct ARROW_EXPORT LargeListScalar : public BaseListScalar, private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = LargeListType; - using BaseListScalar::BaseListScalar; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + LargeListScalar(std::shared_ptr value, std::shared_ptr type, + bool is_valid = true) + : BaseListScalar(std::move(value), std::move(type), is_valid), + ArraySpanFillFromScalarScratchSpace(this->value) {} + explicit LargeListScalar(std::shared_ptr value, bool is_valid = true); private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -584,14 +632,19 @@ struct ARROW_EXPORT ListViewScalar : public BaseListScalar, private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = ListViewType; - using BaseListScalar::BaseListScalar; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + ListViewScalar(std::shared_ptr value, std::shared_ptr type, + bool is_valid = true) + : BaseListScalar(std::move(value), std::move(type), is_valid), + ArraySpanFillFromScalarScratchSpace(this->value) {} + explicit ListViewScalar(std::shared_ptr value, bool is_valid = true); private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -601,14 +654,19 @@ struct ARROW_EXPORT LargeListViewScalar : public BaseListScalar, private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = LargeListViewType; - using BaseListScalar::BaseListScalar; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + LargeListViewScalar(std::shared_ptr value, std::shared_ptr type, + bool is_valid = true) + : BaseListScalar(std::move(value), std::move(type), is_valid), + 
ArraySpanFillFromScalarScratchSpace(this->value) {} + explicit LargeListViewScalar(std::shared_ptr value, bool is_valid = true); private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -618,14 +676,19 @@ struct ARROW_EXPORT MapScalar : public BaseListScalar, private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = MapType; - using BaseListScalar::BaseListScalar; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + MapScalar(std::shared_ptr value, std::shared_ptr type, + bool is_valid = true) + : BaseListScalar(std::move(value), std::move(type), is_valid), + ArraySpanFillFromScalarScratchSpace(this->value) {} + explicit MapScalar(std::shared_ptr value, bool is_valid = true); private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -707,7 +770,7 @@ struct ARROW_EXPORT SparseUnionScalar std::shared_ptr type); private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, int8_t type_code); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -733,10 +796,11 @@ struct ARROW_EXPORT DenseUnionScalar DenseUnionScalar(ValueType value, int8_t type_code, std::shared_ptr type) : UnionScalar(std::move(type), type_code, value->is_valid), + ArraySpanFillFromScalarScratchSpace(type_code), value(std::move(value)) {} private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, int8_t type_code); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -772,7 +836,7 @@ struct ARROW_EXPORT RunEndEncodedScalar private: const TypeClass& ree_type() const { return internal::checked_cast(*type); } - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, const DataType& type); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; From 5e986be59f08135d2fdaeb819c87120b0bf7436a Mon Sep 17 00:00:00 2001 From: ZhangHuiGui <106943008+ZhangHuiGui@users.noreply.github.com> Date: Wed, 1 May 2024 06:20:04 +0800 Subject: [PATCH 015/105] GH-41183: [C++][Python] Expose recursive flatten for lists on list_flatten kernel function and pyarrow bindings (#41295) ### Rationale for this change Expose recursive flatten for logical lists on the list_flatten kernel function and the pyarrow bindings. ### What changes are included in this PR? 1. Expose recursive flatten for logical lists on the `list_flatten` kernel function 2. Support [Large]ListView for some kernel functions: `list_flatten`, `list_value_length`, `list_element` 3. Support recursive flatten in the pyarrow bindings and simplify the [Large]ListView pyarrow bindings 4. Refactor vector_nested_test.cc to better support [Large]ListView types. ### Are these changes tested? Yes ### Are there any user-facing changes? Yes. 1. Some kernel functions (list_flatten, list_value_length, list_element) now support [Large]ListView types 2. `list_flatten` and the related pyarrow bindings can flatten recursively when `recursive` is set in ListFlattenOptions, as shown in the sketch below.
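For illustration only (this snippet is not part of the diff below): a minimal sketch of the new recursive flatten option from the Python side, assuming a pyarrow build that includes this change; the input data is made up.

```
import pyarrow as pa
import pyarrow.compute as pc

# Hypothetical input: any list-of-list array works here.
arr = pa.array(
    [None, [[1, None, 2], None, [3, 4]], [[5, 6]]],
    type=pa.list_(pa.list_(pa.int64())),
)

arr.flatten()                # existing behavior: flattens one level, yielding a list<int64> array
arr.flatten(recursive=True)  # new: flattens until a non-list array is formed, yielding an int64 array

# Equivalent call through the compute function with the new options class.
pc.list_flatten(arr, options=pc.ListFlattenOptions(recursive=True))
```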
* GitHub Issue: #41183 Lead-authored-by: ZhangHuiGui Co-authored-by: ZhangHuiGui <2689496754@qq.com> Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/compute/api_vector.cc | 7 + cpp/src/arrow/compute/api_vector.h | 12 + .../arrow/compute/kernels/codegen_internal.cc | 21 +- .../arrow/compute/kernels/codegen_internal.h | 3 +- .../arrow/compute/kernels/scalar_nested.cc | 49 +++- .../compute/kernels/scalar_nested_test.cc | 17 +- .../arrow/compute/kernels/vector_nested.cc | 54 +++-- .../compute/kernels/vector_nested_test.cc | 129 +++++++++-- python/pyarrow/_compute.pyx | 20 ++ python/pyarrow/array.pxi | 215 +++++++----------- python/pyarrow/compute.py | 1 + python/pyarrow/includes/libarrow.pxd | 5 + python/pyarrow/lib.pxd | 4 +- python/pyarrow/tests/test_array.py | 8 +- python/pyarrow/tests/test_compute.py | 1 + 15 files changed, 364 insertions(+), 182 deletions(-) diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index d47ee42ebf239..f0d5c0fcc3d72 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -153,6 +153,8 @@ static auto kRankOptionsType = GetFunctionOptionsType( DataMember("tiebreaker", &RankOptions::tiebreaker)); static auto kPairwiseOptionsType = GetFunctionOptionsType( DataMember("periods", &PairwiseOptions::periods)); +static auto kListFlattenOptionsType = GetFunctionOptionsType( + DataMember("recursive", &ListFlattenOptions::recursive)); } // namespace } // namespace internal @@ -224,6 +226,10 @@ PairwiseOptions::PairwiseOptions(int64_t periods) : FunctionOptions(internal::kPairwiseOptionsType), periods(periods) {} constexpr char PairwiseOptions::kTypeName[]; +ListFlattenOptions::ListFlattenOptions(bool recursive) + : FunctionOptions(internal::kListFlattenOptionsType), recursive(recursive) {} +constexpr char ListFlattenOptions::kTypeName[]; + namespace internal { void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kFilterOptionsType)); @@ -237,6 +243,7 @@ void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kCumulativeOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kRankOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kPairwiseOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kListFlattenOptionsType)); } } // namespace internal diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 919572f16ee69..e5bcc37329661 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -245,6 +245,18 @@ class ARROW_EXPORT PairwiseOptions : public FunctionOptions { int64_t periods = 1; }; +/// \brief Options for list_flatten function +class ARROW_EXPORT ListFlattenOptions : public FunctionOptions { + public: + explicit ListFlattenOptions(bool recursive = false); + static constexpr char const kTypeName[] = "ListFlattenOptions"; + static ListFlattenOptions Defaults() { return ListFlattenOptions(); } + + /// \brief If true, the list is flattened recursively until a non-list + /// array is formed. 
+ bool recursive = false; +}; + /// @} /// \brief Filter with a boolean selection filter diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc index 00a833742f957..0fd9cae7a8d71 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc @@ -23,6 +23,7 @@ #include #include +#include "arrow/compute/api_vector.h" #include "arrow/type_fwd.h" namespace arrow { @@ -56,9 +57,23 @@ Result LastType(KernelContext*, const std::vector& types return types.back(); } -Result ListValuesType(KernelContext*, const std::vector& args) { - const auto& list_type = checked_cast(*args[0].type); - return list_type.value_type().get(); +Result ListValuesType(KernelContext* ctx, + const std::vector& args) { + auto list_type = checked_cast(args[0].type); + auto value_type = list_type->value_type().get(); + + auto recursive = + ctx->state() ? OptionsWrapper::Get(ctx).recursive : false; + if (!recursive) { + return value_type; + } + + for (auto value_kind = value_type->id(); + is_list(value_kind) || is_list_view(value_kind); value_kind = value_type->id()) { + list_type = checked_cast(list_type->value_type().get()); + value_type = list_type->value_type().get(); + } + return value_type; } void EnsureDictionaryDecoded(std::vector* types) { diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 097ee1de45b6a..9e46a21887f8c 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -423,7 +423,8 @@ static void VisitTwoArrayValuesInline(const ArraySpan& arr0, const ArraySpan& ar Result FirstType(KernelContext*, const std::vector& types); Result LastType(KernelContext*, const std::vector& types); -Result ListValuesType(KernelContext*, const std::vector& types); +Result ListValuesType(KernelContext* ctx, + const std::vector& types); // ---------------------------------------------------------------------- // Helpers for iterating over common DataType instances for adding kernels to diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 733ab9c0dc287..b99f065a0b158 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -23,6 +23,7 @@ #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" +#include "arrow/type_fwd.h" #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_generate.h" @@ -41,10 +42,17 @@ Status ListValueLength(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou const ArraySpan& arr = batch[0].array; ArraySpan* out_arr = out->array_span_mutable(); auto out_values = out_arr->GetValues(1); - const offset_type* offsets = arr.GetValues(1); - // Offsets are always well-defined and monotonic, even for null values - for (int64_t i = 0; i < arr.length; ++i) { - *out_values++ = offsets[i + 1] - offsets[i]; + if (is_list_view(*arr.type)) { + const auto* sizes = arr.GetValues(2); + if (arr.length > 0) { + memcpy(out_values, sizes, arr.length * sizeof(offset_type)); + } + } else { + const offset_type* offsets = arr.GetValues(1); + // Offsets are always well-defined and monotonic, even for null values + for (int64_t i = 0; i < arr.length; ++i) { + *out_values++ = offsets[i + 1] - offsets[i]; + } } return Status::OK(); } @@ -59,6 +67,30 @@ Status 
FixedSizeListValueLength(KernelContext* ctx, const ExecSpan& batch, return Status::OK(); } +template +void AddListValueLengthKernel(ScalarFunction* func, + const std::shared_ptr& out_type) { + auto in_type = {InputType(InListType::type_id)}; + ScalarKernel kernel(in_type, out_type, ListValueLength); + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +template <> +void AddListValueLengthKernel( + ScalarFunction* func, const std::shared_ptr& out_type) { + auto in_type = {InputType(Type::FIXED_SIZE_LIST)}; + ScalarKernel kernel(in_type, out_type, FixedSizeListValueLength); + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +void AddListValueLengthKernels(ScalarFunction* func) { + AddListValueLengthKernel(func, int32()); + AddListValueLengthKernel(func, int64()); + AddListValueLengthKernel(func, int32()); + AddListValueLengthKernel(func, int64()); + AddListValueLengthKernel(func, int32()); +} + const FunctionDoc list_value_length_doc{ "Compute list lengths", ("`lists` must have a list-like type.\n" @@ -399,6 +431,8 @@ void AddListElementKernels(ScalarFunction* func) { void AddListElementKernels(ScalarFunction* func) { AddListElementKernels(func); AddListElementKernels(func); + AddListElementKernels(func); + AddListElementKernels(func); AddListElementKernels(func); } @@ -824,12 +858,7 @@ const FunctionDoc map_lookup_doc{ void RegisterScalarNested(FunctionRegistry* registry) { auto list_value_length = std::make_shared( "list_value_length", Arity::Unary(), list_value_length_doc); - DCHECK_OK(list_value_length->AddKernel({InputType(Type::LIST)}, int32(), - ListValueLength)); - DCHECK_OK(list_value_length->AddKernel({InputType(Type::FIXED_SIZE_LIST)}, int32(), - FixedSizeListValueLength)); - DCHECK_OK(list_value_length->AddKernel({InputType(Type::LARGE_LIST)}, int64(), - ListValueLength)); + AddListValueLengthKernels(list_value_length.get()); DCHECK_OK(registry->AddFunction(std::move(list_value_length))); auto list_element = diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index a72ec99620b82..32bea8246954d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -30,11 +30,21 @@ namespace arrow { namespace compute { static std::shared_ptr GetOffsetType(const DataType& type) { - return type.id() == Type::LIST ? 
int32() : int64(); + switch (type.id()) { + case Type::LIST: + case Type::LIST_VIEW: + return int32(); + case Type::LARGE_LIST: + case Type::LARGE_LIST_VIEW: + return int64(); + default: + Unreachable("Unexpected type"); + } } TEST(TestScalarNested, ListValueLength) { - for (auto ty : {list(int32()), large_list(int32())}) { + for (auto ty : {list(int32()), large_list(int32()), list_view(int32()), + large_list_view(int32())}) { CheckScalarUnary("list_value_length", ty, "[[0, null, 1], null, [2, 3], []]", GetOffsetType(*ty), "[3, null, 2, 0]"); } @@ -47,7 +57,8 @@ TEST(TestScalarNested, ListValueLength) { TEST(TestScalarNested, ListElementNonFixedListWithNulls) { auto sample = "[[7, 5, 81], [6, null, 4, 7, 8], [3, 12, 2, 0], [1, 9], null]"; for (auto ty : NumericTypes()) { - for (auto list_type : {list(ty), large_list(ty)}) { + for (auto list_type : + {list(ty), large_list(ty), list_view(ty), large_list_view(ty)}) { auto input = ArrayFromJSON(list_type, sample); auto null_input = ArrayFromJSON(list_type, "[null]"); for (auto index_type : IntTypes()) { diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc index 08930e589f7b4..8c77c261c6a98 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested.cc @@ -18,6 +18,7 @@ // Vector kernels involving nested types #include "arrow/array/array_base.h" +#include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" #include "arrow/visit_type_inline.h" @@ -29,8 +30,13 @@ namespace { template Status ListFlatten(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + auto recursive = OptionsWrapper::Get(ctx).recursive; typename TypeTraits::ArrayType list_array(batch[0].array.ToArrayData()); - ARROW_ASSIGN_OR_RAISE(auto result, list_array.Flatten(ctx->memory_pool())); + + auto pool = ctx->memory_pool(); + ARROW_ASSIGN_OR_RAISE(auto result, (recursive ? list_array.FlattenRecursively(pool) + : list_array.Flatten(pool))); + out->value = std::move(result->data()); return Status::OK(); } @@ -107,10 +113,15 @@ struct ListParentIndicesArray { const FunctionDoc list_flatten_doc( "Flatten list values", - ("`lists` must have a list-like type.\n" - "Return an array with the top list level flattened.\n" - "Top-level null values in `lists` do not emit anything in the input."), - {"lists"}); + ("`lists` must have a list-like type (lists, list-views, and\n" + "fixed-size lists).\n" + "Return an array with the top list level flattened unless\n" + "`recursive` is set to true in ListFlattenOptions. 
When that\n" + "is that case, flattening happens recursively until a non-list\n" + "array is formed.\n" + "\n" + "Null list values do not emit anything to the output."), + {"lists"}, "ListFlattenOptions"); const FunctionDoc list_parent_indices_doc( "Compute parent indices of nested list values", @@ -153,17 +164,34 @@ class ListParentIndicesFunction : public MetaFunction { } }; +const ListFlattenOptions* GetDefaultListFlattenOptions() { + static const auto kDefaultListFlattenOptions = ListFlattenOptions::Defaults(); + return &kDefaultListFlattenOptions; +} + +template +void AddBaseListFlattenKernels(VectorFunction* func) { + auto in_type = {InputType(InListType::type_id)}; + auto out_type = OutputType(ListValuesType); + VectorKernel kernel(in_type, out_type, ListFlatten, + OptionsWrapper::Init); + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +void AddBaseListFlattenKernels(VectorFunction* func) { + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); +} + } // namespace void RegisterVectorNested(FunctionRegistry* registry) { - auto flatten = - std::make_shared("list_flatten", Arity::Unary(), list_flatten_doc); - DCHECK_OK(flatten->AddKernel({Type::LIST}, OutputType(ListValuesType), - ListFlatten)); - DCHECK_OK(flatten->AddKernel({Type::FIXED_SIZE_LIST}, OutputType(ListValuesType), - ListFlatten)); - DCHECK_OK(flatten->AddKernel({Type::LARGE_LIST}, OutputType(ListValuesType), - ListFlatten)); + auto flatten = std::make_shared( + "list_flatten", Arity::Unary(), list_flatten_doc, GetDefaultListFlattenOptions()); + AddBaseListFlattenKernels(flatten.get()); DCHECK_OK(registry->AddFunction(std::move(flatten))); DCHECK_OK(registry->AddFunction(std::make_shared())); diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc b/cpp/src/arrow/compute/kernels/vector_nested_test.cc index eef1b6835ffb5..56604ebd16cc0 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested_test.cc @@ -19,6 +19,7 @@ #include "arrow/chunked_array.h" #include "arrow/compute/api.h" +#include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/test_util.h" #include "arrow/result.h" #include "arrow/testing/gtest_util.h" @@ -29,38 +30,113 @@ namespace compute { using arrow::internal::checked_cast; -TEST(TestVectorNested, ListFlatten) { - for (auto ty : {list(int16()), large_list(int16())}) { - auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], []]"); - auto expected = ArrayFromJSON(int16(), "[0, null, 1, 2, 3]"); +using ListAndListViewTypes = + ::testing::Types; + +// ---------------------------------------------------------------------- +// [Large]List and [Large]ListView tests +template +class TestVectorNestedSpecialized : public ::testing::Test { + public: + using TypeClass = T; + + void SetUp() override { + value_type_ = int16(); + type_ = std::make_shared(value_type_); + } + + public: + void TestListFlatten() { + auto input = ArrayFromJSON(type_, "[[0, null, 1], null, [2, 3], []]"); + auto expected = ArrayFromJSON(value_type_, "[0, null, 1, 2, 3]"); CheckVectorUnary("list_flatten", input, expected); // Construct a list with a non-empty null slot auto tweaked = TweakValidityBit(input, 0, false); - expected = ArrayFromJSON(int16(), "[2, 3]"); + expected = ArrayFromJSON(value_type_, "[2, 3]"); CheckVectorUnary("list_flatten", tweaked, expected); } -} -TEST(TestVectorNested, ListFlattenNulls) { - const auto ty = 
list(int32()); - auto input = ArrayFromJSON(ty, "[null, null]"); - auto expected = ArrayFromJSON(int32(), "[]"); - CheckVectorUnary("list_flatten", input, expected); -} + void TestListFlattenNulls() { + value_type_ = int32(); + type_ = std::make_shared(value_type_); + auto input = ArrayFromJSON(type_, "[null, null]"); + auto expected = ArrayFromJSON(value_type_, "[]"); + CheckVectorUnary("list_flatten", input, expected); + } -TEST(TestVectorNested, ListFlattenChunkedArray) { - for (auto ty : {list(int16()), large_list(int16())}) { - ARROW_SCOPED_TRACE(ty->ToString()); - auto input = ChunkedArrayFromJSON(ty, {"[[0, null, 1], null]", "[[2, 3], []]"}); - auto expected = ChunkedArrayFromJSON(int16(), {"[0, null, 1]", "[2, 3]"}); + void TestListFlattenChunkedArray() { + ARROW_SCOPED_TRACE(type_->ToString()); + auto input = ChunkedArrayFromJSON(type_, {"[[0, null, 1], null]", "[[2, 3], []]"}); + auto expected = ChunkedArrayFromJSON(value_type_, {"[0, null, 1]", "[2, 3]"}); CheckVectorUnary("list_flatten", input, expected); ARROW_SCOPED_TRACE("empty"); - input = ChunkedArrayFromJSON(ty, {}); - expected = ChunkedArrayFromJSON(int16(), {}); + input = ChunkedArrayFromJSON(type_, {}); + expected = ChunkedArrayFromJSON(value_type_, {}); CheckVectorUnary("list_flatten", input, expected); } + + void TestListFlattenRecursively() { + auto inner_type = std::make_shared(value_type_); + type_ = std::make_shared(inner_type); + + ListFlattenOptions opts; + opts.recursive = true; + + // List types with two nesting levels: list> + auto input = ArrayFromJSON(type_, R"([ + [[0, 1, 2], null, [3, null]], + [null], + [[2, 9], [4], [], [6, 5]] + ])"); + auto expected = ArrayFromJSON(value_type_, "[0, 1, 2, 3, null, 2, 9, 4, 6, 5]"); + CheckVectorUnary("list_flatten", input, expected, &opts); + + // Empty nested list should flatten until non-list type is reached + input = ArrayFromJSON(type_, R"([null])"); + expected = ArrayFromJSON(value_type_, "[]"); + CheckVectorUnary("list_flatten", input, expected, &opts); + + // List types with three nesting levels: list>> + type_ = std::make_shared(std::make_shared(fixed_size_list(value_type_, 2))); + input = ArrayFromJSON(type_, R"([ + [ + [[null, 0]], + [[3, 7], null] + ], + [ + [[4, null], [5, 8]], + [[8, null]], + null + ], + [ + null + ] + ])"); + expected = ArrayFromJSON(value_type_, "[null, 0, 3, 7, 4, null, 5, 8, 8, null]"); + CheckVectorUnary("list_flatten", input, expected, &opts); + } + + protected: + std::shared_ptr type_; + std::shared_ptr value_type_; +}; + +TYPED_TEST_SUITE(TestVectorNestedSpecialized, ListAndListViewTypes); + +TYPED_TEST(TestVectorNestedSpecialized, ListFlatten) { this->TestListFlatten(); } + +TYPED_TEST(TestVectorNestedSpecialized, ListFlattenNulls) { + this->TestListFlattenNulls(); +} + +TYPED_TEST(TestVectorNestedSpecialized, ListFlattenChunkedArray) { + this->TestListFlattenChunkedArray(); +} + +TYPED_TEST(TestVectorNestedSpecialized, ListFlattenRecursively) { + this->TestListFlattenRecursively(); } TEST(TestVectorNested, ListFlattenFixedSizeList) { @@ -92,6 +168,21 @@ TEST(TestVectorNested, ListFlattenFixedSizeListNulls) { CheckVectorUnary("list_flatten", input, expected); } +TEST(TestVectorNested, ListFlattenFixedSizeListRecursively) { + ListFlattenOptions opts; + opts.recursive = true; + + auto inner_type = fixed_size_list(int32(), 2); + auto type = fixed_size_list(inner_type, 2); + auto input = ArrayFromJSON(type, R"([ + [[0, 1], [null, 3]], + [[7, null], [2, 5]], + [null, null] + ])"); + auto expected = ArrayFromJSON(int32(), "[0, 1, 
null, 3, 7, null, 2, 5]"); + CheckVectorUnary("list_flatten", input, expected, &opts); +} + TEST(TestVectorNested, ListParentIndices) { for (auto ty : {list(int16()), large_list(int16())}) { auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]"); diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index a267d53599436..44a3d5e740701 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2035,6 +2035,26 @@ class PairwiseOptions(_PairwiseOptions): self._set_options(period) +cdef class _ListFlattenOptions(FunctionOptions): + def _set_options(self, recursive): + self.wrapped.reset(new CListFlattenOptions(recursive)) + + +class ListFlattenOptions(_ListFlattenOptions): + """ + Options for `list_flatten` function + + Parameters + ---------- + recursive : bool, default False + When True, the list array is flattened recursively until an array + of non-list values is formed. + """ + + def __init__(self, recursive=False): + self._set_options(recursive) + + cdef class _ArraySortOptions(FunctionOptions): def _set_options(self, order, null_placement): self.wrapped.reset(new CArraySortOptions( diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 60fc09ea861b6..6a11b19ffcdf5 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2141,22 +2141,99 @@ cdef class Decimal256Array(FixedSizeBinaryArray): cdef class BaseListArray(Array): - def flatten(self): + def flatten(self, recursive=False): """ - Unnest this ListArray/LargeListArray by one level. - - The returned Array is logically a concatenation of all the sub-lists - in this Array. + Unnest this [Large]ListArray/[Large]ListViewArray/FixedSizeListArray + according to 'recursive'. Note that this method is different from ``self.values`` in that it takes care of the slicing offset as well as null elements backed by non-empty sub-lists. + Parameters + ---------- + recursive : bool, default False, optional + When True, flatten this logical list-array recursively until an + array of non-list values is formed. + + When False, flatten only the top level. + Returns ------- result : Array + + Examples + -------- + + Basic logical list-array's flatten + >>> import pyarrow as pa + >>> values = [1, 2, 3, 4] + >>> offsets = [2, 1, 0] + >>> sizes = [2, 2, 2] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 3, + 4 + ], + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] + >>> array.flatten() + + [ + 3, + 4, + 2, + 3, + 1, + 2 + ] + + When recursive=True, nested list arrays are flattened recursively + until an array of non-list values is formed. + + >>> array = pa.array([ + ... None, + ... [ + ... [1, None, 2], + ... None, + ... [3, 4] + ... ], + ... [], + ... [ + ... [], + ... [5, 6], + ... None + ... ], + ... [ + ... [7, 8] + ... ] + ... ], type=pa.list_(pa.list_(pa.int64()))) + >>> array.flatten(True) + + [ + 1, + null, + 2, + 3, + 4, + 5, + 6, + 7, + 8 + ] """ - return _pc().list_flatten(self) + options = _pc().ListFlattenOptions(recursive) + return _pc().list_flatten(self, options=options) def value_parent_indices(self): """ @@ -2527,7 +2604,7 @@ cdef class LargeListArray(BaseListArray): return pyarrow_wrap_array(( self.ap).offsets()) -cdef class ListViewArray(Array): +cdef class ListViewArray(BaseListArray): """ Concrete class for Arrow arrays of a list view data type. 
""" @@ -2747,69 +2824,8 @@ cdef class ListViewArray(Array): """ return pyarrow_wrap_array(( self.ap).sizes()) - def flatten(self, memory_pool=None): - """ - Unnest this ListViewArray by one level. - - The returned Array is logically a concatenation of all the sub-lists - in this Array. - - Note that this method is different from ``self.values`` in that - it takes care of the slicing offset as well as null elements backed - by non-empty sub-lists. - - Parameters - ---------- - memory_pool : MemoryPool, optional - - Returns - ------- - result : Array - - Examples - -------- - >>> import pyarrow as pa - >>> values = [1, 2, 3, 4] - >>> offsets = [2, 1, 0] - >>> sizes = [2, 2, 2] - >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) - >>> array - - [ - [ - 3, - 4 - ], - [ - 2, - 3 - ], - [ - 1, - 2 - ] - ] - >>> array.flatten() - - [ - 3, - 4, - 2, - 3, - 1, - 2 - ] - """ - cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) - with nogil: - out = GetResultValue(( self.ap).Flatten(cpool)) - cdef Array result = pyarrow_wrap_array(out) - result.validate() - return result - - -cdef class LargeListViewArray(Array): +cdef class LargeListViewArray(BaseListArray): """ Concrete class for Arrow arrays of a large list view data type. @@ -3037,67 +3053,6 @@ cdef class LargeListViewArray(Array): """ return pyarrow_wrap_array(( self.ap).sizes()) - def flatten(self, memory_pool=None): - """ - Unnest this LargeListViewArray by one level. - - The returned Array is logically a concatenation of all the sub-lists - in this Array. - - Note that this method is different from ``self.values`` in that - it takes care of the slicing offset as well as null elements backed - by non-empty sub-lists. - - Parameters - ---------- - memory_pool : MemoryPool, optional - - Returns - ------- - result : Array - - Examples - -------- - - >>> import pyarrow as pa - >>> values = [1, 2, 3, 4] - >>> offsets = [2, 1, 0] - >>> sizes = [2, 2, 2] - >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) - >>> array - - [ - [ - 3, - 4 - ], - [ - 2, - 3 - ], - [ - 1, - 2 - ] - ] - >>> array.flatten() - - [ - 3, - 4, - 2, - 3, - 1, - 2 - ] - """ - cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) - with nogil: - out = GetResultValue(( self.ap).Flatten(cpool)) - cdef Array result = pyarrow_wrap_array(out) - result.validate() - return result - cdef class MapArray(ListArray): """ diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 205ab393b8b09..83612f66d21e2 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -44,6 +44,7 @@ IndexOptions, JoinOptions, ListSliceOptions, + ListFlattenOptions, MakeStructOptions, MapLookupOptions, MatchSubstringOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 6dae45ab80b1c..f461513e8b3cf 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2589,6 +2589,11 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: CPairwiseOptions(int64_t period) int64_t period + cdef cppclass CListFlattenOptions\ + "arrow::compute::ListFlattenOptions"(CFunctionOptions): + CListFlattenOptions(c_bool recursive) + c_bool recursive + cdef cppclass CArraySortOptions \ "arrow::compute::ArraySortOptions"(CFunctionOptions): CArraySortOptions(CSortOrder, CNullPlacement) diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index b1187a77c2a6e..bfd266a807c40 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ 
-437,11 +437,11 @@ cdef class LargeListArray(BaseListArray): pass -cdef class ListViewArray(Array): +cdef class ListViewArray(BaseListArray): pass -cdef class LargeListViewArray(Array): +cdef class LargeListViewArray(BaseListArray): pass diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 156d58326b961..6a190957879d3 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2757,6 +2757,7 @@ def test_list_array_flatten(offset_type, list_type_factory): assert arr1.values.equals(arr0) assert arr2.flatten().flatten().equals(arr0) assert arr2.values.values.equals(arr0) + assert arr2.flatten(True).equals(arr0) @pytest.mark.parametrize('list_type', [ @@ -2778,7 +2779,9 @@ def test_list_value_parent_indices(list_type): @pytest.mark.parametrize(('offset_type', 'list_type'), [(pa.int32(), pa.list_(pa.int32())), (pa.int32(), pa.list_(pa.int32(), list_size=2)), - (pa.int64(), pa.large_list(pa.int32()))]) + (pa.int64(), pa.large_list(pa.int32())), + (pa.int32(), pa.list_view(pa.int32())), + (pa.int64(), pa.large_list_view(pa.int32()))]) def test_list_value_lengths(offset_type, list_type): # FixedSizeListArray needs fixed list sizes @@ -2876,6 +2879,8 @@ def test_fixed_size_list_array_flatten(): assert arr0.type.equals(typ0) assert arr1.flatten().equals(arr0) assert arr2.flatten().flatten().equals(arr0) + assert arr2.flatten().equals(arr1) + assert arr2.flatten(True).equals(arr0) def test_fixed_size_list_array_flatten_with_slice(): @@ -3844,6 +3849,7 @@ def test_list_view_flatten(list_array_type, list_type_factory, offset_type): assert arr2.values.equals(arr1) assert arr2.flatten().flatten().equals(arr0) assert arr2.values.values.equals(arr0) + assert arr2.flatten(True).equals(arr0) # test out of order offsets values = [1, 2, 3, 4] diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 98cbd920b509b..17cc546f834ca 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -152,6 +152,7 @@ def test_option_class_equality(): pc.IndexOptions(pa.scalar(1)), pc.JoinOptions(), pc.ListSliceOptions(0, -1, 1, True), + pc.ListFlattenOptions(recursive=False), pc.MakeStructOptions(["field", "names"], field_nullability=[True, True], field_metadata=[pa.KeyValueMetadata({"a": "1"}), From 0d7fac0d49eae7f139735c3e7c9256fc304a698a Mon Sep 17 00:00:00 2001 From: ZhangHuiGui <106943008+ZhangHuiGui@users.noreply.github.com> Date: Wed, 1 May 2024 06:26:05 +0800 Subject: [PATCH 016/105] GH-41418: [C++] Add [Large]ListView and Map nested types for scalar_if_else's kernel functions (#41419) ### Rationale for this change Add [Large]ListView and Map nested types for scalar_if_else's kernel functions ### What changes are included in this PR? 1. Add the list-view related types to `case_when`, `coalesce`'s kernel function and move the nested-types's added logic to a unified function for better management. 2. Add the `MapType` and related test for `if_else` ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * GitHub Issue: #41418 Authored-by: ZhangHuiGui <2689496754@qq.com> Signed-off-by: Felipe Oliveira Carvalho --- .../arrow/compute/kernels/scalar_if_else.cc | 107 ++++++++++++++---- .../kernels/scalar_if_else_benchmark.cc | 50 +++++--- .../compute/kernels/scalar_if_else_test.cc | 19 +++- 3 files changed, 138 insertions(+), 38 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index ee181c053c053..13874d9d65e70 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1309,9 +1309,10 @@ void AddFixedWidthIfElseKernel(const std::shared_ptr& scalar_fun } void AddNestedIfElseKernels(const std::shared_ptr& scalar_function) { - for (const auto type_id : {Type::LIST, Type::LARGE_LIST, Type::LIST_VIEW, - Type::LARGE_LIST_VIEW, Type::FIXED_SIZE_LIST, Type::STRUCT, - Type::DENSE_UNION, Type::SPARSE_UNION, Type::DICTIONARY}) { + for (const auto type_id : + {Type::LIST, Type::LARGE_LIST, Type::LIST_VIEW, Type::LARGE_LIST_VIEW, + Type::FIXED_SIZE_LIST, Type::MAP, Type::STRUCT, Type::DENSE_UNION, + Type::SPARSE_UNION, Type::DICTIONARY}) { ScalarKernel kernel({boolean(), InputType(type_id), InputType(type_id)}, LastType, NestedIfElseExec::Exec); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; @@ -1847,6 +1848,48 @@ struct CaseWhenFunctor> { } }; +// TODO(GH-41453): a more efficient implementation for list-views is possible +template +struct CaseWhenFunctor> { + using offset_type = typename Type::offset_type; + using BuilderType = typename TypeTraits::BuilderType; + static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + /// TODO(wesm): should this be a DCHECK? Or checked elsewhere + if (batch[0].null_count() > 0) { + return Status::Invalid("cond struct must not have outer nulls"); + } + if (batch[0].is_scalar()) { + return ExecVarWidthScalarCaseWhen(ctx, batch, out); + } + return ExecArray(ctx, batch, out); + } + + static Status ExecArray(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + return ExecVarWidthArrayCaseWhen( + ctx, batch, out, + // ReserveData + [&](ArrayBuilder* raw_builder) { + auto builder = checked_cast(raw_builder); + auto child_builder = builder->value_builder(); + + int64_t reservation = 0; + for (int arg = 1; arg < batch.num_values(); arg++) { + const ExecValue& source = batch[arg]; + if (!source.is_array()) { + const auto& scalar = checked_cast(*source.scalar); + if (!scalar.value) continue; + reservation = + std::max(reservation, batch.length * scalar.value->length()); + } else { + const ArraySpan& array = source.array; + reservation = std::max(reservation, array.child_data[0].length); + } + } + return child_builder->Reserve(reservation); + }); + } +}; + // No-op reserve function, pulled out to avoid apparent miscompilation on MinGW Status ReserveNoData(ArrayBuilder*) { return Status::OK(); } @@ -2712,6 +2755,25 @@ void AddBinaryCaseWhenKernels(const std::shared_ptr& scalar_fu } } +template +void AddNestedCaseWhenKernel(const std::shared_ptr& scalar_function) { + AddCaseWhenKernel(scalar_function, ArrowNestedType::type_id, + CaseWhenFunctor::Exec); +} + +void AddNestedCaseWhenKernels(const std::shared_ptr& scalar_function) { + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + 
AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); +} + void AddCoalesceKernel(const std::shared_ptr& scalar_function, detail::GetTypeId get_id, ArrayKernelExec exec) { ScalarKernel kernel(KernelSignature::Make({InputType(get_id.id)}, FirstType, @@ -2731,6 +2793,25 @@ void AddPrimitiveCoalesceKernels(const std::shared_ptr& scalar_f } } +template +void AddNestedCoalesceKernel(const std::shared_ptr& scalar_function) { + AddCoalesceKernel(scalar_function, ArrowNestedType::type_id, + CoalesceFunctor::Exec); +} + +void AddNestedCoalesceKernels(const std::shared_ptr& scalar_function) { + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); +} + void AddChooseKernel(const std::shared_ptr& scalar_function, detail::GetTypeId get_id, ArrayKernelExec exec) { ScalarKernel kernel(KernelSignature::Make({Type::INT64, InputType(get_id.id)}, LastType, @@ -2822,15 +2903,7 @@ void RegisterScalarIfElse(FunctionRegistry* registry) { AddCaseWhenKernel(func, Type::DECIMAL128, CaseWhenFunctor::Exec); AddCaseWhenKernel(func, Type::DECIMAL256, CaseWhenFunctor::Exec); AddBinaryCaseWhenKernels(func, BaseBinaryTypes()); - AddCaseWhenKernel(func, Type::FIXED_SIZE_LIST, - CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::LIST, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::LARGE_LIST, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::MAP, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::STRUCT, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::DENSE_UNION, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::SPARSE_UNION, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::DICTIONARY, CaseWhenFunctor::Exec); + AddNestedCaseWhenKernels(func); DCHECK_OK(registry->AddFunction(std::move(func))); } { @@ -2848,15 +2921,7 @@ void RegisterScalarIfElse(FunctionRegistry* registry) { for (const auto& ty : BaseBinaryTypes()) { AddCoalesceKernel(func, ty, GenerateTypeAgnosticVarBinaryBase(ty)); } - AddCoalesceKernel(func, Type::FIXED_SIZE_LIST, - CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::LIST, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::LARGE_LIST, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::MAP, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::STRUCT, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::DENSE_UNION, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::SPARSE_UNION, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::DICTIONARY, CoalesceFunctor::Exec); + AddNestedCoalesceKernels(func); DCHECK_OK(registry->AddFunction(std::move(func))); } { diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc index 58bc560f52842..5988908853d50 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc @@ -284,8 +284,11 @@ static void CaseWhenBench(benchmark::State& state) { state.SetItemsProcessed(state.iterations() * (len - offset)); } -static void CaseWhenBenchList(benchmark::State& 
state) { - auto type = list(int64()); +template +static void CaseWhenBenchList(benchmark::State& state, + const std::shared_ptr& type) { + using ArrayType = typename TypeTraits::ArrayType; + auto fld = field("", type); int64_t len = state.range(0); @@ -295,17 +298,17 @@ static void CaseWhenBenchList(benchmark::State& state) { auto cond_field = field("cond", boolean(), key_value_metadata({{"null_probability", "0.01"}})); - auto cond = rand.ArrayOf(*field("", struct_({cond_field, cond_field, cond_field}), - key_value_metadata({{"null_probability", "0.0"}})), - len); - auto val1 = rand.ArrayOf(*fld, len); - auto val2 = rand.ArrayOf(*fld, len); - auto val3 = rand.ArrayOf(*fld, len); - auto val4 = rand.ArrayOf(*fld, len); + auto cond = std::static_pointer_cast( + rand.ArrayOf(*field("", struct_({cond_field, cond_field, cond_field}), + key_value_metadata({{"null_probability", "0.0"}})), + len)) + ->Slice(offset); + auto val1 = std::static_pointer_cast(rand.ArrayOf(*fld, len))->Slice(offset); + auto val2 = std::static_pointer_cast(rand.ArrayOf(*fld, len))->Slice(offset); + auto val3 = std::static_pointer_cast(rand.ArrayOf(*fld, len))->Slice(offset); + auto val4 = std::static_pointer_cast(rand.ArrayOf(*fld, len))->Slice(offset); for (auto _ : state) { - ABORT_NOT_OK( - CaseWhen(cond->Slice(offset), {val1->Slice(offset), val2->Slice(offset), - val3->Slice(offset), val4->Slice(offset)})); + ABORT_NOT_OK(CaseWhen(cond, {val1, val2, val3, val4})); } // Set bytes processed to ~length of output @@ -372,6 +375,21 @@ static void CaseWhenBenchStringContiguous(benchmark::State& state) { return CaseWhenBenchContiguous(state); } +template +static void CaseWhenBenchVarLengthListLike(benchmark::State& state) { + auto value_type = TypeTraits::type_singleton(); + auto list_type = std::make_shared(value_type); + return CaseWhenBenchList(state, list_type); +} + +static void CaseWhenBenchListInt64(benchmark::State& state) { + return CaseWhenBenchVarLengthListLike(state); +} + +static void CaseWhenBenchListViewInt64(benchmark::State& state) { + CaseWhenBenchVarLengthListLike(state); +} + struct CoalesceParams { int64_t length; int64_t num_arguments; @@ -533,9 +551,11 @@ BENCHMARK(CaseWhenBench64)->Args({kNumItems, 99}); BENCHMARK(CaseWhenBench64Contiguous)->Args({kNumItems, 0}); BENCHMARK(CaseWhenBench64Contiguous)->Args({kNumItems, 99}); -// CaseWhen: Lists -BENCHMARK(CaseWhenBenchList)->Args({kFewItems, 0}); -BENCHMARK(CaseWhenBenchList)->Args({kFewItems, 99}); +// CaseWhen: List-like types +BENCHMARK(CaseWhenBenchListInt64)->Args({kFewItems, 0}); +BENCHMARK(CaseWhenBenchListInt64)->Args({kFewItems, 99}); +BENCHMARK(CaseWhenBenchListViewInt64)->Args({kFewItems, 0}); +BENCHMARK(CaseWhenBenchListViewInt64)->Args({kFewItems, 99}); // CaseWhen: Strings BENCHMARK(CaseWhenBenchString)->Args({kFewItems, 0}); diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index c4c46b5efe84d..9a0ca325277dc 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -896,6 +896,21 @@ TEST_F(TestIfElseKernel, ParameterizedTypes) { {cond, ArrayFromJSON(type0, "[0]"), ArrayFromJSON(type1, "[1]")})); } +TEST_F(TestIfElseKernel, MapNested) { + auto type = map(int64(), utf8()); + CheckWithDifferentShapes( + ArrayFromJSON(boolean(), "[true, true, false, false]"), + ArrayFromJSON(type, R"([null, [[2, "foo"], [4, null]], [[3, "test"]], []])"), + ArrayFromJSON(type, R"([[[1, "b"]], [[2, "c"]], [[7, "abc"]], 
null])"), + ArrayFromJSON(type, R"([null, [[2, "foo"], [4, null]], [[7, "abc"]], null])")); + + CheckWithDifferentShapes( + ArrayFromJSON(boolean(), "[null, null, null, null]"), + ArrayFromJSON(type, R"([null, [[1, "c"]], [[4, null]], [[6, "ok"]]])"), + ArrayFromJSON(type, R"([[[-1, null]], [[3, "c"]], null, [[6, "ok"]]])"), + ArrayFromJSON(type, R"([null, null, null, null])")); +} + template class TestIfElseUnion : public ::testing::Test {}; @@ -1920,7 +1935,7 @@ TYPED_TEST(TestCaseWhenBinary, Random) { template class TestCaseWhenList : public ::testing::Test {}; -TYPED_TEST_SUITE(TestCaseWhenList, ListArrowTypes); +TYPED_TEST_SUITE(TestCaseWhenList, ListAndListViewArrowTypes); TYPED_TEST(TestCaseWhenList, ListOfString) { auto type = std::make_shared(utf8()); @@ -2555,7 +2570,7 @@ class TestCoalesceList : public ::testing::Test {}; TYPED_TEST_SUITE(TestCoalesceNumeric, IfElseNumericBasedTypes); TYPED_TEST_SUITE(TestCoalesceBinary, BaseBinaryArrowTypes); -TYPED_TEST_SUITE(TestCoalesceList, ListArrowTypes); +TYPED_TEST_SUITE(TestCoalesceList, ListAndListViewArrowTypes); TYPED_TEST(TestCoalesceNumeric, Basics) { auto type = default_type_instance(); From 6b278be178975fe7174b961a3bf33502acb79295 Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Tue, 30 Apr 2024 16:50:14 -0700 Subject: [PATCH 017/105] GH-41471: [Java] Fix performance uber-jar (#41473) ### Rationale for this change Performance `benchmarks.jar` uber-jar is mostly empty and is missing critical metadata information which would allow someone to run performance benchmarks using a simple commandline like: ``` $ java -cp performance/target/benchmarks.jar ArrowBufBenchmarks ``` ### What changes are included in this PR? Move benchmark classes from `src/test/java` to `src/main/java` and change the dependencies' scope as well so that `maven-shade-plugin` can actually pick up the classes to package. Also add missing jmh annotation generator to `maven-compiler-plugin` so that JMH metadata can be generated ### Are these changes tested? Local testing only. ### Are there any user-facing changes? I didn't find any user-facing documentation regarding JMH benchmarks. 
If there are some, it may be helpful to include a simplified command line * GitHub Issue: #41471 Authored-by: Laurent Goujon Signed-off-by: David Li --- java/performance/pom.xml | 30 +++++++++---------- .../arrow/adapter/AvroAdapterBenchmarks.java | 0 .../adapter/jdbc/JdbcAdapterBenchmarks.java | 2 ++ .../search/ParallelSearcherBenchmarks.java | 2 ++ .../arrow/memory/AllocatorBenchmarks.java | 0 .../arrow/memory/ArrowBufBenchmarks.java | 0 .../util/ArrowBufPointerBenchmarks.java | 0 .../util/ByteFunctionHelpersBenchmarks.java | 3 +- .../vector/BaseValueVectorBenchmarks.java | 0 .../vector/BitVectorHelperBenchmarks.java | 2 ++ .../arrow/vector/DecimalVectorBenchmarks.java | 0 .../apache/arrow/vector/Float8Benchmarks.java | 2 ++ .../arrow/vector/FloatingPointBenchmarks.java | 2 ++ .../apache/arrow/vector/IntBenchmarks.java | 2 ++ .../arrow/vector/VarCharBenchmarks.java | 2 ++ .../vector/VariableWidthVectorBenchmarks.java | 2 ++ .../arrow/vector/VectorLoaderBenchmark.java | 2 ++ .../arrow/vector/VectorUnloaderBenchmark.java | 2 ++ .../DictionaryEncoderBenchmarks.java | 0 .../vector/ipc/WriteChannelBenchmark.java | 2 ++ .../message/ArrowRecordBatchBenchmarks.java | 2 ++ .../vector/util/TransferPairBenchmarks.java | 2 ++ 22 files changed, 43 insertions(+), 16 deletions(-) rename java/performance/src/{test => main}/java/org/apache/arrow/adapter/AvroAdapterBenchmarks.java (100%) rename java/performance/src/{test => main}/java/org/apache/arrow/adapter/jdbc/JdbcAdapterBenchmarks.java (99%) rename java/performance/src/{test => main}/java/org/apache/arrow/algorithm/search/ParallelSearcherBenchmarks.java (97%) rename java/performance/src/{test => main}/java/org/apache/arrow/memory/AllocatorBenchmarks.java (100%) rename java/performance/src/{test => main}/java/org/apache/arrow/memory/ArrowBufBenchmarks.java (100%) rename java/performance/src/{test => main}/java/org/apache/arrow/memory/util/ArrowBufPointerBenchmarks.java (100%) rename java/performance/src/{test => main}/java/org/apache/arrow/memory/util/ByteFunctionHelpersBenchmarks.java (98%) rename java/performance/src/{test => main}/java/org/apache/arrow/vector/BaseValueVectorBenchmarks.java (100%) rename java/performance/src/{test => main}/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java (98%) rename java/performance/src/{test => main}/java/org/apache/arrow/vector/DecimalVectorBenchmarks.java (100%) rename java/performance/src/{test => main}/java/org/apache/arrow/vector/Float8Benchmarks.java (97%) rename java/performance/src/{test => main}/java/org/apache/arrow/vector/FloatingPointBenchmarks.java (98%) rename java/performance/src/{test => main}/java/org/apache/arrow/vector/IntBenchmarks.java (97%) rename java/performance/src/{test => main}/java/org/apache/arrow/vector/VarCharBenchmarks.java (97%) rename java/performance/src/{test => main}/java/org/apache/arrow/vector/VariableWidthVectorBenchmarks.java (97%) rename java/performance/src/{test => main}/java/org/apache/arrow/vector/VectorLoaderBenchmark.java (97%) rename java/performance/src/{test => main}/java/org/apache/arrow/vector/VectorUnloaderBenchmark.java (97%) rename java/performance/src/{test => main}/java/org/apache/arrow/vector/dictionary/DictionaryEncoderBenchmarks.java (100%) rename java/performance/src/{test => main}/java/org/apache/arrow/vector/ipc/WriteChannelBenchmark.java (97%) rename java/performance/src/{test => main}/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java (97%) rename java/performance/src/{test => 
main}/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java (97%) diff --git a/java/performance/pom.xml b/java/performance/pom.xml index c819e6393d78f..e9023ece080a3 100644 --- a/java/performance/pom.xml +++ b/java/performance/pom.xml @@ -40,61 +40,61 @@ org.openjdk.jmh jmh-core ${jmh.version} - test - - - org.openjdk.jmh - jmh-generator-annprocess - ${jmh.version} - provided org.apache.arrow arrow-vector ${arrow.vector.classifier} - test org.apache.arrow arrow-memory-core - test org.apache.arrow arrow-memory-netty - test + runtime org.apache.avro avro ${dep.avro.version} - test org.apache.arrow arrow-avro - test com.h2database h2 2.2.224 - test + runtime org.apache.arrow arrow-jdbc - test org.apache.arrow arrow-algorithm - test + + org.apache.maven.plugins + maven-compiler-plugin + + + + org.openjdk.jmh + jmh-generator-annprocess + ${jmh.version} + + + + org.apache.maven.plugins maven-shade-plugin diff --git a/java/performance/src/test/java/org/apache/arrow/adapter/AvroAdapterBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/adapter/AvroAdapterBenchmarks.java similarity index 100% rename from java/performance/src/test/java/org/apache/arrow/adapter/AvroAdapterBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/adapter/AvroAdapterBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/adapter/jdbc/JdbcAdapterBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/adapter/jdbc/JdbcAdapterBenchmarks.java similarity index 99% rename from java/performance/src/test/java/org/apache/arrow/adapter/jdbc/JdbcAdapterBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/adapter/jdbc/JdbcAdapterBenchmarks.java index fd3940b4c872c..f6dab83b7cd0c 100644 --- a/java/performance/src/test/java/org/apache/arrow/adapter/jdbc/JdbcAdapterBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/adapter/jdbc/JdbcAdapterBenchmarks.java @@ -54,6 +54,7 @@ * Benchmarks for Jdbc adapter. */ public class JdbcAdapterBenchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VALUE_COUNT = 3000; @@ -355,5 +356,6 @@ public static void main(String[] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/algorithm/search/ParallelSearcherBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/algorithm/search/ParallelSearcherBenchmarks.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/algorithm/search/ParallelSearcherBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/algorithm/search/ParallelSearcherBenchmarks.java index 1c3af77e73a05..c9fc5cc4bef9c 100644 --- a/java/performance/src/test/java/org/apache/arrow/algorithm/search/ParallelSearcherBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/algorithm/search/ParallelSearcherBenchmarks.java @@ -43,6 +43,7 @@ * Benchmarks for {@link ParallelSearcher}. 
*/ public class ParallelSearcherBenchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_LENGTH = 1024 * 1024; @@ -112,4 +113,5 @@ public static void main(String[] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/memory/AllocatorBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/memory/AllocatorBenchmarks.java similarity index 100% rename from java/performance/src/test/java/org/apache/arrow/memory/AllocatorBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/memory/AllocatorBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/memory/ArrowBufBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/memory/ArrowBufBenchmarks.java similarity index 100% rename from java/performance/src/test/java/org/apache/arrow/memory/ArrowBufBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/memory/ArrowBufBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/memory/util/ArrowBufPointerBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/memory/util/ArrowBufPointerBenchmarks.java similarity index 100% rename from java/performance/src/test/java/org/apache/arrow/memory/util/ArrowBufPointerBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/memory/util/ArrowBufPointerBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/memory/util/ByteFunctionHelpersBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpersBenchmarks.java similarity index 98% rename from java/performance/src/test/java/org/apache/arrow/memory/util/ByteFunctionHelpersBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpersBenchmarks.java index 4d0dfcb5da80d..f1dc2d79eff83 100644 --- a/java/performance/src/test/java/org/apache/arrow/memory/util/ByteFunctionHelpersBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpersBenchmarks.java @@ -48,7 +48,7 @@ public class ByteFunctionHelpersBenchmarks { */ @State(Scope.Benchmark) public static class ArrowEqualState { - + // checkstyle:off: MissingJavadocMethod private static final int BUFFER_CAPACITY = 7; private BufferAllocator allocator; @@ -135,4 +135,5 @@ public static void main(String[] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/BaseValueVectorBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/BaseValueVectorBenchmarks.java similarity index 100% rename from java/performance/src/test/java/org/apache/arrow/vector/BaseValueVectorBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/BaseValueVectorBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java similarity index 98% rename from java/performance/src/test/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java index 5f6e5ca28fbab..e29b889c6e7a8 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java +++ 
b/java/performance/src/main/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java @@ -41,6 +41,7 @@ * Benchmarks for {@link BitVectorHelper}. */ public class BitVectorHelperBenchmarks { + // checkstyle:off: MissingJavadocMethod /** * State object for general benchmarks. @@ -226,4 +227,5 @@ public static void main(String [] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/DecimalVectorBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/DecimalVectorBenchmarks.java similarity index 100% rename from java/performance/src/test/java/org/apache/arrow/vector/DecimalVectorBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/DecimalVectorBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/vector/Float8Benchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/Float8Benchmarks.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/Float8Benchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/Float8Benchmarks.java index 874e0d9f82ee7..36a633e5e1b6e 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/Float8Benchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/Float8Benchmarks.java @@ -40,6 +40,7 @@ */ @State(Scope.Benchmark) public class Float8Benchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_LENGTH = 1024; @@ -119,4 +120,5 @@ public static void main(String [] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/FloatingPointBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/FloatingPointBenchmarks.java similarity index 98% rename from java/performance/src/test/java/org/apache/arrow/vector/FloatingPointBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/FloatingPointBenchmarks.java index 079672e9f2a98..2938591737f06 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/FloatingPointBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/FloatingPointBenchmarks.java @@ -41,6 +41,7 @@ */ @State(Scope.Benchmark) public class FloatingPointBenchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_LENGTH = 1024; @@ -130,5 +131,6 @@ public static void main(String [] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/IntBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/IntBenchmarks.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/IntBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/IntBenchmarks.java index 036768d445e55..99674058970a6 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/IntBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/IntBenchmarks.java @@ -41,6 +41,7 @@ */ @State(Scope.Benchmark) public class IntBenchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_LENGTH = 1024; @@ -107,4 +108,5 @@ public static void main(String [] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git 
a/java/performance/src/test/java/org/apache/arrow/vector/VarCharBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/VarCharBenchmarks.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/VarCharBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/VarCharBenchmarks.java index 1ab4b7bc20dad..a7ce4e04fee87 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/VarCharBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/VarCharBenchmarks.java @@ -39,6 +39,7 @@ */ @State(Scope.Benchmark) public class VarCharBenchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_LENGTH = 1024; @@ -99,4 +100,5 @@ public static void main(String [] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/VariableWidthVectorBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/VariableWidthVectorBenchmarks.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/VariableWidthVectorBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/VariableWidthVectorBenchmarks.java index 7eee981f13327..62c54606e6da6 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/VariableWidthVectorBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/VariableWidthVectorBenchmarks.java @@ -41,6 +41,7 @@ */ @State(Scope.Benchmark) public class VariableWidthVectorBenchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_CAPACITY = 16 * 1024; @@ -127,4 +128,5 @@ public static void main(String [] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/VectorLoaderBenchmark.java b/java/performance/src/main/java/org/apache/arrow/vector/VectorLoaderBenchmark.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/VectorLoaderBenchmark.java rename to java/performance/src/main/java/org/apache/arrow/vector/VectorLoaderBenchmark.java index 416d126419e56..e8e8c0cfbc1f3 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/VectorLoaderBenchmark.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/VectorLoaderBenchmark.java @@ -40,6 +40,7 @@ * Benchmarks for {@link VectorLoader}. 
*/ public class VectorLoaderBenchmark { + // checkstyle:off: MissingJavadocMethod private static final int ALLOCATOR_CAPACITY = 1024 * 1024; @@ -114,4 +115,5 @@ public static void main(String[] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/VectorUnloaderBenchmark.java b/java/performance/src/main/java/org/apache/arrow/vector/VectorUnloaderBenchmark.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/VectorUnloaderBenchmark.java rename to java/performance/src/main/java/org/apache/arrow/vector/VectorUnloaderBenchmark.java index d125172450004..b464f888fa85f 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/VectorUnloaderBenchmark.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/VectorUnloaderBenchmark.java @@ -41,6 +41,7 @@ */ @State(Scope.Benchmark) public class VectorUnloaderBenchmark { + // checkstyle:off: MissingJavadocMethod private static final int ALLOCATOR_CAPACITY = 1024 * 1024; @@ -106,4 +107,5 @@ public static void main(String[] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncoderBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoderBenchmarks.java similarity index 100% rename from java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncoderBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoderBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/vector/ipc/WriteChannelBenchmark.java b/java/performance/src/main/java/org/apache/arrow/vector/ipc/WriteChannelBenchmark.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/ipc/WriteChannelBenchmark.java rename to java/performance/src/main/java/org/apache/arrow/vector/ipc/WriteChannelBenchmark.java index 7a2537cbb8820..18efff11db9ff 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/ipc/WriteChannelBenchmark.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/ipc/WriteChannelBenchmark.java @@ -41,6 +41,7 @@ * Benchmarks for {@link WriteChannel}. */ public class WriteChannelBenchmark { + // checkstyle:off: MissingJavadocMethod /** * State object for align benchmark. 
@@ -84,4 +85,5 @@ public static void main(String[] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java index c0882821e9cc4..b608bb4c1c590 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java @@ -42,6 +42,7 @@ */ @State(Scope.Benchmark) public class ArrowRecordBatchBenchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_CAPACITY = 16 * 1024; @@ -95,4 +96,5 @@ public static void main(String [] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java index 5142f4bdb8d0d..486862859f122 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java @@ -42,6 +42,7 @@ */ @State(Scope.Benchmark) public class TransferPairBenchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_LENGTH = 1024; @@ -120,4 +121,5 @@ public static void main(String [] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } From 0f7e9af43796a81d126c59ee1342c6dbf8efaf08 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Tue, 30 Apr 2024 17:27:26 -0800 Subject: [PATCH 018/105] GH-39990: [Docs][CI] Add sphinx-lint for docs linting (#40022) ### What changes are included in this PR? This adds developer tooling to the repo for linting the docs by adding the sphinx-lint tool to archery and our pre-commit hooks. In both locations, only two rules are enabled at the moment (Discussed in https://github.com/apache/arrow/pull/40006): `trailing-whitespace` and `missing-final-newline`. This PR also fixes the individual issues covered by the new tooling. ### Are these changes tested? Yes, though manually. I tested this works by running `archery lint --docs` and `pre-commit` without and without changes that should get caught by the rules. It works as expected. ### Are there any user-facing changes? Yes, 1. Developers that use pre-commit hooks will see a change in behavior when they modify docs 2. Developers using archery will see a new --docs option in `archery lint` 3. 
Developers working on the docs may see CI failures related to the new checks * Closes: #39990 * GitHub Issue: #39990 Authored-by: Bryce Mecum Signed-off-by: Bryce Mecum --- .pre-commit-config.yaml | 6 ++ ci/conda_env_sphinx.txt | 1 + dev/archery/archery/cli.py | 6 +- dev/archery/archery/utils/lint.py | 52 ++++++++++++- dev/archery/setup.py | 2 +- docs/requirements.txt | 1 + docs/source/cpp/acero/developer_guide.rst | 6 +- docs/source/cpp/acero/overview.rst | 8 +- docs/source/cpp/acero/substrait.rst | 46 ++++++------ docs/source/cpp/acero/user_guide.rst | 74 +++++++++---------- docs/source/cpp/api/scalar.rst | 2 +- docs/source/cpp/build_system.rst | 2 +- docs/source/cpp/compute.rst | 56 +++++++------- docs/source/cpp/dataset.rst | 22 +++--- docs/source/cpp/datatypes.rst | 14 ++-- .../examples/compute_and_write_example.rst | 6 +- docs/source/cpp/flight.rst | 4 +- docs/source/cpp/gandiva.rst | 26 +++---- .../cpp/gandiva/expr_projector_filter.rst | 26 +++---- docs/source/cpp/gandiva/external_func.rst | 14 ++-- docs/source/cpp/getting_started.rst | 12 ++- docs/source/cpp/memory.rst | 30 ++++---- docs/source/cpp/parquet.rst | 34 ++++----- docs/source/cpp/tables.rst | 6 +- docs/source/cpp/threading.rst | 2 +- .../source/cpp/tutorials/compute_tutorial.rst | 12 +-- .../cpp/tutorials/datasets_tutorial.rst | 30 ++++---- docs/source/cpp/tutorials/io_tutorial.rst | 20 ++--- .../continuous_integration/index.rst | 2 +- docs/source/developers/cpp/building.rst | 10 +-- docs/source/developers/cpp/windows.rst | 4 +- .../guide/architectural_overview.rst | 4 +- .../source/developers/guide/communication.rst | 4 +- .../source/developers/guide/documentation.rst | 3 +- docs/source/developers/guide/index.rst | 10 +-- docs/source/developers/guide/resources.rst | 2 +- .../guide/step_by_step/finding_issues.rst | 2 +- .../developers/guide/step_by_step/set_up.rst | 2 +- .../developers/guide/step_by_step/styling.rst | 2 +- .../developers/guide/tutorials/index.rst | 2 +- .../guide/tutorials/python_tutorial.rst | 36 ++++----- docs/source/developers/java/building.rst | 2 +- docs/source/developers/overview.rst | 3 +- docs/source/developers/release.rst | 18 ++--- .../developers/release_verification.rst | 2 +- docs/source/developers/reviewing.rst | 6 +- .../CDataInterface/PyCapsuleInterface.rst | 26 +++---- docs/source/format/Glossary.rst | 2 +- docs/source/format/Integration.rst | 8 +- docs/source/java/algorithm.rst | 28 +++---- docs/source/java/flight.rst | 2 +- docs/source/java/flight_sql_jdbc_driver.rst | 4 +- docs/source/java/memory.rst | 40 +++++----- docs/source/java/quickstartguide.rst | 2 +- docs/source/java/vector.rst | 4 +- docs/source/python/api/compute.rst | 8 +- docs/source/python/api/substrait.rst | 2 +- docs/source/python/compute.rst | 16 ++-- docs/source/python/dataset.rst | 54 +++++++------- docs/source/python/dlpack.rst | 2 +- docs/source/python/filesystems.rst | 4 +- docs/source/python/getstarted.rst | 16 ++-- docs/source/python/getting_involved.rst | 4 +- docs/source/python/integration/python_r.rst | 24 +++--- docs/source/python/ipc.rst | 10 +-- docs/source/python/json.rst | 2 +- docs/source/python/orc.rst | 2 +- docs/source/python/parquet.rst | 2 +- docs/source/python/timestamps.rst | 26 +++---- 69 files changed, 488 insertions(+), 434 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2e598e0a95064..bf5ca08d53c32 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -132,3 +132,9 @@ repos: ?^cpp/cmake_modules/UseCython\.cmake$| 
?^cpp/src/arrow/util/config\.h\.cmake$| ) + - repo: https://github.com/sphinx-contrib/sphinx-lint + rev: v0.9.1 + hooks: + - id: sphinx-lint + files: ^docs/ + args: ['--disable', 'all', '--enable', 'trailing-whitespace,missing-final-newline', 'docs'] diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index 0a356d5722c42..83afa69a653a9 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -26,6 +26,7 @@ pydata-sphinx-theme=0.14 sphinx-autobuild sphinx-design sphinx-copybutton +sphinx-lint sphinxcontrib-jquery sphinx==6.2 # Requirement for doctest-cython diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 5fa41e28a3208..8a26d9266f22d 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -261,6 +261,7 @@ def build(ctx, src, build_dir, force, targets, **kwargs): "Check all sources files for license texts via Apache RAT."), LintCheck('r', "Lint R files."), LintCheck('docker', "Lint Dockerfiles with hadolint."), + LintCheck('docs', "Lint docs with sphinx-lint."), ] @@ -285,9 +286,10 @@ def decorate_lint_command(cmd): help="Run IWYU on all C++ files if enabled") @click.option("-a", "--all", is_flag=True, default=False, help="Enable all checks.") +@click.argument("path", required=False) @decorate_lint_command @click.pass_context -def lint(ctx, src, fix, iwyu_all, **checks): +def lint(ctx, src, fix, iwyu_all, path, **checks): if checks.pop('all'): # "--all" is given => enable all non-selected checks for k, v in checks.items(): @@ -297,7 +299,7 @@ def lint(ctx, src, fix, iwyu_all, **checks): raise click.UsageError( "Need to enable at least one lint check (try --help)") try: - linter(src, fix, iwyu_all=iwyu_all, **checks) + linter(src, fix, iwyu_all=iwyu_all, path=path, **checks) except LintValidationException: sys.exit(1) diff --git a/dev/archery/archery/utils/lint.py b/dev/archery/archery/utils/lint.py index 15f22ca2e6e5c..108c9ded361e7 100644 --- a/dev/archery/archery/utils/lint.py +++ b/dev/archery/archery/utils/lint.py @@ -436,10 +436,55 @@ def docker_linter(src): cwd=src.path)) -def linter(src, fix=False, *, clang_format=False, cpplint=False, +class SphinxLint(Command): + def __init__(self, src, path=None, sphinx_lint_bin=None, disable=None, enable=None): + self.src = src + self.path = path + self.bin = default_bin(sphinx_lint_bin, "sphinx-lint") + self.disable = disable or "all" + self.enable = enable + + def lint(self, *args, check=False): + docs_path = os.path.join(self.src.path, "docs") + + args = [] + + if self.disable: + args.extend(["--disable", self.disable]) + + if self.enable: + args.extend(["--enable", self.enable]) + + if self.path is not None: + args.extend([self.path]) + else: + args.extend([docs_path]) + + return self.run(*args, check=check) + + +def docs_linter(src, path=None): + """Run sphinx-lint on docs.""" + logger.info("Running docs linter (sphinx-lint)") + + sphinx_lint = SphinxLint( + src, + path=path, + disable="all", + enable="trailing-whitespace,missing-final-newline" + ) + + if not sphinx_lint.available: + logger.error("sphinx-lint linter requested but sphinx-lint binary not found") + return + + yield LintResult.from_cmd(sphinx_lint.lint()) + + +def linter(src, fix=False, path=None, *, clang_format=False, cpplint=False, clang_tidy=False, iwyu=False, iwyu_all=False, python=False, numpydoc=False, cmake_format=False, rat=False, - r=False, docker=False): + r=False, docker=False, docs=False): """Run all linters.""" with tmpdir(prefix="arrow-lint-") as root: build_dir = os.path.join(root, 
"cpp-build") @@ -481,6 +526,9 @@ def linter(src, fix=False, *, clang_format=False, cpplint=False, if docker: results.extend(docker_linter(src)) + if docs: + results.extend(docs_linter(src, path)) + # Raise error if one linter failed, ensuring calling code can exit with # non-zero. for result in results: diff --git a/dev/archery/setup.py b/dev/archery/setup.py index 23a1600910d04..cd3e2e9ca0834 100755 --- a/dev/archery/setup.py +++ b/dev/archery/setup.py @@ -41,7 +41,7 @@ 'integration': ['cffi'], 'integration-java': ['jpype1'], 'lint': ['numpydoc==1.1.0', 'autopep8', 'flake8==6.1.0', 'cython-lint', - 'cmake_format==0.6.13'], + 'cmake_format==0.6.13', 'sphinx-lint==0.9.1'], 'numpydoc': ['numpydoc==1.1.0'], 'release': ['pygithub', jinja_req, 'jira', 'semver', 'gitpython'], } diff --git a/docs/requirements.txt b/docs/requirements.txt index 252344a74a58f..8891680814dff 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -10,5 +10,6 @@ pydata-sphinx-theme~=0.14 sphinx-autobuild sphinx-design sphinx-copybutton +sphinx-lint sphinx==6.2 pandas diff --git a/docs/source/cpp/acero/developer_guide.rst b/docs/source/cpp/acero/developer_guide.rst index 331cd833b58af..80ca68556fc40 100644 --- a/docs/source/cpp/acero/developer_guide.rst +++ b/docs/source/cpp/acero/developer_guide.rst @@ -187,7 +187,7 @@ Examples task (described below) as completed which allows the plan to finish. * The ``fetch`` node, in ``InputReceived``, may decide that it has all the data it needs. It can then call ``StopProducing`` on its input. - + Initialization / Construction / Destruction ------------------------------------------- @@ -271,7 +271,7 @@ distributed systems. Once that has been done then it should be possible to do a meaning exchanging between multiple exec plan instances on a single system) if desired. .. figure:: dist_plan.svg - + A distributed plan can provide parallelism even if the plans themselves run serially Pipeline Parallelism @@ -472,7 +472,7 @@ Benchmarking The most complete macro benchmarking for Acero is provided by https://github.com/voltrondata-labs/arrowbench These include a set of TPC-H benchmarks, executed from the R-dplyr integration, which are run on every Arrow commit and -reported to Conbench at https://conbench.ursa.dev/ +reported to Conbench at https://conbench.ursa.dev/ In addition to these TPC-H benchmarks there are a number of micro-benchmarks for various nodes (hash-join, asof-join, etc.) Finally, the compute functions themselves should mostly have micro-benchmarks. For more on micro benchmarks you diff --git a/docs/source/cpp/acero/overview.rst b/docs/source/cpp/acero/overview.rst index c569f82b099b6..8be4cbc1b1772 100644 --- a/docs/source/cpp/acero/overview.rst +++ b/docs/source/cpp/acero/overview.rst @@ -206,7 +206,7 @@ is very similar to a RecordBatch. It can have zero or more columns and all of t must have the same length. There are a few key differences from ExecBatch: .. figure:: rb_vs_eb.svg - + Both the record batch and the exec batch have strong ownership of the arrays & buffers * An `ExecBatch` does not have a schema. This is because an `ExecBatch` is assumed to be @@ -217,7 +217,7 @@ must have the same length. There are a few key differences from ExecBatch: also has a length property which describes how many rows are in a batch. So another way to view a `Scalar` is a constant array with `length` elements. * An `ExecBatch` contains additional information used by the exec plan. 
For example, an - `index` can be used to describe a batch's position in an ordered stream. We expect + `index` can be used to describe a batch's position in an ordered stream. We expect that `ExecBatch` will also evolve to contain additional fields such as a selection vector. .. figure:: scalar_vs_array.svg @@ -266,5 +266,5 @@ various query representations (e.g. Substrait). The Declaration objects are the with the DeclarationToXyz methods, are the current public API for Acero. .. figure:: decl_vs_ep.svg - - A declaration is a blueprint that is used to instantiate exec plan instances \ No newline at end of file + + A declaration is a blueprint that is used to instantiate exec plan instances diff --git a/docs/source/cpp/acero/substrait.rst b/docs/source/cpp/acero/substrait.rst index 797b2407f93cd..a5532733627c1 100644 --- a/docs/source/cpp/acero/substrait.rst +++ b/docs/source/cpp/acero/substrait.rst @@ -111,7 +111,7 @@ Aggregate Relations * Each measure's arguments must be direct references. * A measure may not have a filter * A measure may not have sorts -* A measure's invocation must be AGGREGATION_INVOCATION_ALL or +* A measure's invocation must be AGGREGATION_INVOCATION_ALL or AGGREGATION_INVOCATION_UNSPECIFIED * A measure's phase must be AGGREGATION_PHASE_INITIAL_TO_RESULT @@ -146,73 +146,73 @@ Types - Caveat * - boolean - boolean - - + - * - i8 - int8 - - + - * - i16 - int16 - - + - * - i32 - int32 - - + - * - i64 - int64 - - + - * - fp32 - float32 - - + - * - fp64 - float64 - - + - * - string - string - - + - * - binary - binary - - + - * - timestamp - timestamp - - + - * - timestamp_tz - timestamp - - + - * - date - date32 - - + - * - time - time64 - - + - * - interval_year - - + - - Not currently supported * - interval_day - - + - - Not currently supported * - uuid - - + - - Not currently supported * - FIXEDCHAR - - + - - Not currently supported * - VARCHAR - - + - - Not currently supported * - FIXEDBINARY - fixed_size_binary - - + - * - DECIMAL - decimal128 - - + - * - STRUCT - struct - Arrow struct fields will have no name (empty string) * - NSTRUCT - - + - - Not currently supported * - LIST - list - - + - * - MAP - map - K must not be nullable diff --git a/docs/source/cpp/acero/user_guide.rst b/docs/source/cpp/acero/user_guide.rst index eca1a0104708b..adcc17216e5ae 100644 --- a/docs/source/cpp/acero/user_guide.rst +++ b/docs/source/cpp/acero/user_guide.rst @@ -32,14 +32,14 @@ Using Acero The basic workflow for Acero is this: #. First, create a graph of :class:`Declaration` objects describing the plan - + #. Call one of the DeclarationToXyz methods to execute the Declaration. a. A new ExecPlan is created from the graph of Declarations. Each Declaration will correspond to one ExecNode in the plan. In addition, a sink node will be added, depending on which DeclarationToXyz method was used. - b. The ExecPlan is executed. Typically this happens as part of the DeclarationToXyz call but in + b. The ExecPlan is executed. Typically this happens as part of the DeclarationToXyz call but in DeclarationToReader the reader is returned before the plan is finished executing. c. Once the plan is finished it is destroyed @@ -315,7 +315,7 @@ of a specific execution node. ``source`` ---------- -A ``source`` operation can be considered as an entry point to create a streaming execution plan. +A ``source`` operation can be considered as an entry point to create a streaming execution plan. :class:`SourceNodeOptions` are used to create the ``source`` operation. 
The ``source`` operation is the most generic and flexible type of source currently available but it can be quite tricky to configure. First you should review the other source node types to ensure there @@ -326,7 +326,7 @@ function should take no arguments and should return an ``arrow::Future>``. This function might be reading a file, iterating through an in memory structure, or receiving data from a network connection. The arrow library refers to these functions as ``arrow::AsyncGenerator`` -and there are a number of utilities for working with these functions. For this example we use +and there are a number of utilities for working with these functions. For this example we use a vector of record batches that we've already stored in memory. In addition, the schema of the data must be known up front. Acero must know the schema of the data at each stage of the execution graph before any processing has begun. This means we must supply the @@ -368,10 +368,10 @@ Example of using ``source`` (usage of sink is explained in detail in :ref:`sink< In the previous example, :ref:`source node `, a source node was used to input the data. But when developing an application, if the data is already in memory as a table, it is much easier, and more performant to use :class:`TableSourceNodeOptions`. -Here the input data can be passed as a ``std::shared_ptr`` along with a ``max_batch_size``. +Here the input data can be passed as a ``std::shared_ptr`` along with a ``max_batch_size``. The ``max_batch_size`` is to break up large record batches so that they can be processed in parallel. It is important to note that the table batches will not get merged to form larger batches when the source -table has a smaller batch size. +table has a smaller batch size. Example of using ``table_source`` @@ -387,7 +387,7 @@ Example of using ``table_source`` ``filter`` ---------- -``filter`` operation, as the name suggests, provides an option to define data filtering +``filter`` operation, as the name suggests, provides an option to define data filtering criteria. It selects rows where the given expression evaluates to true. Filters can be written using :class:`arrow::compute::Expression`, and the expression should have a return type of boolean. For example, if we wish to keep rows where the value @@ -415,7 +415,7 @@ functions, i.e. elementwise functions that return one value for each input row independent of the value of all other rows). This is exposed via :class:`ProjectNodeOptions` which requires, an :class:`arrow::compute::Expression` and name for each of the output columns (if names are not -provided, the string representations of exprs will be used). +provided, the string representations of exprs will be used). Project example: @@ -456,7 +456,7 @@ can be selected from :ref:`this list of aggregation functions The aggregation can provide results as a group or scalar. For instances, an operation like `hash_count` provides the counts per each unique record -as a grouped result while an operation like `sum` provides a single record. +as a grouped result while an operation like `sum` provides a single record. Scalar Aggregation example: @@ -481,14 +481,14 @@ Group Aggregation example: ``sink`` -------- -``sink`` operation provides output and is the final node of a streaming -execution definition. :class:`SinkNodeOptions` interface is used to pass +``sink`` operation provides output and is the final node of a streaming +execution definition. :class:`SinkNodeOptions` interface is used to pass the required options. 
Similar to the source operator the sink operator exposes the output with a function that returns a record batch future each time it is called. It is expected the caller will repeatedly call this function until the generator function is exhausted (returns ``std::optional::nullopt``). If this function is not called often enough then record batches will accumulate in memory. An execution plan should only have one -"terminal" node (one sink node). An :class:`ExecPlan` can terminate early due to cancellation or +"terminal" node (one sink node). An :class:`ExecPlan` can terminate early due to cancellation or an error, before the output is fully consumed. However, the plan can be safely destroyed independently of the sink, which will hold the unconsumed batches by `exec_plan->finished()`. @@ -526,12 +526,12 @@ Example:: arrow::Future<> finish = arrow::Future<>::Make(); struct CustomSinkNodeConsumer : public cp::SinkNodeConsumer { - CustomSinkNodeConsumer(std::atomic *batches_seen, arrow::Future<>finish): + CustomSinkNodeConsumer(std::atomic *batches_seen, arrow::Future<>finish): batches_seen(batches_seen), finish(std::move(finish)) {} // Consumption logic can be written here arrow::Status Consume(cp::ExecBatch batch) override { // data can be consumed in the expected way - // transfer to another system or just do some work + // transfer to another system or just do some work // and write to disk (*batches_seen)++; return arrow::Status::OK(); @@ -541,9 +541,9 @@ Example:: std::atomic *batches_seen; arrow::Future<> finish; - + }; - + std::shared_ptr consumer = std::make_shared(&batches_seen, finish); @@ -567,14 +567,14 @@ Consuming-Sink example: ``order_by_sink`` ----------------- -``order_by_sink`` operation is an extension to the ``sink`` operation. -This operation provides the ability to guarantee the ordering of the -stream by providing the :class:`OrderBySinkNodeOptions`. -Here the :class:`arrow::compute::SortOptions` are provided to define which columns +``order_by_sink`` operation is an extension to the ``sink`` operation. +This operation provides the ability to guarantee the ordering of the +stream by providing the :class:`OrderBySinkNodeOptions`. +Here the :class:`arrow::compute::SortOptions` are provided to define which columns are used for sorting and whether to sort by ascending or descending values. .. note:: This node is a "pipeline breaker" and will fully materialize the dataset in memory. - In the future, spillover mechanisms will be added which should alleviate this + In the future, spillover mechanisms will be added which should alleviate this constraint. @@ -593,14 +593,14 @@ Order-By-Sink example: ``select_k_sink`` ----------------- -``select_k_sink`` option enables selecting the top/bottom K elements, -similar to a SQL ``ORDER BY ... LIMIT K`` clause. -:class:`SelectKOptions` which is a defined by -using :struct:`OrderBySinkNode` definition. This option returns a sink node that receives +``select_k_sink`` option enables selecting the top/bottom K elements, +similar to a SQL ``ORDER BY ... LIMIT K`` clause. +:class:`SelectKOptions` which is a defined by +using :struct:`OrderBySinkNode` definition. This option returns a sink node that receives inputs and then compute top_k/bottom_k. .. note:: This node is a "pipeline breaker" and will fully materialize the input in memory. - In the future, spillover mechanisms will be added which should alleviate this + In the future, spillover mechanisms will be added which should alleviate this constraint. 
SelectK example: @@ -617,7 +617,7 @@ SelectK example: .. _stream_execution_table_sink_docs: -The ``table_sink`` node provides the ability to receive the output as an in-memory table. +The ``table_sink`` node provides the ability to receive the output as an in-memory table. This is simpler to use than the other sink nodes provided by the streaming execution engine but it only makes sense when the output fits comfortably in memory. The node is created using :class:`TableSinkNodeOptions`. @@ -637,7 +637,7 @@ Example of using ``table_sink`` --------- ``scan`` is an operation used to load and process datasets. It should be preferred over the -more generic ``source`` node when your input is a dataset. The behavior is defined using +more generic ``source`` node when your input is a dataset. The behavior is defined using :class:`arrow::dataset::ScanNodeOptions`. More information on datasets and the various scan options can be found in :doc:`../dataset`. @@ -683,10 +683,10 @@ Write example: ``union`` ------------- -``union`` merges multiple data streams with the same schema into one, similar to +``union`` merges multiple data streams with the same schema into one, similar to a SQL ``UNION ALL`` clause. -The following example demonstrates how this can be achieved using +The following example demonstrates how this can be achieved using two data sources. Union example: @@ -704,15 +704,15 @@ Union example: ------------- ``hash_join`` operation provides the relational algebra operation, join using hash-based -algorithm. :class:`HashJoinNodeOptions` contains the options required in -defining a join. The hash_join supports +algorithm. :class:`HashJoinNodeOptions` contains the options required in +defining a join. The hash_join supports `left/right/full semi/anti/outerjoins -`_. +`_. Also the join-key (i.e. the column(s) to join on), and suffixes (i.e a suffix term like "_x" -which can be appended as a suffix for column names duplicated in both left and right -relations.) can be set via the join options. +which can be appended as a suffix for column names duplicated in both left and right +relations.) can be set via the join options. `Read more on hash-joins -`_. +`_. Hash-Join example: @@ -726,7 +726,7 @@ Hash-Join example: Summary ======= -There are examples of these nodes which can be found in +There are examples of these nodes which can be found in ``cpp/examples/arrow/execution_plan_documentation_examples.cc`` in the Arrow source. Complete Example: diff --git a/docs/source/cpp/api/scalar.rst b/docs/source/cpp/api/scalar.rst index 04e78450d7744..be9f9686bf110 100644 --- a/docs/source/cpp/api/scalar.rst +++ b/docs/source/cpp/api/scalar.rst @@ -44,4 +44,4 @@ Utilities .. doxygenclass:: arrow::ScalarVisitor :project: arrow_cpp :members: - :undoc-members: \ No newline at end of file + :undoc-members: diff --git a/docs/source/cpp/build_system.rst b/docs/source/cpp/build_system.rst index 60df117eb510e..0c94d7e5ce5dc 100644 --- a/docs/source/cpp/build_system.rst +++ b/docs/source/cpp/build_system.rst @@ -47,7 +47,7 @@ file into an executable linked with the Arrow C++ shared library: .. 
code-block:: cmake cmake_minimum_required(VERSION 3.16) - + project(MyExample) find_package(Arrow REQUIRED) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index e7310d2c0c711..546b6e5716df7 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -49,8 +49,8 @@ Computation inputs are represented as a general :class:`Datum` class, which is a tagged union of several shapes of data such as :class:`Scalar`, :class:`Array` and :class:`ChunkedArray`. Many compute functions support both array (chunked or not) and scalar inputs, however some will mandate -particular input types. For example, while ``array_sort_indices`` requires its -first and only input to be an array, the generalized ``sort_indices`` +particular input types. For example, while ``array_sort_indices`` requires its +first and only input to be an array, the generalized ``sort_indices`` function accepts an array, chunked array, record batch or table. .. _invoking-compute-functions: @@ -572,28 +572,28 @@ representation based on the rounding criterion. | trunc | Unary | Numeric | Float32/Float64/Decimal | | | +-------------------+------------+-------------+-------------------------+----------------------------------+--------+ -* \(1) By default rounding functions change a value to the nearest - integer using HALF_TO_EVEN to resolve ties. Options are available to control - the rounding criterion. All ``round`` functions have the +* \(1) By default rounding functions change a value to the nearest + integer using HALF_TO_EVEN to resolve ties. Options are available to control + the rounding criterion. All ``round`` functions have the ``round_mode`` option to set the rounding mode. * \(2) Round to a number of digits where the ``ndigits`` option of :struct:`RoundOptions` specifies the rounding precision in terms of number of digits. A negative value corresponds to digits in the non-fractional part. For example, -2 corresponds to rounding to the nearest multiple of 100 (zeroing the ones and tens digits). Default value of ``ndigits`` is 0 - which rounds to the nearest integer. For integer inputs a non-negative + which rounds to the nearest integer. For integer inputs a non-negative ``ndigits`` value is ignored and the input is returned unchanged. For integer - inputs, if ``-ndigits`` is larger than the maximum number of digits the + inputs, if ``-ndigits`` is larger than the maximum number of digits the input type can hold, an error is returned. * \(3) Round to a multiple where the ``multiple`` option of :struct:`RoundToMultipleOptions` specifies the rounding scale. The rounding - multiple has to be a positive value and can be casted to input type. - For example, 100 corresponds to rounding to the nearest multiple of 100 - (zeroing the ones and tens digits). Default value of ``multiple`` is 1 which + multiple has to be a positive value and can be casted to input type. + For example, 100 corresponds to rounding to the nearest multiple of 100 + (zeroing the ones and tens digits). Default value of ``multiple`` is 1 which rounds to the nearest integer. * \(4) Round the first input to multiple of the second input. The rounding - multiple has to be a positive value and can be casted to the first input type. - For example, 100 corresponds to rounding to the nearest multiple of 100 + multiple has to be a positive value and can be casted to the first input type. + For example, 100 corresponds to rounding to the nearest multiple of 100 (zeroing the ones and tens digits). 
For ``round`` functions, the following rounding modes are available. @@ -634,8 +634,8 @@ The example values are given for default values of ``ndigits`` and ``multiple``. | | | -3.5 -> -3, -4.5 -> -5 | +-----------------------+--------------------------------------------------------------+---------------------------+ -The following table gives examples of how ``ndigits`` (for the ``round`` -and ``round_binary`` functions) and ``multiple`` (for ``round_to_multiple``) +The following table gives examples of how ``ndigits`` (for the ``round`` +and ``round_binary`` functions) and ``multiple`` (for ``round_to_multiple``) influence the operation performed, respectively. +--------------------+-------------------+---------------------------+ @@ -1621,12 +1621,12 @@ Array-wise ("vector") functions Cumulative Functions ~~~~~~~~~~~~~~~~~~~~ -Cumulative functions are vector functions that perform a running accumulation on -their input using a given binary associative operation with an identity element -(a monoid) and output an array containing the corresponding intermediate running -values. The input is expected to be of numeric type. By default these functions -do not detect overflow. They are also available in an overflow-checking variant, -suffixed ``_checked``, which returns an ``Invalid`` :class:`Status` when +Cumulative functions are vector functions that perform a running accumulation on +their input using a given binary associative operation with an identity element +(a monoid) and output an array containing the corresponding intermediate running +values. The input is expected to be of numeric type. By default these functions +do not detect overflow. They are also available in an overflow-checking variant, +suffixed ``_checked``, which returns an ``Invalid`` :class:`Status` when overflow is detected. +-------------------------+-------+-------------+-------------+--------------------------------+-----------+ @@ -1649,8 +1649,8 @@ overflow is detected. * \(1) CumulativeOptions has two optional parameters. The first parameter :member:`CumulativeOptions::start` is a starting value for the running - accumulation. It has a default value of 0 for `sum`, 1 for `prod`, min of - input type for `max`, and max of input type for `min`. Specified values of + accumulation. It has a default value of 0 for `sum`, 1 for `prod`, min of + input type for `max`, and max of input type for `min`. Specified values of ``start`` must be castable to the input type. The second parameter :member:`CumulativeOptions::skip_nulls` is a boolean. When set to false (the default), the first encountered null is propagated. When set to @@ -1861,9 +1861,9 @@ replaced, based on the remaining inputs. Pairwise functions ~~~~~~~~~~~~~~~~~~~~ -Pairwise functions are unary vector functions that perform a binary operation on +Pairwise functions are unary vector functions that perform a binary operation on a pair of elements in the input array, typically on adjacent elements. The n-th -output is computed by applying the binary operation to the n-th and (n-p)-th inputs, +output is computed by applying the binary operation to the n-th and (n-p)-th inputs, where p is the period. The default period is 1, in which case the binary operation is applied to adjacent pairs of inputs. The period can also be negative, in which case the n-th output is computed by applying the binary @@ -1877,9 +1877,9 @@ operation to the n-th and (n+abs(p))-th inputs. 
| pairwise_diff_checked | Unary | Numeric/Temporal | Numeric/Temporal | :struct:`PairwiseOptions` | \(1)(3) | +------------------------+-------+----------------------+----------------------+--------------------------------+----------+ -* \(1) Computes the first order difference of an array, It internally calls - the scalar function ``Subtract`` (or the checked variant) to compute - differences, so its behavior and supported types are the same as - ``Subtract``. The period can be specified in :struct:`PairwiseOptions`. +* \(1) Computes the first order difference of an array, It internally calls + the scalar function ``Subtract`` (or the checked variant) to compute + differences, so its behavior and supported types are the same as + ``Subtract``. The period can be specified in :struct:`PairwiseOptions`. * \(2) Wraps around the result when overflow is detected. * \(3) Returns an ``Invalid`` :class:`Status` when overflow is detected. diff --git a/docs/source/cpp/dataset.rst b/docs/source/cpp/dataset.rst index 1f5d0476c2889..a64b73b61c05d 100644 --- a/docs/source/cpp/dataset.rst +++ b/docs/source/cpp/dataset.rst @@ -378,28 +378,28 @@ Partitioning performance considerations Partitioning datasets has two aspects that affect performance: it increases the number of files and it creates a directory structure around the files. Both of these have benefits -as well as costs. Depending on the configuration and the size of your dataset, the costs -can outweigh the benefits. +as well as costs. Depending on the configuration and the size of your dataset, the costs +can outweigh the benefits. -Because partitions split up the dataset into multiple files, partitioned datasets can be -read and written with parallelism. However, each additional file adds a little overhead in -processing for filesystem interaction. It also increases the overall dataset size since +Because partitions split up the dataset into multiple files, partitioned datasets can be +read and written with parallelism. However, each additional file adds a little overhead in +processing for filesystem interaction. It also increases the overall dataset size since each file has some shared metadata. For example, each parquet file contains the schema and -group-level statistics. The number of partitions is a floor for the number of files. If -you partition a dataset by date with a year of data, you will have at least 365 files. If -you further partition by another dimension with 1,000 unique values, you will have up to +group-level statistics. The number of partitions is a floor for the number of files. If +you partition a dataset by date with a year of data, you will have at least 365 files. If +you further partition by another dimension with 1,000 unique values, you will have up to 365,000 files. This fine of partitioning often leads to small files that mostly consist of metadata. -Partitioned datasets create nested folder structures, and those allow us to prune which +Partitioned datasets create nested folder structures, and those allow us to prune which files are loaded in a scan. However, this adds overhead to discovering files in the dataset, as we'll need to recursively "list directory" to find the data files. Too fine partitions can cause problems here: Partitioning a dataset by date for a years worth -of data will require 365 list calls to find all the files; adding another column with +of data will require 365 list calls to find all the files; adding another column with cardinality 1,000 will make that 365,365 calls. 
The most optimal partitioning layout will depend on your data, access patterns, and which -systems will be reading the data. Most systems, including Arrow, should work across a +systems will be reading the data. Most systems, including Arrow, should work across a range of file sizes and partitioning layouts, but there are extremes you should avoid. These guidelines can help avoid some known worst cases: diff --git a/docs/source/cpp/datatypes.rst b/docs/source/cpp/datatypes.rst index 4e1fe76b4d6f2..7eb70936f4e1d 100644 --- a/docs/source/cpp/datatypes.rst +++ b/docs/source/cpp/datatypes.rst @@ -72,8 +72,8 @@ To instantiate data types, it is recommended to call the provided Type Traits ----------- -Writing code that can handle concrete :class:`arrow::DataType` subclasses would -be verbose, if it weren't for type traits. Arrow's type traits map the Arrow +Writing code that can handle concrete :class:`arrow::DataType` subclasses would +be verbose, if it weren't for type traits. Arrow's type traits map the Arrow data types to the specialized array, scalar, builder, and other associated types. For example, the Boolean type has traits: @@ -96,7 +96,7 @@ For example, the Boolean type has traits: See the :ref:`type-traits` for an explanation of each of these fields. Using type traits, one can write template functions that can handle a variety -of Arrow types. For example, to write a function that creates an array of +of Arrow types. For example, to write a function that creates an array of Fibonacci values for any Arrow numeric type: .. code-block:: cpp @@ -128,7 +128,7 @@ For some common cases, there are type associations on the classes themselves. Us Similar to the type traits provided in `std::type_traits `_, -Arrow provides type predicates such as ``is_number_type`` as well as +Arrow provides type predicates such as ``is_number_type`` as well as corresponding templates that wrap ``std::enable_if_t`` such as ``enable_if_number``. These can constrain template functions to only compile for relevant types, which is useful if other overloads need to be implemented. 
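As an illustrative sketch (the function and its usage are assumptions for demonstration, not taken from the Arrow sources), ``enable_if_number`` together with ``TypeTraits`` lets a single template build an array for any numeric Arrow type while staying out of overload resolution for everything else:

.. code-block:: cpp

   #include <memory>
   #include <vector>

   #include <arrow/api.h>
   #include <arrow/type_traits.h>

   // Builds an Arrow array of any numeric type from a std::vector of the
   // corresponding C type, using the builder mapped by TypeTraits.
   template <typename ArrowType,
             typename CType = typename arrow::TypeTraits<ArrowType>::CType>
   arrow::enable_if_number<ArrowType, arrow::Result<std::shared_ptr<arrow::Array>>>
   MakeNumericArray(const std::vector<CType>& values) {
     typename arrow::TypeTraits<ArrowType>::BuilderType builder;
     ARROW_RETURN_NOT_OK(builder.AppendValues(values));
     return builder.Finish();
   }

   // Hypothetical usage:
   //   auto maybe_array = MakeNumericArray<arrow::DoubleType>({1.0, 2.5, 4.0});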
For example, to write a sum @@ -176,20 +176,20 @@ here is how one might sum across columns of arbitrary numeric types: class TableSummation { double partial = 0.0; public: - + arrow::Result Compute(std::shared_ptr batch) { for (std::shared_ptr array : batch->columns()) { ARROW_RETURN_NOT_OK(arrow::VisitArrayInline(*array, this)); } return partial; } - + // Default implementation arrow::Status Visit(const arrow::Array& array) { return arrow::Status::NotImplemented("Cannot compute sum for array of type ", array.type()->ToString()); } - + template arrow::enable_if_number Visit(const ArrayType& array) { for (std::optional value : array) { diff --git a/docs/source/cpp/examples/compute_and_write_example.rst b/docs/source/cpp/examples/compute_and_write_example.rst index e66d3ced55d0c..a4b619f7ffff3 100644 --- a/docs/source/cpp/examples/compute_and_write_example.rst +++ b/docs/source/cpp/examples/compute_and_write_example.rst @@ -21,8 +21,8 @@ Compute and Write CSV Example ============================= -The file ``cpp/examples/arrow/compute_and_write_csv_example.cc`` located inside -the source tree contains an example of creating a table of two numerical columns -and then comparing the magnitudes of the entries in the columns and writing out to +The file ``cpp/examples/arrow/compute_and_write_csv_example.cc`` located inside +the source tree contains an example of creating a table of two numerical columns +and then comparing the magnitudes of the entries in the columns and writing out to a CSV file with the column entries and their comparisons. The code in the example is documented. diff --git a/docs/source/cpp/flight.rst b/docs/source/cpp/flight.rst index e07a84e91ee4f..a1e9420bfd34e 100644 --- a/docs/source/cpp/flight.rst +++ b/docs/source/cpp/flight.rst @@ -350,10 +350,10 @@ Closing unresponsive connections calls Cancel() on a timer, with the main thread resetting the timer every time an operation completes successfully. For a fully-worked out example, see the Cookbook. - + .. note:: There is a long standing ticket for a per-write/per-read timeout instead of a per call timeout (ARROW-6062_), but this is not (easily) - possible to implement with the blocking gRPC API. + possible to implement with the blocking gRPC API. .. _best gRPC practices: https://grpc.io/docs/guides/performance/#general .. _gRPC keys: https://grpc.github.io/grpc/cpp/group__grpc__arg__keys.html diff --git a/docs/source/cpp/gandiva.rst b/docs/source/cpp/gandiva.rst index 07b07bee7ac4e..f60d1fc8ac8d9 100644 --- a/docs/source/cpp/gandiva.rst +++ b/docs/source/cpp/gandiva.rst @@ -29,8 +29,8 @@ Gandiva only handles projections and filters; for other transformations, see :ref:`Compute Functions `. Gandiva was designed to take advantage of the Arrow memory format and modern -hardware. From the Arrow memory model, since Arrow arrays have separate buffers for values and -validity bitmaps, values and their null status can often be processed +hardware. From the Arrow memory model, since Arrow arrays have separate buffers for values and +validity bitmaps, values and their null status can often be processed independently, allowing for better instruction pipelining. On modern hardware, compiling expressions using LLVM allows the execution to be optimized to the local runtime environment and hardware, including available SIMD @@ -42,25 +42,25 @@ pre-compiled into LLVM IR (intermediate representation). 
Expression, Projector and Filter ================================ -To effectively utilize Gandiva, you will construct expression trees with ``TreeExprBuilder``, -including the creation of function nodes, if-else logic, and boolean expressions. +To effectively utilize Gandiva, you will construct expression trees with ``TreeExprBuilder``, +including the creation of function nodes, if-else logic, and boolean expressions. Subsequently, leverage ``Projector`` or ``Filter`` execution kernels to efficiently evaluate these expressions. -See :doc:`./gandiva/expr_projector_filter` for more details. +See :doc:`./gandiva/expr_projector_filter` for more details. External Functions Development ============================== -Gandiva offers the capability of integrating external functions, encompassing -both C functions and IR functions. This feature broadens the spectrum of -functions that can be applied within Gandiva expressions. For developers -looking to customize and enhance their computational solutions, -Gandiva provides the opportunity to develop and register their own external -functions, thus allowing for a more tailored and flexible use of the Gandiva +Gandiva offers the capability of integrating external functions, encompassing +both C functions and IR functions. This feature broadens the spectrum of +functions that can be applied within Gandiva expressions. For developers +looking to customize and enhance their computational solutions, +Gandiva provides the opportunity to develop and register their own external +functions, thus allowing for a more tailored and flexible use of the Gandiva environment. -See :doc:`./gandiva/external_func` for more details. +See :doc:`./gandiva/external_func` for more details. .. toctree:: :maxdepth: 2 gandiva/expr_projector_filter - gandiva/external_func \ No newline at end of file + gandiva/external_func diff --git a/docs/source/cpp/gandiva/expr_projector_filter.rst b/docs/source/cpp/gandiva/expr_projector_filter.rst index c960d1d869fe5..9d58b185032e3 100644 --- a/docs/source/cpp/gandiva/expr_projector_filter.rst +++ b/docs/source/cpp/gandiva/expr_projector_filter.rst @@ -30,7 +30,7 @@ literal values, created by :func:`TreeExprBuilder::MakeLiteral`. Nodes can be combined into more complex expression trees using: * :func:`TreeExprBuilder::MakeFunction` to create a function - node. (You can call :func:`GetRegisteredFunctionSignatures` to + node. (You can call :func:`GetRegisteredFunctionSignatures` to get a list of valid function signatures.) * :func:`TreeExprBuilder::MakeIf` to create if-else logic. * :func:`TreeExprBuilder::MakeAnd` and :func:`TreeExprBuilder::MakeOr` @@ -39,7 +39,7 @@ can be combined into more complex expression trees using: functions to create set membership tests. Each of these functions create new composite nodes, which contain the leaf nodes -(literals and field references) or other composite nodes as children. By +(literals and field references) or other composite nodes as children. By composing these, you can create arbitrarily complex expression trees. Once an expression tree is built, they are wrapped in either :class:`Expression` @@ -84,7 +84,7 @@ reused to process distinct record batches in parallel. Evaluating projections ---------------------- -Execution is performed with :func:`Projector::Evaluate`. This outputs +Execution is performed with :func:`Projector::Evaluate`. This outputs a vector of arrays, which can be passed along with the output schema to :func:`arrow::RecordBatch::Make()`. 
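Putting those pieces together, a minimal sketch (the field names and the ``add`` expression are illustrative assumptions) builds an expression tree, compiles a ``Projector`` once, and evaluates it against a record batch:

.. code-block:: cpp

   #include <arrow/api.h>
   #include <gandiva/projector.h>
   #include <gandiva/tree_expr_builder.h>

   // Projects x + 1 for an int32 column named "x".
   arrow::Result<std::shared_ptr<arrow::RecordBatch>> ProjectXPlusOne(
       const std::shared_ptr<arrow::RecordBatch>& batch) {
     auto field_x = arrow::field("x", arrow::int32());
     auto schema = arrow::schema({field_x});
     auto result_field = arrow::field("x_plus_1", arrow::int32());

     // A function node over a field reference and a literal, wrapped as an Expression.
     auto expr = gandiva::TreeExprBuilder::MakeExpression(
         gandiva::TreeExprBuilder::MakeFunction(
             "add",
             {gandiva::TreeExprBuilder::MakeField(field_x),
              gandiva::TreeExprBuilder::MakeLiteral(int32_t(1))},
             arrow::int32()),
         result_field);

     // Compile once; the projector can be reused across record batches.
     std::shared_ptr<gandiva::Projector> projector;
     ARROW_RETURN_NOT_OK(gandiva::Projector::Make(schema, {expr}, &projector));

     // Evaluate and rebuild a record batch from the output arrays.
     arrow::ArrayVector outputs;
     ARROW_RETURN_NOT_OK(
         projector->Evaluate(*batch, arrow::default_memory_pool(), &outputs));
     return arrow::RecordBatch::Make(arrow::schema({result_field}),
                                     batch->num_rows(), outputs);
   }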
@@ -99,14 +99,14 @@ Evaluating filters :func:`Filter::Evaluate` produces :class:`SelectionVector`, a vector of row indices that matched the filter condition. The selection vector -is a wrapper around an arrow integer array, parameterized by bitwidth. When -creating the selection vector (you must initialize it *before* passing to -``Evaluate()``), you must choose the bitwidth, which determines the max index +is a wrapper around an arrow integer array, parameterized by bitwidth. When +creating the selection vector (you must initialize it *before* passing to +``Evaluate()``), you must choose the bitwidth, which determines the max index value it can hold, and the max number of slots, which determines how many indices -it may contain. In general, the max number of slots should be set to your batch -size and the bitwidth the smallest integer size that can represent all integers -less than the batch size. For example, if your batch size is 100k, set the -maximum number of slots to 100k and the bitwidth to 32 (since 2^16 = 64k which +it may contain. In general, the max number of slots should be set to your batch +size and the bitwidth the smallest integer size that can represent all integers +less than the batch size. For example, if your batch size is 100k, set the +maximum number of slots to 100k and the bitwidth to 32 (since 2^16 = 64k which would be too small). Once ``Evaluate()`` has been run and the :class:`SelectionVector` is @@ -123,10 +123,10 @@ output record batch. Evaluating projections and filters ---------------------------------- -Finally, you can also project while apply a selection vector, with +Finally, you can also project while apply a selection vector, with :func:`Projector::Evaluate()`. To do so, first make sure to initialize the :class:`Projector` with :func:`SelectionVector::GetMode()` so that the projector -compiles with the correct bitwidth. Then you can pass the +compiles with the correct bitwidth. Then you can pass the :class:`SelectionVector` into the :func:`Projector::Evaluate()` method. @@ -134,4 +134,4 @@ compiles with the correct bitwidth. Then you can pass the :language: cpp :start-after: (Doc section: Evaluate filter and projection) :end-before: (Doc section: Evaluate filter and projection) - :dedent: 2 \ No newline at end of file + :dedent: 2 diff --git a/docs/source/cpp/gandiva/external_func.rst b/docs/source/cpp/gandiva/external_func.rst index cdd8fc82e59db..f8bdde83d96e6 100644 --- a/docs/source/cpp/gandiva/external_func.rst +++ b/docs/source/cpp/gandiva/external_func.rst @@ -79,7 +79,7 @@ The ``NativeFunction`` class is used to define the metadata for an external func * ``ResultNullableType::kResultNullIfNull``: result validity is an intersection of the validity of the children. * ``ResultNullableType::kResultNullNever``: result is always valid. * ``ResultNullableType::kResultNullInternal``: result validity depends on some internal logic. -* ``pc_name``: The name of the corresponding precompiled function. +* ``pc_name``: The name of the corresponding precompiled function. * Typically, this name follows the convention ``{base_name}`` + ``_{param1_type}`` + ``{param2_type}`` + ... + ``{paramN_type}``. For example, if the base name is ``add`` and the function takes two ``int32`` parameters and returns an ``int32``, the precompiled function name would be ``add_int32_int32``, but this convention is not mandatory as long as you can guarantee its uniqueness. * ``flags``: Optional flags for additional function attributes (default is 0). 
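For illustration only (the function name and its C implementation are hypothetical, not part of Gandiva), metadata for an external C function that takes one ``int32`` argument and needs the context parameter could be declared roughly like this:

.. code-block:: cpp

   #include <arrow/api.h>
   #include <gandiva/native_function.h>

   // Hypothetical precompiled C implementation, defined elsewhere:
   //   int64_t multiply_by_two_int32(int64_t context, int32_t value);

   gandiva::NativeFunction multiply_by_two_func(
       "multiply_by_two",                               // base_name used in expressions
       /*aliases=*/{},
       /*param_types=*/{arrow::int32()},
       /*ret_type=*/arrow::int64(),
       gandiva::ResultNullableType::kResultNullIfNull,  // result is null if any input is null
       /*pc_name=*/"multiply_by_two_int32",             // matches the C symbol above
       gandiva::NativeFunction::kNeedsContext);         // flags: needs the context argument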
Please check out ``NativeFunction::kNeedsContext``, ``NativeFunction::kNeedsFunctionHolder``, and ``NativeFunction::kCanReturnErrors`` for more details. @@ -153,10 +153,10 @@ Not all Arrow data types are supported in Gandiva. The following table lists the | utf8 (as return type) | int64_t context, | | | const char*, | | | uint32_t* | -| | [see next section]| +| | [see next section]| +-------------------------------------+-------------------+ -Handling arrow::StringType (utf8 type) and arrow::BinaryType +Handling arrow::StringType (utf8 type) and arrow::BinaryType ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Both ``arrow::StringType`` and ``arrow::BinaryType`` are variable-length types. And they are handled similarly in external functions. Since ``arrow::StringType`` (utf8 type) is more commonly used, we will use it below as the example to explain how to handle variable-length types in external functions. @@ -179,7 +179,7 @@ When ``arrow::StringType`` (``utf8`` type) is used as the return type in a funct 2. **Function Parameters:** * **Context Parameter**: The C function should begin with an additional parameter, ``int64_t context``. This parameter is crucial for context management within the function. * **String Length Output Parameter**: The function should also include a ``uint32_t*`` parameter at the end. This output parameter will store the length of the returned string data. -3. **Return Value**: The function should return a ``const char*`` pointer, pointing to the string data. +3. **Return Value**: The function should return a ``const char*`` pointer, pointing to the string data. 4. **Function Implementation:** * **Memory Allocation and Error Messaging:** Within the function's implementation, use ``gdv_fn_context_arena_malloc`` and ``gdv_fn_context_set_error_msg`` for memory allocation and error messaging, respectively. Both functions take ``int64_t context`` as their first parameter, facilitating efficient context utilization. @@ -200,10 +200,10 @@ You can use ``gandiva::FunctionRegistry``'s APIs to register external C function NativeFunction func, void* c_function_ptr, std::optional function_holder_maker = std::nullopt); -The above API allows you to register an external C function. +The above API allows you to register an external C function. -* The ``NativeFunction`` object describes the metadata of the external C function. -* The ``c_function_ptr`` is the function pointer to the external C function's implementation. +* The ``NativeFunction`` object describes the metadata of the external C function. +* The ``c_function_ptr`` is the function pointer to the external C function's implementation. * The optional ``function_holder_maker`` is used to create a function holder for the external C function if the external C function requires a function holder. Check out the ``gandiva::FunctionHolder`` class and its several sub-classes for more details. External IR functions diff --git a/docs/source/cpp/getting_started.rst b/docs/source/cpp/getting_started.rst index 89bd4559ef1e6..2cab5d1581c1c 100644 --- a/docs/source/cpp/getting_started.rst +++ b/docs/source/cpp/getting_started.rst @@ -24,17 +24,17 @@ Getting Started The following articles demonstrate installation, use, and a basic understanding of Arrow. These articles will get you setup quickly using Arrow and give you a taste of what the library is capable of. 
-Specifically, it contains: an installation and linking guide; documentation of conventions used -in the codebase and suggested for users; and tutorials, including: +Specifically, it contains: an installation and linking guide; documentation of conventions used +in the codebase and suggested for users; and tutorials, including: -* Building Arrow arrays and tabular structures +* Building Arrow arrays and tabular structures * Reading and writing Parquet, Arrow, and CSV files * Executing compute kernels on arrays * Reading and writing multi-file partitioned datasets Start here to gain a basic understanding of Arrow, and move on to the :doc:`/cpp/user_guide` to -explore more specific topics and underlying concepts, or the :doc:`/cpp/api` to explore Arrow's -API. +explore more specific topics and underlying concepts, or the :doc:`/cpp/api` to explore Arrow's +API. .. toctree:: @@ -44,5 +44,3 @@ API. tutorials/io_tutorial.rst tutorials/compute_tutorial.rst tutorials/datasets_tutorial.rst - - diff --git a/docs/source/cpp/memory.rst b/docs/source/cpp/memory.rst index ad8276e3728a2..33907b5580f61 100644 --- a/docs/source/cpp/memory.rst +++ b/docs/source/cpp/memory.rst @@ -205,7 +205,7 @@ simply do:: Memory Profiling ================ -On Linux, detailed profiles of memory allocations can be generated using +On Linux, detailed profiles of memory allocations can be generated using ``perf record``, without any need to modify the binaries. These profiles can show the traceback in addition to allocation size. This does require debug symbols, from either a debug build or a release with debug symbols build. @@ -234,14 +234,14 @@ recorded allocations, so we can correlate them with the call to free/de-allocate .. tab-set:: .. tab-item:: jemalloc - + .. code-block:: shell - perf probe -x libarrow.so je_arrow_mallocx '$params' - perf probe -x libarrow.so je_arrow_mallocx%return '$retval' - perf probe -x libarrow.so je_arrow_rallocx '$params' - perf probe -x libarrow.so je_arrow_rallocx%return '$retval' - perf probe -x libarrow.so je_arrow_dallocx '$params' + perf probe -x libarrow.so je_arrow_mallocx '$params' + perf probe -x libarrow.so je_arrow_mallocx%return '$retval' + perf probe -x libarrow.so je_arrow_rallocx '$params' + perf probe -x libarrow.so je_arrow_rallocx%return '$retval' + perf probe -x libarrow.so je_arrow_dallocx '$params' PROBE_ARGS="-e probe_libarrow:je_arrow_mallocx \ -e probe_libarrow:je_arrow_mallocx__return \ -e probe_libarrow:je_arrow_rallocx \ @@ -249,13 +249,13 @@ recorded allocations, so we can correlate them with the call to free/de-allocate -e probe_libarrow:je_arrow_dallocx" .. tab-item:: mimalloc - + .. code-block:: shell - perf probe -x libarrow.so mi_malloc_aligned '$params' - perf probe -x libarrow.so mi_malloc_aligned%return '$retval' - perf probe -x libarrow.so mi_realloc_aligned '$params' - perf probe -x libarrow.so mi_realloc_aligned%return '$retval' + perf probe -x libarrow.so mi_malloc_aligned '$params' + perf probe -x libarrow.so mi_malloc_aligned%return '$retval' + perf probe -x libarrow.so mi_realloc_aligned '$params' + perf probe -x libarrow.so mi_realloc_aligned%return '$retval' perf probe -x libarrow.so mi_free '$params' PROBE_ARGS="-e probe_libarrow:mi_malloc_aligned \ -e probe_libarrow:mi_malloc_aligned__return \ @@ -277,9 +277,9 @@ If you want to profile a running process, you can run ``perf record -p `` and it will record until you interrupt with CTRL+C. Alternatively, you can do ``perf record -P sleep 10`` to record for 10 seconds. 
-The resulting data can be processed with standard tools to work with perf or +The resulting data can be processed with standard tools to work with perf or ``perf script`` can be used to pipe a text format of the data to custom scripts. -The following script parses ``perf script`` output and prints the output in +The following script parses ``perf script`` output and prints the output in new lines delimited JSON for easier processing. .. code-block:: python @@ -354,7 +354,7 @@ Here's an example invocation of that script, with a preview of output data: From there one can answer a number of questions. For example, the following -script will find which allocations were never freed, and print the associated +script will find which allocations were never freed, and print the associated tracebacks along with the count of dangling allocations: .. code-block:: python diff --git a/docs/source/cpp/parquet.rst b/docs/source/cpp/parquet.rst index 3e06352f5dde3..96897d139b351 100644 --- a/docs/source/cpp/parquet.rst +++ b/docs/source/cpp/parquet.rst @@ -51,8 +51,8 @@ FileReader ---------- To read Parquet data into Arrow structures, use :class:`arrow::FileReader`. -To construct, it requires a :class:`::arrow::io::RandomAccessFile` instance -representing the input file. To read the whole file at once, +To construct, it requires a :class:`::arrow::io::RandomAccessFile` instance +representing the input file. To read the whole file at once, use :func:`arrow::FileReader::ReadTable`: .. literalinclude:: ../../../cpp/examples/arrow/parquet_read_write.cc @@ -67,7 +67,7 @@ Finer-grained options are available through the and :class:`ArrowReaderProperties` classes. For reading as a stream of batches, use the :func:`arrow::FileReader::GetRecordBatchReader` -method to retrieve a :class:`arrow::RecordBatchReader`. It will use the batch +method to retrieve a :class:`arrow::RecordBatchReader`. It will use the batch size set in :class:`ArrowReaderProperties`. .. literalinclude:: ../../../cpp/examples/arrow/parquet_read_write.cc @@ -106,8 +106,8 @@ If memory efficiency is more important than performance, then: #. Turn on ``enable_buffered_stream`` in :class:`parquet::ReaderProperties`. In addition, if you know certain columns contain many repeated values, you can -read them as :term:`dictionary encoded` columns. This is -enabled with the ``set_read_dictionary`` setting on :class:`ArrowReaderProperties`. +read them as :term:`dictionary encoded` columns. This is +enabled with the ``set_read_dictionary`` setting on :class:`ArrowReaderProperties`. If the files were written with Arrow C++ and the ``store_schema`` was activated, then the original Arrow schema will be automatically read and will override this setting. @@ -174,7 +174,7 @@ The :func:`arrow::WriteTable` function writes an entire .. note:: - Column compression is off by default in C++. See :ref:`below ` + Column compression is off by default in C++. See :ref:`below ` for how to choose a compression codec in the writer properties. To write out data batch-by-batch, use :class:`arrow::FileWriter`. @@ -191,9 +191,9 @@ StreamWriter The :class:`StreamWriter` allows for Parquet files to be written using standard C++ output operators, similar to reading with the :class:`StreamReader` -class. This type-safe approach also ensures that rows are written without -omitting fields and allows for new row groups to be created automatically -(after certain volume of data) or explicitly by using the :type:`EndRowGroup` +class. 
This type-safe approach also ensures that rows are written without +omitting fields and allows for new row groups to be created automatically +(after certain volume of data) or explicitly by using the :type:`EndRowGroup` stream modifier. Exceptions are used to signal errors. A :class:`ParquetException` is @@ -266,20 +266,20 @@ group that takes precedent over the ``chunk_size`` passed in the write methods. You can set the version of Parquet to write with ``version``, which determines which logical types are available. In addition, you can set the data page version with ``data_page_version``. It's V1 by default; setting to V2 will allow more -optimal compression (skipping compressing pages where there isn't a space +optimal compression (skipping compressing pages where there isn't a space benefit), but not all readers support this data page version. -Compression is off by default, but to get the most out of Parquet, you should -also choose a compression codec. You can choose one for the whole file or +Compression is off by default, but to get the most out of Parquet, you should +also choose a compression codec. You can choose one for the whole file or choose one for individual columns. If you choose a mix, the file-level option -will apply to columns that don't have a specific compression codec. See +will apply to columns that don't have a specific compression codec. See :class:`::arrow::Compression` for options. -Column data encodings can likewise be applied at the file-level or at the -column level. By default, the writer will attempt to dictionary encode all +Column data encodings can likewise be applied at the file-level or at the +column level. By default, the writer will attempt to dictionary encode all supported columns, unless the dictionary grows too large. This behavior can be changed at file-level or at the column level with ``disable_dictionary()``. -When not using dictionary encoding, it will fallback to the encoding set for +When not using dictionary encoding, it will fallback to the encoding set for the column or the overall file; by default ``Encoding::PLAIN``, but this can be changed with ``encoding()``. @@ -559,7 +559,7 @@ Encryption Parquet C++ implements all features specified in the `encryption specification `__, -except for encryption of column index and bloom filter modules. +except for encryption of column index and bloom filter modules. More specifically, Parquet C++ supports: diff --git a/docs/source/cpp/tables.rst b/docs/source/cpp/tables.rst index b28a9fc1e13a5..d98a2acde6620 100644 --- a/docs/source/cpp/tables.rst +++ b/docs/source/cpp/tables.rst @@ -81,13 +81,13 @@ and computation functions, possibly incremental. :alt: A graphical representation of an Arrow Table and a Record Batch, with structure as described in text above. -Record batches can be sent between implementations, such as via +Record batches can be sent between implementations, such as via :ref:`IPC ` or -via the :doc:`C Data Interface <../format/CDataInterface>`. Tables and +via the :doc:`C Data Interface <../format/CDataInterface>`. Tables and chunked arrays, on the other hand, are concepts in the C++ implementation, not in the Arrow format itself, so they aren't directly portable. -However, a table can be converted to and built from a sequence of record +However, a table can be converted to and built from a sequence of record batches easily without needing to copy the underlying array buffers. 
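A small sketch of that round trip, assuming ``batches`` already holds record batches sharing a common schema:

.. code-block:: cpp

   #include <memory>
   #include <vector>

   #include <arrow/api.h>

   arrow::Status RoundTrip(
       const std::vector<std::shared_ptr<arrow::RecordBatch>>& batches) {
     // Batches -> Table: the table's chunked arrays reference the same buffers
     // the record batches already own, so no data is copied.
     ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Table> table,
                           arrow::Table::FromRecordBatches(batches));

     // Table -> batches: stream the table back out chunk by chunk.
     arrow::TableBatchReader reader(*table);
     std::shared_ptr<arrow::RecordBatch> batch;
     while (true) {
       ARROW_RETURN_NOT_OK(reader.ReadNext(&batch));
       if (batch == nullptr) break;  // end of stream
       // ... use `batch` ...
     }
     return arrow::Status::OK();
   }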
A table can be streamed as an arbitrary number of record batches using a :class:`arrow::TableBatchReader`. Conversely, a logical sequence of diff --git a/docs/source/cpp/threading.rst b/docs/source/cpp/threading.rst index 24ad25b5a028a..4a1a65ffe012d 100644 --- a/docs/source/cpp/threading.rst +++ b/docs/source/cpp/threading.rst @@ -99,4 +99,4 @@ Arrow C++ uses :class:`arrow::Future` to communicate results between threads. T an :class:`arrow::Future` will be created when an operation needs to perform some kind of long running task that will block for some period of time. :class:`arrow::Future` objects are mainly meant for internal use and any method that returns an -:class:`arrow::Future` will usually have a synchronous variant as well. \ No newline at end of file +:class:`arrow::Future` will usually have a synchronous variant as well. diff --git a/docs/source/cpp/tutorials/compute_tutorial.rst b/docs/source/cpp/tutorials/compute_tutorial.rst index bcb87e6a8f992..a650865d75ce4 100644 --- a/docs/source/cpp/tutorials/compute_tutorial.rst +++ b/docs/source/cpp/tutorials/compute_tutorial.rst @@ -34,7 +34,7 @@ functionality to: 3. Search for a value in a column -Pre-requisites +Pre-requisites --------------- Before continuing, make sure you have: @@ -49,16 +49,16 @@ Setup Before running some computations, we need to fill in a couple gaps: 1. We need to include necessary headers. - + 2. ``A main()`` is needed to glue things together. 3. We need data to play with. - + Includes ^^^^^^^^ -Before writing C++ code, we need some includes. We'll get ``iostream`` for output, then import Arrow's -compute functionality: +Before writing C++ code, we need some includes. We'll get ``iostream`` for output, then import Arrow's +compute functionality: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/compute_example.cc :language: cpp @@ -340,4 +340,4 @@ Refer to the below for a copy of the complete code: :start-after: (Doc section: Compute Example) :end-before: (Doc section: Compute Example) :linenos: - :lineno-match: \ No newline at end of file + :lineno-match: diff --git a/docs/source/cpp/tutorials/datasets_tutorial.rst b/docs/source/cpp/tutorials/datasets_tutorial.rst index 285fc24d8d599..f60e1e52170ae 100644 --- a/docs/source/cpp/tutorials/datasets_tutorial.rst +++ b/docs/source/cpp/tutorials/datasets_tutorial.rst @@ -33,7 +33,7 @@ file on disk. In this article, you will: 2. write out a partitioned dataset from a Table. -Pre-requisites +Pre-requisites --------------- Before continuing, make sure you have: @@ -50,7 +50,7 @@ Setup Before running some computations, we need to fill in a couple gaps: 1. We need to include necessary headers. - + 2. A ``main()`` is needed to glue things together. 3. We need data on disk to play with. @@ -58,8 +58,8 @@ Before running some computations, we need to fill in a couple gaps: Includes ^^^^^^^^ -Before writing C++ code, we need some includes. We'll get ``iostream`` for output, then import Arrow's -compute functionality for each file type we'll work with in this article: +Before writing C++ code, we need some includes. We'll get ``iostream`` for output, then import Arrow's +compute functionality for each file type we'll work with in this article: .. 
literalinclude:: ../../../../cpp/examples/tutorial_examples/dataset_example.cc :language: cpp @@ -206,7 +206,7 @@ Build Dataset using Factory ^^^^^^^^^^^^^^^^^^^^^^^^^^^ With a :class:`dataset::FileSystemDatasetFactory` set up, we can actually build our -:class:`dataset::Dataset` with :func:`dataset::FileSystemDatasetFactory::Finish`, just +:class:`dataset::Dataset` with :func:`dataset::FileSystemDatasetFactory::Finish`, just like with an :class:`ArrayBuilder` back in the basic tutorial: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/dataset_example.cc @@ -228,14 +228,14 @@ dataset, and print those out, along with some small info: Move Dataset into Table ^^^^^^^^^^^^^^^^^^^^^^^ -One way we can do something with :class:`Datasets ` is getting -them into a :class:`Table`, where we can do anything we’ve learned we can do to -:class:`Tables
` to that :class:`Table`. +One way we can do something with :class:`Datasets ` is getting +them into a :class:`Table`, where we can do anything we’ve learned we can do to +:class:`Tables
` to that :class:`Table`. .. seealso:: :doc:`/cpp/streaming_execution` for execution that avoids manifesting the entire dataset in memory. -In order to move a :class:`Dataset’s ` contents into a :class:`Table`, -we need a :class:`dataset::Scanner`, which scans the data and outputs it to the :class:`Table`. +In order to move a :class:`Dataset’s ` contents into a :class:`Table`, +we need a :class:`dataset::Scanner`, which scans the data and outputs it to the :class:`Table`. First, we get a :class:`dataset::ScannerBuilder` from the :class:`dataset::Dataset`: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/dataset_example.cc @@ -305,7 +305,7 @@ Create Scanner for Moving Table Data The process for writing a :class:`dataset::Dataset`, once a source of data is available, is similar to the reverse of reading it. Before, we used a :class:`dataset::Scanner` in order to scan into a :class:`Table` – now, we need one to read out of our -:class:`TableBatchReader`. To get that :class:`dataset::Scanner`, we’ll make a :class:`dataset::ScannerBuilder` +:class:`TableBatchReader`. To get that :class:`dataset::Scanner`, we’ll make a :class:`dataset::ScannerBuilder` based on our :class:`TableBatchReader`, then use that Builder to build a :class:`dataset::Scanner`: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/dataset_example.cc @@ -343,7 +343,7 @@ Arrow, so we’ll write back out to that: :start-after: (Doc section: Write Format) :end-before: (Doc section: Write Format) -Configure FileSystemDatasetWriteOptions +Configure FileSystemDatasetWriteOptions ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In order to write to disk, we need some configuration. We’ll do so via @@ -435,11 +435,11 @@ tutorials. With that, you’ve read and written partitioned datasets! This method, with some configuration, will work for any supported dataset format. For an example of such a dataset, the NYC Taxi dataset is a well-known -one, which you can find `here `_. +one, which you can find `here `_. Now you can get larger-than-memory data mapped for use! Which means that now we have to be able to process this data without -pulling it all into memory at once. For this, try Acero. +pulling it all into memory at once. For this, try Acero. .. seealso:: :doc:`/cpp/streaming_execution` for more information on Acero. @@ -450,4 +450,4 @@ Refer to the below for a copy of the complete code: :start-after: (Doc section: Dataset Example) :end-before: (Doc section: Dataset Example) :linenos: - :lineno-match: \ No newline at end of file + :lineno-match: diff --git a/docs/source/cpp/tutorials/io_tutorial.rst b/docs/source/cpp/tutorials/io_tutorial.rst index f981c94b83e32..309f10a350aa3 100644 --- a/docs/source/cpp/tutorials/io_tutorial.rst +++ b/docs/source/cpp/tutorials/io_tutorial.rst @@ -33,7 +33,7 @@ the start to end of an application. In this article, you will: 3. Read a Parquet file into a :class:`Table` and write it back out afterwards -Pre-requisites +Pre-requisites --------------- Before continuing, make sure you have: @@ -50,7 +50,7 @@ Setup Before writing out some file I/O, we need to fill in a couple gaps: 1. We need to include necessary headers. - + 2. A ``main()`` is needed to glue things together. 3. We need files to play with. @@ -58,8 +58,8 @@ Before writing out some file I/O, we need to fill in a couple gaps: Includes ^^^^^^^^ -Before writing C++ code, we need some includes. 
We'll get ``iostream`` for output, then import Arrow's -I/O functionality for each file type we'll work with in this article: +Before writing C++ code, we need some includes. We'll get ``iostream`` for output, then import Arrow's +I/O functionality for each file type we'll work with in this article: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/file_access_example.cc :language: cpp @@ -156,8 +156,8 @@ Opening an Arrow file Reader ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ An :class:`io::ReadableFile` is too generic to offer all functionality to read an Arrow file. -We need to use it to get an :class:`ipc::RecordBatchFileReader` object. This object implements -all the logic needed to read an Arrow file with correct formatting. We get one through +We need to use it to get an :class:`ipc::RecordBatchFileReader` object. This object implements +all the logic needed to read an Arrow file with correct formatting. We get one through :func:`ipc::RecordBatchFileReader::Open`: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/file_access_example.cc @@ -294,8 +294,8 @@ Write a CSV File from Table CSV writing to :class:`Table` looks exactly like IPC writing to :class:`RecordBatch`, except with our :class:`Table`, and using :func:`ipc::RecordBatchWriter::WriteTable` instead of -:func:`ipc::RecordBatchWriter::WriteRecordBatch`. Note that the same writer class is used -- -we're writing with :func:`ipc::RecordBatchWriter::WriteTable` because we have a :class:`Table`. We’ll target +:func:`ipc::RecordBatchWriter::WriteRecordBatch`. Note that the same writer class is used -- +we're writing with :func:`ipc::RecordBatchWriter::WriteTable` because we have a :class:`Table`. We’ll target a file, use our :class:`Table’s
` :class:`Schema`, and then write the :class:`Table`: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/file_access_example.cc @@ -358,7 +358,7 @@ even though we used :func:`io::ReadableFile::Open`. Note that we pass our Reading a Parquet File to Table ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -With a prepared :class:`parquet::arrow::FileReader` in hand, we can read to a +With a prepared :class:`parquet::arrow::FileReader` in hand, we can read to a :class:`Table`, except we must pass the :class:`Table` by reference instead of outputting to it: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/file_access_example.cc @@ -401,4 +401,4 @@ Refer to the below for a copy of the complete code: :start-after: (Doc section: File I/O) :end-before: (Doc section: File I/O) :linenos: - :lineno-match: \ No newline at end of file + :lineno-match: diff --git a/docs/source/developers/continuous_integration/index.rst b/docs/source/developers/continuous_integration/index.rst index f988b5ab69d50..cfca14e10e48c 100644 --- a/docs/source/developers/continuous_integration/index.rst +++ b/docs/source/developers/continuous_integration/index.rst @@ -27,4 +27,4 @@ Continuous Integration overview docker archery - crossbow \ No newline at end of file + crossbow diff --git a/docs/source/developers/cpp/building.rst b/docs/source/developers/cpp/building.rst index 5fab745679e93..040a046c5153d 100644 --- a/docs/source/developers/cpp/building.rst +++ b/docs/source/developers/cpp/building.rst @@ -67,7 +67,7 @@ On Alpine Linux: gcc \ ninja \ make - + On Fedora Linux: .. code-block:: shell @@ -99,7 +99,7 @@ On macOS, you can use `Homebrew `_: With `vcpkg `_: .. code-block:: shell - + git clone https://github.com/apache/arrow.git cd arrow vcpkg install \ @@ -362,7 +362,7 @@ boolean flags to ``cmake``. * ``-DARROW_GCS=ON``: Build Arrow with GCS support (requires the GCloud SDK for C++) * ``-DARROW_HDFS=ON``: Arrow integration with libhdfs for accessing the Hadoop Filesystem -* ``-DARROW_JEMALLOC=ON``: Build the Arrow jemalloc-based allocator, on by default +* ``-DARROW_JEMALLOC=ON``: Build the Arrow jemalloc-based allocator, on by default * ``-DARROW_JSON=ON``: JSON reader module * ``-DARROW_MIMALLOC=ON``: Build the Arrow mimalloc-based allocator * ``-DARROW_ORC=ON``: Arrow integration with Apache ORC @@ -375,7 +375,7 @@ boolean flags to ``cmake``. instead. 
* ``-DARROW_S3=ON``: Support for Amazon S3-compatible filesystems * ``-DARROW_SUBSTRAIT=ON``: Build with support for Substrait -* ``-DARROW_WITH_RE2=ON``: Build with support for regular expressions using the re2 +* ``-DARROW_WITH_RE2=ON``: Build with support for regular expressions using the re2 library, on by default and used when ``ARROW_COMPUTE`` or ``ARROW_GANDIVA`` is ``ON`` * ``-DARROW_WITH_UTF8PROC=ON``: Build with support for Unicode properties using the utf8proc library, on by default and used when ``ARROW_COMPUTE`` or ``ARROW_GANDIVA`` @@ -472,7 +472,7 @@ The build system supports a number of third-party dependencies * ``c-ares``: a dependency of gRPC * ``gflags``: for command line utilities (formerly Googleflags) * ``GLOG``: for logging - * ``google_cloud_cpp_storage``: for Google Cloud Storage support, requires + * ``google_cloud_cpp_storage``: for Google Cloud Storage support, requires system cURL and can use the ``BUNDLED`` method described below * ``gRPC``: for remote procedure calls * ``GTest``: Googletest, for testing diff --git a/docs/source/developers/cpp/windows.rst b/docs/source/developers/cpp/windows.rst index 251a45325fe0b..60ac949e81663 100644 --- a/docs/source/developers/cpp/windows.rst +++ b/docs/source/developers/cpp/windows.rst @@ -379,9 +379,9 @@ Downloading the Timezone Database ================================= To run some of the compute unit tests on Windows, the IANA timezone database -and the Windows timezone mapping need to be downloaded first. See +and the Windows timezone mapping need to be downloaded first. See :ref:`download-timezone-database` for download instructions. To set a non-default -path for the timezone database while running the unit tests, set the +path for the timezone database while running the unit tests, set the ``ARROW_TIMEZONE_DATABASE`` environment variable. Replicating Appveyor Builds diff --git a/docs/source/developers/guide/architectural_overview.rst b/docs/source/developers/guide/architectural_overview.rst index 58e05c85f457e..085a814453c84 100644 --- a/docs/source/developers/guide/architectural_overview.rst +++ b/docs/source/developers/guide/architectural_overview.rst @@ -29,8 +29,8 @@ Architectural Overview ********************** -A general overview of Apache Arrow project can be found on the -`front page `_ and in the +A general overview of Apache Arrow project can be found on the +`front page `_ and in the `Apache Arrow Overview `_. You can also have a look at the `Frequently Asked Questions `_. diff --git a/docs/source/developers/guide/communication.rst b/docs/source/developers/guide/communication.rst index a8659f83ac04d..749c94f9419b2 100644 --- a/docs/source/developers/guide/communication.rst +++ b/docs/source/developers/guide/communication.rst @@ -27,7 +27,7 @@ .. _communication: ************* -Communication +Communication ************* **About the contributors** @@ -50,7 +50,7 @@ tags ([C++], [R], [Ruby] etc.) so it gets noticed by the right people. Where to get help 👋 ==================== -For any question you may have or problems you are facing you can write to +For any question you may have or problems you are facing you can write to user or development :ref:`mailing_list` or you can create an issue on :ref:`github`. Also use GitHub to search through the issues, report bugs and create feature requests or proposals. 
diff --git a/docs/source/developers/guide/documentation.rst b/docs/source/developers/guide/documentation.rst index 3bb3bebef5098..8f9d7311e765f 100644 --- a/docs/source/developers/guide/documentation.rst +++ b/docs/source/developers/guide/documentation.rst @@ -49,7 +49,7 @@ documentation itself, you can search for an issue in GitHub. Documentation improvements are also a great way to gain some experience with our submission and review process without -requiring a lot of local development environment setup. +requiring a lot of local development environment setup. .. note:: Many documentation-only changes can be made directly in the @@ -114,4 +114,3 @@ library. Source folder includes: **Cookbooks** have their own repository ``_ and can be separately cloned and built. - diff --git a/docs/source/developers/guide/index.rst b/docs/source/developers/guide/index.rst index 353c8332ff0b5..0ed27a0ddc54e 100644 --- a/docs/source/developers/guide/index.rst +++ b/docs/source/developers/guide/index.rst @@ -83,17 +83,17 @@ of adding a basic feature. the installation of third-party packages, depending on which build options and components you enable. The C++ build guide has suggestions for commonly encountered issues - you can find it - :ref:`here `. + :ref:`here `. Anytime you are stuck, feel free to reach out via appropriate :ref:`communication` channel. - See a short description about the building process of + See a short description about the building process of :ref:`PyArrow or the R package` or go straight to detailed instructions on how to build one of Arrow libraries in the `documentation `_ . - + #. **Run the tests** - + We should run the tests to check if everything is working correctly. For example, you can run the tests from a terminal for Python @@ -155,7 +155,7 @@ There are lots of ways to contribute to the project besides writing code! * Improving the **documentation** is a great way to start contributing! For more information visit :ref:`documentation` section of the guide. -* **Apache Arrow Cookbooks** are a collection of recipes for solving various problems +* **Apache Arrow Cookbooks** are a collection of recipes for solving various problems and completing different tasks using Apache Arrow. They are also a great way to start contributing. For more information visit `How to contribute to Apache Arrow Cookbook `_ diff --git a/docs/source/developers/guide/resources.rst b/docs/source/developers/guide/resources.rst index f350f469af403..b5905af65499b 100644 --- a/docs/source/developers/guide/resources.rst +++ b/docs/source/developers/guide/resources.rst @@ -78,7 +78,7 @@ Reproducible examples: - `Tidyverse: Make a reprex `_ - `Craft Minimal Bug Reports by Matthew Rocklin `_ -Recommended references +Recommended references ---------------------- - Slatkin, Brett, *Effective Python: 90 Specific Ways to Write Better Python*, Addison-Wesley Professional, 2019 diff --git a/docs/source/developers/guide/step_by_step/finding_issues.rst b/docs/source/developers/guide/step_by_step/finding_issues.rst index 390c56a81c73f..a76b15e917e9a 100644 --- a/docs/source/developers/guide/step_by_step/finding_issues.rst +++ b/docs/source/developers/guide/step_by_step/finding_issues.rst @@ -65,7 +65,7 @@ person who triaged the ticket expected it to be. Don't hesitate to write that in the comments. .. note:: - + When you find a GitHub issue you would like to work on, please mention your interest in the comment section of that issue; that way we will know you are working on it. 
diff --git a/docs/source/developers/guide/step_by_step/set_up.rst b/docs/source/developers/guide/step_by_step/set_up.rst index 60b472637badb..9a2177568d6f5 100644 --- a/docs/source/developers/guide/step_by_step/set_up.rst +++ b/docs/source/developers/guide/step_by_step/set_up.rst @@ -60,7 +60,7 @@ a username and password each time you execute a git command. RStudio project and will create a ``.Rproj`` file in the root directory. For this reason it is *highly recommended* to clone the repository using the command line or a Git client. - + Get the source code =================== diff --git a/docs/source/developers/guide/step_by_step/styling.rst b/docs/source/developers/guide/step_by_step/styling.rst index bb428b0b6ab40..c155acb389512 100644 --- a/docs/source/developers/guide/step_by_step/styling.rst +++ b/docs/source/developers/guide/step_by_step/styling.rst @@ -59,4 +59,4 @@ check your code and will stop the commit process, described in the following section, if there are any errors. - `Pre-commit installation instructions `_ -- `Pre-commit hooks `_ \ No newline at end of file +- `Pre-commit hooks `_ diff --git a/docs/source/developers/guide/tutorials/index.rst b/docs/source/developers/guide/tutorials/index.rst index dcefab23230f9..5f44231afc9c2 100644 --- a/docs/source/developers/guide/tutorials/index.rst +++ b/docs/source/developers/guide/tutorials/index.rst @@ -25,4 +25,4 @@ Tutorials :maxdepth: 1 python_tutorial - r_tutorial \ No newline at end of file + r_tutorial diff --git a/docs/source/developers/guide/tutorials/python_tutorial.rst b/docs/source/developers/guide/tutorials/python_tutorial.rst index 7f004160b0e75..c12c4489aee95 100644 --- a/docs/source/developers/guide/tutorials/python_tutorial.rst +++ b/docs/source/developers/guide/tutorials/python_tutorial.rst @@ -137,7 +137,7 @@ function is defined in the ``compute.py`` file. After examining the ``compute.py`` file we can see that together with ``_compute.pyx`` the functions from C++ get wrapped into Python. -We will define the new feature at the end of the ``compute.py`` file. +We will define the new feature at the end of the ``compute.py`` file. Lets run some code in the Python console from ``arrow/python`` directory in order to learn more about ``pc.min_max``. @@ -147,10 +147,10 @@ directory in order to learn more about ``pc.min_max``. $ cd python $ python - Python 3.9.7 (default, Oct 22 2021, 13:24:00) + Python 3.9.7 (default, Oct 22 2021, 13:24:00) [Clang 13.0.0 (clang-1300.0.29.3)] on darwin Type "help", "copyright", "credits" or "license" for more information. - + We have entered into the Python console from the shell and we can do some research: @@ -278,7 +278,7 @@ options for the ``pc.min_max`` function we can finish the work. return pa.scalar([('min-', min_t), ('max+', max_t)], type=ty) .. TODO seealso - .. For more information about the Arrow codebase visit + .. For more information about the Arrow codebase visit .. :ref:``. (link to working on the Arrow codebase section) Adding a test @@ -303,24 +303,24 @@ a specific unit test, pass in the test name to the ``-k`` parameter. .. 
code:: console $ cd python - $ python -m pytest pyarrow/tests/test_compute.py -k test_tutorial_min_max + $ python -m pytest pyarrow/tests/test_compute.py -k test_tutorial_min_max ======================== test session starts ========================== platform darwin -- Python 3.9.7, pytest-6.2.5, py-1.10.0, pluggy-1.0.0 rootdir: /Users/alenkafrim/repos/arrow/python, configfile: setup.cfg plugins: hypothesis-6.24.1, lazy-fixture-0.6.3 - collected 204 items / 203 deselected / 1 selected + collected 204 items / 203 deselected / 1 selected pyarrow/tests/test_compute.py . [100%] ======================== 1 passed, 203 deselected in 0.16s ============ - - $ python -m pytest pyarrow/tests/test_compute.py + + $ python -m pytest pyarrow/tests/test_compute.py ======================== test session starts =========================== platform darwin -- Python 3.9.7, pytest-6.2.5, py-1.10.0, pluggy-1.0.0 rootdir: /Users/alenkafrim/repos/arrow/python, configfile: setup.cfg plugins: hypothesis-6.24.1, lazy-fixture-0.6.3 - collected 204 items + collected 204 items pyarrow/tests/test_compute.py ................................... [ 46%] ................................................. [100%] @@ -339,7 +339,7 @@ utility called `Archery ` to check if code is in line with PEP 8 style guide. .. code:: console - + $ archery lint --python --fix INFO:archery:Running Python formatter (autopep8) INFO:archery:Running Python linter (flake8) @@ -430,7 +430,7 @@ to the branch history): $ git commit -am "Adding a new compute feature for tutorial purposes" [ARROW-14977 170ef85be] Adding a new compute feature for tutorial purposes 2 files changed, 51 insertions(+) - + We can use ``git log`` to check the history of commits: @@ -448,12 +448,12 @@ We can use ``git log`` to check the history of commits: Date: Sun Dec 5 15:19:46 2021 +0900 ARROW-14981: [CI][Docs] Upload built documents - + We can use this in release process instead of building on release manager's local environment. - + Closes #11856 from kou/ci-docs-upload - + Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei ... @@ -478,10 +478,10 @@ called ``origin``. Writing objects: 100% (7/7), 1.19 KiB | 1.19 MiB/s, done. Total 7 (delta 6), reused 0 (delta 0), pack-reused 0 remote: Resolving deltas: 100% (6/6), completed with 6 local objects. - remote: + remote: remote: Create a pull request for 'ARROW-14977' on GitHub by visiting: remote: https://github.com/AlenkaF/arrow/pull/new/ARROW-14977 - remote: + remote: To https://github.com/AlenkaF/arrow.git * [new branch] ARROW-14977 -> ARROW-14977 @@ -490,7 +490,7 @@ to create a Pull Request. On the GitHub Arrow page (main or forked) we will see a yellow notice bar with a note that we made recent pushes to the branch ARROW-14977. That’s great, now we can make the Pull Request -by clicking on **Compare & pull request**. +by clicking on **Compare & pull request**. .. figure:: ../../images/python_tutorial_github_pr_notice.jpeg :scale: 50 % @@ -527,5 +527,5 @@ the code, comment, resolve conversations and so on. The Pull Request we made can be viewed `here `_. .. seealso:: - + For more information about Pull Request workflow see :ref:`pr_lifecycle`. 
diff --git a/docs/source/developers/java/building.rst b/docs/source/developers/java/building.rst index c059ff676efb2..82053e901186c 100644 --- a/docs/source/developers/java/building.rst +++ b/docs/source/developers/java/building.rst @@ -350,7 +350,7 @@ Arrow repository, and update the following settings: * To enable debugging JNI-based modules like ``dataset``, activate specific profiles in the Maven tab under "Profiles". Ensure the profiles ``arrow-c-data``, ``arrow-jni``, ``generate-libs-cdata-all-os``, - ``generate-libs-jni-macos-linux``, and ``jdk11+`` are enabled, so that the + ``generate-libs-jni-macos-linux``, and ``jdk11+`` are enabled, so that the IDE can build them and enable debugging. You may not need to update all of these settings if you build/test with the diff --git a/docs/source/developers/overview.rst b/docs/source/developers/overview.rst index c7bc4273313bc..5a18b1e4eb8db 100644 --- a/docs/source/developers/overview.rst +++ b/docs/source/developers/overview.rst @@ -75,7 +75,7 @@ checklist for using ``git``: locally, for example if additional commits have been made by a colleague. By using ``--force-with-lease`` instead of ``--force``, you ensure those commits are not overwritten and can fetch those changes if desired. - + .. dropdown:: Setting rebase to be default :animate: fade-in-slide-down :class-container: sd-shadow-none @@ -202,4 +202,3 @@ Implementations that do not intend to implement cross endian support: For other libraries, a discussion to gather consensus on the mailing-list should be had before submitting PRs. - diff --git a/docs/source/developers/release.rst b/docs/source/developers/release.rst index e7431ce0fb7b9..0b3a83dc5aabe 100644 --- a/docs/source/developers/release.rst +++ b/docs/source/developers/release.rst @@ -80,10 +80,10 @@ Ensure local tags are removed, gpg-agent is set and JIRA tickets are correctly a # Delete the local tag for RC1 or later git tag -d apache-arrow- - + # Setup gpg agent for signing artifacts source dev/release/setup-gpg-agent.sh - + # Curate the release # The end of the generated report shows the JIRA tickets with wrong version number assigned. archery release curate @@ -180,7 +180,7 @@ Create the Release Candidate branch from the updated maintenance branch # Start from the updated maintenance branch. 
git checkout maint-X.Y.Z - + # The following script will create a branch for the Release Candidate, # place the necessary commits updating the version number and then create a git tag # on OSX use gnu-sed with homebrew: brew install gnu-sed (and export to $PATH) @@ -188,7 +188,7 @@ Create the Release Candidate branch from the updated maintenance branch # starts at 0 and increments every time the Release Candidate is burned # so for the first RC this would be: dev/release/01-prepare.sh 4.0.0 5.0.0 0 dev/release/01-prepare.sh - + # Push the release tag (for RC1 or later the --force flag is required) git push -u apache apache-arrow- # Push the release candidate branch in order to trigger verification jobs later @@ -201,23 +201,23 @@ Build source and binaries and submit them # Build the source release tarball and create Pull Request with verification tasks dev/release/02-source.sh - + # Submit binary tasks using crossbow, the command will output the crossbow build id dev/release/03-binary-submit.sh - + # Wait for the crossbow jobs to finish archery crossbow status - + # Download the produced binaries # This will download packages to a directory called packages/release--rc dev/release/04-binary-download.sh - + # Sign and upload the binaries # # On macOS the only way I could get this to work was running "echo "UPDATESTARTUPTTY" | gpg-connect-agent" before running this comment # otherwise I got errors referencing "ioctl" errors. dev/release/05-binary-upload.sh - + # Sign and upload the Java artifacts # # Note that you need to press the "Close" button manually by Web interface diff --git a/docs/source/developers/release_verification.rst b/docs/source/developers/release_verification.rst index 8c301b44a3c42..afd220db6010d 100644 --- a/docs/source/developers/release_verification.rst +++ b/docs/source/developers/release_verification.rst @@ -55,7 +55,7 @@ and test the result on their own platform in order to cast a +1 vote. # this will create and automatically clean up a temporary directory for the verification environment and will run the source verification TEST_DEFAULT=0 TEST_SOURCE=1 verify-release-candidate.sh $VERSION $RC_NUM - + # to verify only certain implementations use the TEST_DEFAULT=0 and TEST_* variables # here are a couple of examples, but see the source code for the available options TEST_DEFAULT=0 TEST_CPP=1 verify-release-candidate.sh $VERSION $RC_NUM # only C++ tests diff --git a/docs/source/developers/reviewing.rst b/docs/source/developers/reviewing.rst index b6e0c1f4023bd..1550d6aa7ce61 100644 --- a/docs/source/developers/reviewing.rst +++ b/docs/source/developers/reviewing.rst @@ -260,14 +260,14 @@ Social aspects Labelling ========= -While reviewing PRs, we should try to identify whether the corresponding issue +While reviewing PRs, we should try to identify whether the corresponding issue needs to be marked with one or both of the following issue labels: * **Critical Fix**: The change fixes either: (a) a security vulnerability; (b) a bug that causes incorrect or invalid data to be produced; or (c) a bug that causes a crash (while the API contract is upheld). This is intended to mark fixes to issues that may affect users without their - knowledge. For this reason, fixing bugs that cause errors don't count, since + knowledge. For this reason, fixing bugs that cause errors don't count, since those bugs are usually obvious. Bugs that cause crashes are considered critical because they are a possible vector of Denial-of-Service attacks. 
* **Breaking Change**: The change breaks backwards compatibility in a public API. @@ -275,7 +275,7 @@ needs to be marked with one or both of the following issue labels: compatibility, except for the few places where we do guarantee ABI compatibility (such as C Data Interface). Experimental APIs are *not* exempt from this; they are just more likely to be associated with this tag. - + Breaking changes and critical fixes are separate: breaking changes alter the API contract, while critical fixes make the implementation align with the existing API contract. For example, fixing a bug that caused a Parquet reader diff --git a/docs/source/format/CDataInterface/PyCapsuleInterface.rst b/docs/source/format/CDataInterface/PyCapsuleInterface.rst index 03095aa2e9356..67f77f53f012b 100644 --- a/docs/source/format/CDataInterface/PyCapsuleInterface.rst +++ b/docs/source/format/CDataInterface/PyCapsuleInterface.rst @@ -64,7 +64,7 @@ structures should be wrapped in capsules. Capsules avoid invalid access by attaching a name to the pointer and avoid memory leaks by attaching a destructor. Thus, they are much safer than passing pointers as integers. -`PyCapsule`_ allows for a ``name`` to be associated with the capsule, allowing +`PyCapsule`_ allows for a ``name`` to be associated with the capsule, allowing consumers to verify that the capsule contains the expected kind of data. To make sure Arrow structures are recognized, the following names must be used: @@ -133,8 +133,8 @@ Arrays and record batches (contiguous tables) can implement the method Export the object as a pair of ArrowSchema and ArrowArray structures. - :param requested_schema: A PyCapsule containing a C ArrowSchema representation - of a requested schema. Conversion to this schema is best-effort. See + :param requested_schema: A PyCapsule containing a C ArrowSchema representation + of a requested schema. Conversion to this schema is best-effort. See `Schema Requests`_. :type requested_schema: PyCapsule or None @@ -152,8 +152,8 @@ Tables / DataFrames and streams can implement the method ``__arrow_c_stream__``. Export the object as an ArrowArrayStream. - :param requested_schema: A PyCapsule containing a C ArrowSchema representation - of a requested schema. Conversion to this schema is best-effort. See + :param requested_schema: A PyCapsule containing a C ArrowSchema representation + of a requested schema. Conversion to this schema is best-effort. See `Schema Requests`_. :type requested_schema: PyCapsule or None @@ -192,7 +192,7 @@ schema transformations. Protocol Typehints ------------------ -The following typehints can be copied into your library to annotate that a +The following typehints can be copied into your library to annotate that a function accepts an object implementing one of these protocols. .. code-block:: python @@ -248,7 +248,7 @@ Below is the code to create a PyCapsule for an ``ArrowSchema``. The code for } free(schema); } - + PyObject* ExportArrowSchemaPyCapsule() { struct ArrowSchema* schema = (struct ArrowSchema*)malloc(sizeof(struct ArrowSchema)); @@ -270,9 +270,9 @@ Below is the code to create a PyCapsule for an ``ArrowSchema``. The code for ) if schema.release != NULL: schema.release(schema) - + free(schema) - + cdef object export_arrow_schema_py_capsule(): cdef ArrowSchema* schema = malloc(sizeof(ArrowSchema)) # It's recommended to immediately wrap the struct in a capsule, so @@ -305,7 +305,7 @@ code for ``ArrowArray`` and ``ArrowArrayStream`` is similar. .. 
code-block:: c #include - + // If the capsule is not an ArrowSchema, will return NULL and set an exception. struct ArrowSchema* GetArrowSchemaPyCapsule(PyObject* capsule) { return PyCapsule_GetPointer(capsule, "arrow_schema"); @@ -316,7 +316,7 @@ code for ``ArrowArray`` and ``ArrowArrayStream`` is similar. .. code-block:: cython cimport cpython - + cdef ArrowSchema* get_arrow_schema_py_capsule(object capsule) except NULL: return cpython.PyCapsule_GetPointer(capsule, 'arrow_schema') @@ -429,7 +429,7 @@ implementing the DataFrame Interchange Protocol. Comparison to ``__arrow_array__`` protocol ------------------------------------------ -The :ref:`arrow_array_protocol` protocol is a dunder method that +The :ref:`arrow_array_protocol` protocol is a dunder method that defines how PyArrow should import an object as an Arrow array. Unlike this protocol, it is specific to PyArrow and isn't used by other libraries. It is -also limited to arrays and does not support schemas, tabular structures, or streams. \ No newline at end of file +also limited to arrays and does not support schemas, tabular structures, or streams. diff --git a/docs/source/format/Glossary.rst b/docs/source/format/Glossary.rst index 3f2f118a95d6d..11c19c5fa70e9 100644 --- a/docs/source/format/Glossary.rst +++ b/docs/source/format/Glossary.rst @@ -211,7 +211,7 @@ Glossary its bindings, and Go). .. image:: ../cpp/tables-versus-record-batches.svg - :alt: A graphical representation of an Arrow Table and a + :alt: A graphical representation of an Arrow Table and a Record Batch, with structure as described in text above. .. seealso:: :term:`chunked array`, :term:`record batch` diff --git a/docs/source/format/Integration.rst b/docs/source/format/Integration.rst index 1a9b1b97f07ee..c800255687796 100644 --- a/docs/source/format/Integration.rst +++ b/docs/source/format/Integration.rst @@ -501,14 +501,14 @@ integration testing actually tests. There are two types of integration test cases: the ones populated on the fly by the data generator in the Archery utility, and *gold* files that exist -in the `arrow-testing ` +in the `arrow-testing ` repository. Data Generator Tests ~~~~~~~~~~~~~~~~~~~~ This is the high-level description of the cases which are generated and -tested using the ``archery integration`` command (see ``get_generated_json_files`` +tested using the ``archery integration`` command (see ``get_generated_json_files`` in ``datagen.py``): * Primitive Types @@ -549,7 +549,7 @@ Gold File Integration Tests Pre-generated json and arrow IPC files (both file and stream format) exist in the `arrow-testing `__ repository in the ``data/arrow-ipc-stream/integration`` directory. These serve as -*gold* files that are assumed to be correct for use in testing. They are +*gold* files that are assumed to be correct for use in testing. They are referenced by ``runner.py`` in the code for the :ref:`Archery ` utility. Below are the test cases which are covered by them: @@ -563,7 +563,7 @@ utility. Below are the test cases which are covered by them: + intervals + maps + nested types (list, struct) - + primitives + + primitives + primitive with no batches + primitive with zero length batches diff --git a/docs/source/java/algorithm.rst b/docs/source/java/algorithm.rst index 316fd38fa0990..06ed32bd48cf7 100644 --- a/docs/source/java/algorithm.rst +++ b/docs/source/java/algorithm.rst @@ -20,12 +20,12 @@ Java Algorithms Arrow's Java library provides algorithms for some commonly-used functionalities. 
The algorithms are provided in the ``org.apache.arrow.algorithm`` -package of the ``algorithm`` module. +package of the ``algorithm`` module. Comparing Vector Elements ------------------------- -Comparing vector elements is the basic for many algorithms. Vector +Comparing vector elements is the basic for many algorithms. Vector elements can be compared in one of the two ways: 1. **Equality comparison**: there are two possible results for this type of comparisons: ``equal`` and ``unequal``. @@ -36,30 +36,30 @@ interface. and ``greater than``. This comparison is supported by the abstract class ``org.apache.arrow.algorithm.sort.VectorValueComparator``. We provide default implementations to compare vector elements. However, users can also define ways -for customized comparisons. +for customized comparisons. Vector Element Search --------------------- -A search algorithm tries to find a particular value in a vector. When successful, a vector index is +A search algorithm tries to find a particular value in a vector. When successful, a vector index is returned; otherwise, a ``-1`` is returned. The following search algorithms are provided: -1. **Linear search**: this algorithm simply traverses the vector from the beginning, until a match is +1. **Linear search**: this algorithm simply traverses the vector from the beginning, until a match is found, or the end of the vector is reached. So it takes ``O(n)`` time, where ``n`` is the number of elements in the vector. This algorithm is implemented in ``org.apache.arrow.algorithm.search.VectorSearcher#linearSearch``. -2. **Binary search**: this represents a more efficient search algorithm, as it runs in ``O(log(n))`` time. +2. **Binary search**: this represents a more efficient search algorithm, as it runs in ``O(log(n))`` time. However, it is only applicable to sorted vectors. To get a sorted vector, one can use one of our sorting algorithms, which will be discussed in the next section. This algorithm is implemented in ``org.apache.arrow.algorithm.search.VectorSearcher#binarySearch``. 3. **Parallel search**: when the vector is large, it takes a long time to traverse the elements to search -for a value. To make this process faster, one can split the vector into multiple partitions, and perform the +for a value. To make this process faster, one can split the vector into multiple partitions, and perform the search for each partition in parallel. This is supported by ``org.apache.arrow.algorithm.search.ParallelSearcher``. -4. **Range search**: for many scenarios, there can be multiple matching values in the vector. +4. **Range search**: for many scenarios, there can be multiple matching values in the vector. If the vector is sorted, the matching values reside in a contiguous region in the vector. The -range search algorithm tries to find the upper/lower bound of the region in ``O(log(n))`` time. +range search algorithm tries to find the upper/lower bound of the region in ``O(log(n))`` time. An implementation is provided in ``org.apache.arrow.algorithm.search.VectorRangeSearcher``. Vector Sorting @@ -72,19 +72,19 @@ classified into the following categories: 1. **In-place sorter**: an in-place sorter performs the sorting by manipulating the original vector, without creating any new vector. So it just returns the original vector after the sorting operations. Currently, we have ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter`` for in-place -sorting in ``O(nlog(n))`` time. As the name suggests, it only supports fixed width vectors. 
+sorting in ``O(nlog(n))`` time. As the name suggests, it only supports fixed width vectors. 2. **Out-of-place sorter**: an out-of-place sorter does not mutate the original vector. Instead, it copies vector elements to a new vector in sorted order, and returns the new vector. -We have ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter.FixedWidthOutOfPlaceVectorSorter`` +We have ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter.FixedWidthOutOfPlaceVectorSorter`` and ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter.VariableWidthOutOfPlaceVectorSorter`` -for fixed width and variable width vectors, respectively. Both algorithms run in ``O(nlog(n))`` time. +for fixed width and variable width vectors, respectively. Both algorithms run in ``O(nlog(n))`` time. 3. **Index sorter**: this sorter does not actually sort the vector. Instead, it returns an integer vector, which correspond to indices of vector elements in sorted order. With the index vector, one can easily construct a sorted vector. In addition, some other tasks can be easily achieved, like finding the ``k``th -smallest value in the vector. Index sorting is supported by ``org.apache.arrow.algorithm.sort.IndexSorter``, -which runs in ``O(nlog(n))`` time. It is applicable to vectors of any type. +smallest value in the vector. Index sorting is supported by ``org.apache.arrow.algorithm.sort.IndexSorter``, +which runs in ``O(nlog(n))`` time. It is applicable to vectors of any type. Other Algorithms ---------------- diff --git a/docs/source/java/flight.rst b/docs/source/java/flight.rst index e009998be4f4e..6d26583aeefa6 100644 --- a/docs/source/java/flight.rst +++ b/docs/source/java/flight.rst @@ -184,7 +184,7 @@ Handshake-based authentication can be enabled by implementing ``ServerAuthHandler``. Authentication consists of two parts: on initial client connection, the server and client authentication implementations can perform any negotiation needed. The client authentication -handler then provides a token that will be attached to future calls. +handler then provides a token that will be attached to future calls. The client send data to be validated through ``ClientAuthHandler.authenticate`` The server validate data received through ``ServerAuthHandler.authenticate``. diff --git a/docs/source/java/flight_sql_jdbc_driver.rst b/docs/source/java/flight_sql_jdbc_driver.rst index 0ace2185983a9..cc8822247b007 100644 --- a/docs/source/java/flight_sql_jdbc_driver.rst +++ b/docs/source/java/flight_sql_jdbc_driver.rst @@ -169,8 +169,8 @@ when using the JDBC Driver Manager to connect. When supplying using the Properties object, values should *not* be URI-encoded. Parameters specified by the URI supercede parameters supplied by the -Properties object. When calling the `user/password overload of -DriverManager#getConnection() +Properties object. When calling the `user/password overload of +DriverManager#getConnection() `_, the username and password supplied on the URI supercede the username and password arguments to the function call. diff --git a/docs/source/java/memory.rst b/docs/source/java/memory.rst index 036befa148692..8014a27444ac9 100644 --- a/docs/source/java/memory.rst +++ b/docs/source/java/memory.rst @@ -20,7 +20,7 @@ Memory Management ================= The memory modules contain all the functionality that Arrow uses to allocate and deallocate memory. This document is divided in two parts: -The first part, *Memory Basics*, provides a high-level introduction. 
The following section, *Arrow Memory In-Depth*, fills in the details. +The first part, *Memory Basics*, provides a high-level introduction. The following section, *Arrow Memory In-Depth*, fills in the details. .. contents:: @@ -39,7 +39,7 @@ Getting Started Arrow's memory management is built around the needs of the columnar format and using off-heap memory. Arrow Java has its own independent implementation. It does not wrap the C++ implementation, although the framework is flexible enough -to be used with memory allocated in C++ that is used by Java code. +to be used with memory allocated in C++ that is used by Java code. Arrow provides multiple modules: the core interfaces, and implementations of the interfaces. Users need the core interfaces, and exactly one of the implementations. @@ -67,9 +67,9 @@ Why Arrow Uses Direct Memory BufferAllocator --------------- -The `BufferAllocator`_ is primarily an arena or nursery used for accounting of buffers (ArrowBuf instances). -As the name suggests, it can allocate new buffers associated with itself, but it can also -handle the accounting for buffers allocated elsewhere. For example, it handles the Java-side accounting for +The `BufferAllocator`_ is primarily an arena or nursery used for accounting of buffers (ArrowBuf instances). +As the name suggests, it can allocate new buffers associated with itself, but it can also +handle the accounting for buffers allocated elsewhere. For example, it handles the Java-side accounting for memory allocated in C++ and shared with Java using the C-Data Interface. In the code below it performs an allocation: .. code-block:: Java @@ -100,21 +100,21 @@ memory from a child allocator, those allocations are also reflected in all paren effectively sets the program-wide memory limit, and serves as the master bookkeeper for all memory allocations. Child allocators are not strictly required, but can help better organize code. For instance, a lower memory limit can -be set for a particular section of code. The child allocator can be closed when that section completes, -at which point it checks that that section didn't leak any memory. +be set for a particular section of code. The child allocator can be closed when that section completes, +at which point it checks that that section didn't leak any memory. Child allocators can also be named, which makes it easier to tell where an ArrowBuf came from during debugging. Reference counting ------------------ -Because direct memory is expensive to allocate and deallocate, allocators may share direct buffers. To managed shared buffers -deterministically, we use manual reference counting instead of the garbage collector. +Because direct memory is expensive to allocate and deallocate, allocators may share direct buffers. To managed shared buffers +deterministically, we use manual reference counting instead of the garbage collector. This simply means that each buffer has a counter keeping track of the number of references to the buffer, and the user is responsible for properly incrementing/decrementing the counter as the buffer is used. In Arrow, each ArrowBuf has an associated `ReferenceManager`_ that tracks the reference count. You can retrieve -it with ArrowBuf.getReferenceManager(). The reference count is updated using `ReferenceManager.release`_ to decrement the count, -and `ReferenceManager.retain`_ to increment it. +it with ArrowBuf.getReferenceManager(). 
The reference count is updated using `ReferenceManager.release`_ to decrement the count, +and `ReferenceManager.retain`_ to increment it. Of course, this is tedious and error-prone, so instead of directly working with buffers, we typically use higher-level APIs like ValueVector. Such classes generally implement Closeable/AutoCloseable and will automatically @@ -289,7 +289,7 @@ Finally, enabling the ``TRACE`` logging level will automatically provide this st | at (#8:1) Sometimes, explicitly passing allocators around is difficult. For example, it -can be hard to pass around extra state, like an allocator, through layers of +can be hard to pass around extra state, like an allocator, through layers of existing application or framework code. A global or singleton allocator instance can be useful here, though it should not be your first choice. @@ -370,7 +370,7 @@ Arrow’s memory model is based on the following basic concepts: leaks. - The same physical memory can be shared by multiple allocators and the allocator must provide an accounting paradigm for this purpose. - + Reserving Memory ---------------- @@ -384,17 +384,17 @@ Arrow provides two different ways to reserve memory: - ``AllocationReservation`` via BufferAllocator.newReservation(): Allows a short-term preallocation strategy so that a particular subsystem can ensure future memory is available to support a - particular request. - + particular request. + Reference Counting Details -------------------------- -Typically, the ReferenceManager implementation used is an instance of `BufferLedger`_. -A BufferLedger is a ReferenceManager that also maintains the relationship between an ``AllocationManager``, +Typically, the ReferenceManager implementation used is an instance of `BufferLedger`_. +A BufferLedger is a ReferenceManager that also maintains the relationship between an ``AllocationManager``, a ``BufferAllocator`` and one or more individual ``ArrowBuf``\ s -All ArrowBufs (direct or sliced) related to a single BufferLedger/BufferAllocator combination -share the same reference count and either all will be valid or all will be invalid. +All ArrowBufs (direct or sliced) related to a single BufferLedger/BufferAllocator combination +share the same reference count and either all will be valid or all will be invalid. For simplicity of accounting, we treat that memory as being used by one of the BufferAllocators associated with the memory. When that allocator releases its claim on that memory, the memory ownership is then moved to @@ -411,7 +411,7 @@ There are several Allocator types in Arrow Java: - ``ChildAllocator`` - A child allocator that derives from the root allocator Many BufferAllocators can reference the same piece of physical memory at the same -time. It is the AllocationManager’s responsibility to ensure that in this situation, +time. It is the AllocationManager’s responsibility to ensure that in this situation, all memory is accurately accounted for from the Root’s perspective and also to ensure that the memory is correctly released once all BufferAllocators have stopped using that memory. diff --git a/docs/source/java/quickstartguide.rst b/docs/source/java/quickstartguide.rst index e358681c57830..a71ddc5b5e55f 100644 --- a/docs/source/java/quickstartguide.rst +++ b/docs/source/java/quickstartguide.rst @@ -313,4 +313,4 @@ Example: Read the dataset from the previous example from an Arrow IPC file (rand More examples available at `Arrow Java Cookbook`_. -.. 
_`Arrow Java Cookbook`: https://arrow.apache.org/cookbook/java \ No newline at end of file +.. _`Arrow Java Cookbook`: https://arrow.apache.org/cookbook/java diff --git a/docs/source/java/vector.rst b/docs/source/java/vector.rst index abbbd1a236d6d..1c3e123cf50fb 100644 --- a/docs/source/java/vector.rst +++ b/docs/source/java/vector.rst @@ -226,7 +226,7 @@ A :class:`ListVector` is a vector that holds a list of values for each index. Wo For example, the code below shows how to build a :class:`ListVector` of int's using the writer :class:`UnionListWriter`. We build a vector from 0 to 9 and each index contains a list with values [[0, 0, 0, 0, 0], [0, 1, 2, 3, 4], [0, 2, 4, 6, 8], …, [0, 9, 18, 27, 36]]. List values can be added in any order so writing a list such as [3, 1, 2] would be just as valid. .. code-block:: Java - + try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); ListVector listVector = ListVector.empty("vector", allocator)) { UnionListWriter writer = listVector.getWriter(); @@ -240,7 +240,7 @@ For example, the code below shows how to build a :class:`ListVector` of int's us writer.endList(); } listVector.setValueCount(10); - } + } :class:`ListVector` values can be accessed either through the get API or through the reader class :class:`UnionListReader`. To read all the values, first enumerate through the indexes, and then enumerate through the inner list values. diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index 928c607d139ce..ae48578a1bd61 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -52,10 +52,10 @@ Aggregations Cumulative Functions -------------------- -Cumulative functions are vector functions that perform a running accumulation on -their input using a given binary associative operation with an identity element -(a monoid) and output an array containing the corresponding intermediate running -values. The input is expected to be of numeric type. By default these functions +Cumulative functions are vector functions that perform a running accumulation on +their input using a given binary associative operation with an identity element +(a monoid) and output an array containing the corresponding intermediate running +values. The input is expected to be of numeric type. By default these functions do not detect overflow. They are also available in an overflow-checking variant, suffixed ``_checked``, which throws an ``ArrowInvalid`` exception when overflow is detected. diff --git a/docs/source/python/api/substrait.rst b/docs/source/python/api/substrait.rst index 66e88fcd279ae..1556be9dbd011 100644 --- a/docs/source/python/api/substrait.rst +++ b/docs/source/python/api/substrait.rst @@ -50,4 +50,4 @@ Utility .. autosummary:: :toctree: ../generated/ - get_supported_functions \ No newline at end of file + get_supported_functions diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index c02059a4f8faa..ce3dfabb0e689 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -23,7 +23,7 @@ Compute Functions ================= Arrow supports logical compute operations over inputs of possibly -varying types. +varying types. The standard compute operations are provided by the :mod:`pyarrow.compute` module and can be used directly:: @@ -91,7 +91,7 @@ Grouped Aggregations ==================== PyArrow supports grouped aggregations over :class:`pyarrow.Table` through the -:meth:`pyarrow.Table.group_by` method. +:meth:`pyarrow.Table.group_by` method. 
The method will return a grouping declaration to which the hash aggregation functions can be applied:: @@ -300,7 +300,7 @@ Filtering by Expressions :class:`.Table` and :class:`.Dataset` can both be filtered using a boolean :class:`.Expression`. -The expression can be built starting from a +The expression can be built starting from a :func:`pyarrow.compute.field`. Comparisons and transformations can then be applied to one or more fields to build the filter expression you care about. @@ -325,7 +325,7 @@ in column ``"nums"`` by the ``bit_wise_and`` operation equals ``0``. Only the numbers where the last bit was ``0`` will return a ``0`` as the result of ``num & 1`` and as all numbers where the last bit is ``0`` are multiples of ``2`` we will be filtering for the even numbers only. - + Once we have our filter, we can provide it to the :meth:`.Table.filter` method to filter our table only for the matching rows: @@ -392,7 +392,7 @@ User-Defined Functions PyArrow allows defining and registering custom compute functions. These functions can then be called from Python as well as C++ (and potentially any other implementation wrapping Arrow C++, such as the R ``arrow`` package) -using their registered function name. +using their registered function name. UDF support is limited to scalar functions. A scalar function is a function which executes elementwise operations on arrays or scalars. In general, the output of a @@ -441,7 +441,7 @@ output type need to be defined. Using :func:`pyarrow.compute.register_scalar_fun function_docs, input_types, output_type) - + The implementation of a user-defined function always takes a first *context* parameter (named ``ctx`` in the example above) which is an instance of @@ -497,9 +497,9 @@ the GCD of one column with the scalar value 30. We will be re-using the category: [["A","B","C","D"]] Note that ``ds.field('')._call(...)`` returns a :func:`pyarrow.compute.Expression`. -The arguments passed to this function call are expressions, not scalar values +The arguments passed to this function call are expressions, not scalar values (notice the difference between :func:`pyarrow.scalar` and :func:`pyarrow.compute.scalar`, -the latter produces an expression). +the latter produces an expression). This expression is evaluated when the projection operator executes it. Projection Expressions diff --git a/docs/source/python/dataset.rst b/docs/source/python/dataset.rst index daab36f9a7be9..00469fd57becf 100644 --- a/docs/source/python/dataset.rst +++ b/docs/source/python/dataset.rst @@ -575,28 +575,28 @@ Partitioning performance considerations Partitioning datasets has two aspects that affect performance: it increases the number of files and it creates a directory structure around the files. Both of these have benefits -as well as costs. Depending on the configuration and the size of your dataset, the costs -can outweigh the benefits. +as well as costs. Depending on the configuration and the size of your dataset, the costs +can outweigh the benefits. -Because partitions split up the dataset into multiple files, partitioned datasets can be -read and written with parallelism. However, each additional file adds a little overhead in -processing for filesystem interaction. It also increases the overall dataset size since +Because partitions split up the dataset into multiple files, partitioned datasets can be +read and written with parallelism. However, each additional file adds a little overhead in +processing for filesystem interaction. 
It also increases the overall dataset size since each file has some shared metadata. For example, each parquet file contains the schema and -group-level statistics. The number of partitions is a floor for the number of files. If -you partition a dataset by date with a year of data, you will have at least 365 files. If -you further partition by another dimension with 1,000 unique values, you will have up to +group-level statistics. The number of partitions is a floor for the number of files. If +you partition a dataset by date with a year of data, you will have at least 365 files. If +you further partition by another dimension with 1,000 unique values, you will have up to 365,000 files. This fine of partitioning often leads to small files that mostly consist of metadata. -Partitioned datasets create nested folder structures, and those allow us to prune which +Partitioned datasets create nested folder structures, and those allow us to prune which files are loaded in a scan. However, this adds overhead to discovering files in the dataset, as we'll need to recursively "list directory" to find the data files. Too fine partitions can cause problems here: Partitioning a dataset by date for a years worth -of data will require 365 list calls to find all the files; adding another column with +of data will require 365 list calls to find all the files; adding another column with cardinality 1,000 will make that 365,365 calls. The most optimal partitioning layout will depend on your data, access patterns, and which -systems will be reading the data. Most systems, including Arrow, should work across a +systems will be reading the data. Most systems, including Arrow, should work across a range of file sizes and partitioning layouts, but there are extremes you should avoid. These guidelines can help avoid some known worst cases: @@ -611,35 +611,35 @@ of file size. Arrow's file writer provides sensible defaults for group sizing in Configuring files open during a write ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -When writing data to the disk, there are a few parameters that can be +When writing data to the disk, there are a few parameters that can be important to optimize the writes, such as the number of rows per file and the maximum number of open files allowed during the write. Set the maximum number of files opened with the ``max_open_files`` parameter of :meth:`write_dataset`. -If ``max_open_files`` is set greater than 0 then this will limit the maximum +If ``max_open_files`` is set greater than 0 then this will limit the maximum number of files that can be left open. This only applies to writing partitioned datasets, where rows are dispatched to the appropriate file depending on their partition values. If an attempt is made to open too many files then the least recently used file will be closed. If this setting is set too low you may end up fragmenting your data into many small files. -If your process is concurrently using other file handlers, either with a -dataset scanner or otherwise, you may hit a system file handler limit. For +If your process is concurrently using other file handlers, either with a +dataset scanner or otherwise, you may hit a system file handler limit. For example, if you are scanning a dataset with 300 files and writing out to 900 files, the total of 1200 files may be over a system limit. (On Linux, this might be a "Too Many Open Files" error.) You can either reduce this ``max_open_files`` setting or increase the file handler limit on your system. 
The default value is 900 which allows some number of files -to be open by the scanner before hitting the default Linux limit of 1024. +to be open by the scanner before hitting the default Linux limit of 1024. -Another important configuration used in :meth:`write_dataset` is ``max_rows_per_file``. +Another important configuration used in :meth:`write_dataset` is ``max_rows_per_file``. Set the maximum number of rows written in each file with the ``max_rows_per_files`` parameter of :meth:`write_dataset`. -If ``max_rows_per_file`` is set greater than 0 then this will limit how many +If ``max_rows_per_file`` is set greater than 0 then this will limit how many rows are placed in any single file. Otherwise there will be no limit and one file will be created in each output directory unless files need to be closed to respect ``max_open_files``. This setting is the primary way to control file size. @@ -653,22 +653,22 @@ Configuring rows per group during a write The volume of data written to the disk per each group can be configured. This configuration includes a lower and an upper bound. -The minimum number of rows required to form a row group is +The minimum number of rows required to form a row group is defined with the ``min_rows_per_group`` parameter of :meth:`write_dataset`. .. note:: - If ``min_rows_per_group`` is set greater than 0 then this will cause the - dataset writer to batch incoming data and only write the row groups to the - disk when sufficient rows have accumulated. The final row group size may be - less than this value if other options such as ``max_open_files`` or + If ``min_rows_per_group`` is set greater than 0 then this will cause the + dataset writer to batch incoming data and only write the row groups to the + disk when sufficient rows have accumulated. The final row group size may be + less than this value if other options such as ``max_open_files`` or ``max_rows_per_file`` force smaller row group sizes. The maximum number of rows allowed per group is defined with the ``max_rows_per_group`` parameter of :meth:`write_dataset`. -If ``max_rows_per_group`` is set greater than 0 then the dataset writer may split -up large incoming batches into multiple row groups. If this value is set then -``min_rows_per_group`` should also be set or else you may end up with very small +If ``max_rows_per_group`` is set greater than 0 then the dataset writer may split +up large incoming batches into multiple row groups. If this value is set then +``min_rows_per_group`` should also be set or else you may end up with very small row groups (e.g. if the incoming row group size is just barely larger than this value). Row groups are built into the Parquet and IPC/Feather formats but don't affect JSON or CSV. @@ -719,7 +719,7 @@ Customizing & inspecting written files By default the dataset API will create files named "part-i.format" where "i" is a integer generated during the write and "format" is the file format specified in the write_dataset call. For simple datasets it may be possible to know which files will be created but for -larger or partitioned datasets it is not so easy. The ``file_visitor`` keyword can be used +larger or partitioned datasets it is not so easy. The ``file_visitor`` keyword can be used to supply a visitor that will be called as each file is created: .. 
ipython:: python diff --git a/docs/source/python/dlpack.rst b/docs/source/python/dlpack.rst index f612ebabde5c9..024c2800e1107 100644 --- a/docs/source/python/dlpack.rst +++ b/docs/source/python/dlpack.rst @@ -90,4 +90,4 @@ Convert a PyArrow CPU array to PyTorch tensor: >>> import torch >>> torch.from_dlpack(array) - tensor([2, 0, 2, 4]) + tensor([2, 0, 2, 4]) diff --git a/docs/source/python/filesystems.rst b/docs/source/python/filesystems.rst index 5309250351d8e..22f983a60c349 100644 --- a/docs/source/python/filesystems.rst +++ b/docs/source/python/filesystems.rst @@ -233,7 +233,7 @@ generate a credentials file in the default location:: To connect to a public bucket without using any credentials, you must pass ``anonymous=True`` to :class:`GcsFileSystem`. Otherwise, the filesystem -will report ``Couldn't resolve host name`` since there are different host +will report ``Couldn't resolve host name`` since there are different host names for authenticated and public access. Example showing how you can read contents from a GCS bucket:: @@ -314,7 +314,7 @@ For example:: # using this to read a partitioned dataset import pyarrow.dataset as ds ds.dataset("data/", filesystem=fs) - + Similarly for Azure Blob Storage:: import adlfs diff --git a/docs/source/python/getstarted.rst b/docs/source/python/getstarted.rst index d38fcadab288f..42e415c40b835 100644 --- a/docs/source/python/getstarted.rst +++ b/docs/source/python/getstarted.rst @@ -37,7 +37,7 @@ in tabular data. Arrow also provides support for various formats to get those tabular data in and out of disk and networks. Most commonly used formats are -Parquet (:ref:`parquet`) and the IPC format (:ref:`ipc`). +Parquet (:ref:`parquet`) and the IPC format (:ref:`ipc`). Creating Arrays and Tables -------------------------- @@ -63,7 +63,7 @@ in tabular data when attached to a column name birthdays_table = pa.table([days, months, years], names=["days", "months", "years"]) - + birthdays_table See :ref:`data` for more details. @@ -75,7 +75,7 @@ Once you have tabular data, Arrow provides out of the box the features to save and restore that data for common formats like Parquet: -.. ipython:: python +.. ipython:: python import pyarrow.parquet as pq @@ -92,14 +92,14 @@ data will be as quick as possible reloaded_birthdays Saving and loading back data in arrow is usually done through -:ref:`Parquet `, :ref:`IPC format ` (:ref:`feather`), +:ref:`Parquet `, :ref:`IPC format ` (:ref:`feather`), :ref:`CSV ` or :ref:`Line-Delimited JSON ` formats. Performing Computations ----------------------- Arrow ships with a bunch of compute functions that can be applied -to its arrays and tables, so through the compute functions +to its arrays and tables, so through the compute functions it's possible to apply transformations to the data .. ipython:: python @@ -122,7 +122,7 @@ smaller chunks import pyarrow.dataset as ds - ds.write_dataset(birthdays_table, "savedir", format="parquet", + ds.write_dataset(birthdays_table, "savedir", format="parquet", partitioning=ds.partitioning( pa.schema([birthdays_table.schema.field("years")]) )) @@ -151,8 +151,8 @@ how to project them, etc., refer to :ref:`dataset` documentation. 
Continuing from here -------------------- -For digging further into Arrow, you might want to read the -:doc:`PyArrow Documentation <./index>` itself or the +For digging further into Arrow, you might want to read the +:doc:`PyArrow Documentation <./index>` itself or the `Arrow Python Cookbook `_ diff --git a/docs/source/python/getting_involved.rst b/docs/source/python/getting_involved.rst index 7b3bcf2ac527a..9fda3c7c78488 100644 --- a/docs/source/python/getting_involved.rst +++ b/docs/source/python/getting_involved.rst @@ -54,7 +54,7 @@ used as foundations to build easier to use entities. exposed to the user are declared. In some cases, those files might directly import the entities from inner implementation if they want to expose it as is without modification. -* The ``lib.pyx`` file is where the majority of the core C++ libarrow +* The ``lib.pyx`` file is where the majority of the core C++ libarrow capabilities are exposed to Python. Most of the implementation of this module relies on included ``*.pxi`` files where the specific pieces are built. While being exposed to Python as ``pyarrow.lib`` its content @@ -73,4 +73,4 @@ used as foundations to build easier to use entities. PyArrow is also based on PyArrow C++, dedicated pieces of code that live in ``python/pyarrow/src/arrow/python`` directory and provide the low level code for capabilities like converting to and from numpy or pandas and the classes - that allow to use Python objects and callbacks in C++. \ No newline at end of file + that allow to use Python objects and callbacks in C++. diff --git a/docs/source/python/integration/python_r.rst b/docs/source/python/integration/python_r.rst index 20627c3782d3c..ec5dfc366fdf9 100644 --- a/docs/source/python/integration/python_r.rst +++ b/docs/source/python/integration/python_r.rst @@ -29,7 +29,7 @@ marshaling and unmarshaling data. The article takes for granted that you have a ``Python`` environment with ``pyarrow`` correctly installed and an ``R`` environment with - ``arrow`` library correctly installed. + ``arrow`` library correctly installed. See `Python Install Instructions `_ and `R Install instructions `_ for further details. @@ -52,7 +52,7 @@ We could save such a function in a ``addthree.R`` file so that we can make it available for reuse. Once the ``addthree.R`` file is created we can invoke any of its functions -from Python using the +from Python using the `rpy2 `_ library which enables a R runtime within the Python interpreter. @@ -91,12 +91,12 @@ to access the ``R`` function and print the expected result: .. code-block:: bash - $ python addthree.py + $ python addthree.py 6 If instead of passing around basic data types we want to pass around Arrow Arrays, we can do so relying on the -`rpy2-arrow `_ +`rpy2-arrow `_ module which implements ``rpy2`` support for Arrow types. ``rpy2-arrow`` can be installed through ``pip``: @@ -189,7 +189,7 @@ Invoking the ``addthree.R`` script will print the outcome of adding .. code-block:: bash - $ R --silent -f addthree.R + $ R --silent -f addthree.R Array [ @@ -219,7 +219,7 @@ necessary to import an Arrow Array in R from the C Data interface. That work will be done by the ``addthree_cdata`` function which invokes the ``addthree`` function once the Array is imported. -Our ``addthree.R`` will thus have both the ``addthree_cdata`` and the +Our ``addthree.R`` will thus have both the ``addthree_cdata`` and the ``addthree`` functions: .. 
code-block:: R @@ -261,7 +261,7 @@ Our ``addthree.py`` will thus become: # Import the pyarrow module that provides access to the C Data interface from pyarrow.cffi import ffi as arrow_c - # Allocate structures where we will export the Array data + # Allocate structures where we will export the Array data # and the Array schema. They will be released when we exit the with block. with arrow_c.new("struct ArrowArray*") as c_array, \ arrow_c.new("struct ArrowSchema*") as c_schema: @@ -274,7 +274,7 @@ Our ``addthree.py`` will thus become: array.type._export_to_c(c_schema_ptr) # Invoke the R addthree_cdata function passing the references - # to the array and schema C Data structures. + # to the array and schema C Data structures. # Those references are passed as strings as R doesn't have # native support for 64bit integers, so the integers are # converted to their string representation for R to convert it back. @@ -289,19 +289,19 @@ Our ``addthree.py`` will thus become: # Once the returned array is exported to a C Data infrastructure # we can import it back into pyarrow using Array._import_from_c py_array = pyarrow.Array._import_from_c(c_array_ptr, c_schema_ptr) - + print("RESULT", py_array) Running the newly changed ``addthree.py`` will now print the Array resulting -from adding ``3`` to all the elements of the original +from adding ``3`` to all the elements of the original ``pyarrow.array((1, 2, 3))`` array: .. code-block:: bash - $ python addthree.py + $ python addthree.py R[write to console]: Attaching package: ‘arrow’ RESULT [ 4, 5, 6 - ] \ No newline at end of file + ] diff --git a/docs/source/python/ipc.rst b/docs/source/python/ipc.rst index 27cd14a68853d..f55e8f8bc5dc3 100644 --- a/docs/source/python/ipc.rst +++ b/docs/source/python/ipc.rst @@ -76,12 +76,12 @@ this one can be created with :func:`~pyarrow.ipc.new_stream`: .. ipython:: python sink = pa.BufferOutputStream() - + with pa.ipc.new_stream(sink, batch.schema) as writer: for i in range(5): writer.write_batch(batch) -Here we used an in-memory Arrow buffer stream (``sink``), +Here we used an in-memory Arrow buffer stream (``sink``), but this could have been a socket or some other IO sink. When creating the ``StreamWriter``, we pass the schema, since the schema @@ -102,7 +102,7 @@ convenience function ``pyarrow.ipc.open_stream``: with pa.ipc.open_stream(buf) as reader: schema = reader.schema batches = [b for b in reader] - + schema len(batches) @@ -126,7 +126,7 @@ The :class:`~pyarrow.RecordBatchFileWriter` has the same API as .. ipython:: python sink = pa.BufferOutputStream() - + with pa.ipc.new_file(sink, batch.schema) as writer: for i in range(10): writer.write_batch(batch) @@ -164,7 +164,7 @@ DataFrame output: with pa.ipc.open_file(buf) as reader: df = reader.read_pandas() - + df[:5] Efficiently Writing and Reading Arrow Data diff --git a/docs/source/python/json.rst b/docs/source/python/json.rst index 99ecbc19a1230..eff6135d895a7 100644 --- a/docs/source/python/json.rst +++ b/docs/source/python/json.rst @@ -21,7 +21,7 @@ Reading JSON files ================== -Arrow supports reading columnar data from line-delimited JSON files. +Arrow supports reading columnar data from line-delimited JSON files. In this context, a JSON file consists of multiple JSON objects, one per line, representing individual data rows. 
For example, this file represents two rows of data with four columns "a", "b", "c", "d": diff --git a/docs/source/python/orc.rst b/docs/source/python/orc.rst index bfa68fc34d895..76c293d742010 100644 --- a/docs/source/python/orc.rst +++ b/docs/source/python/orc.rst @@ -112,7 +112,7 @@ control various settings when writing an ORC file. * ``file_version``, the ORC format version to use. ``'0.11'`` ensures compatibility with older readers, while ``'0.12'`` is the newer one. -* ``stripe_size``, to control the approximate size of data within a column +* ``stripe_size``, to control the approximate size of data within a column stripe. This currently defaults to 64MB. See the :func:`~pyarrow.orc.write_table()` docstring for more details. diff --git a/docs/source/python/parquet.rst b/docs/source/python/parquet.rst index d4717897660b6..029ed4f1a3e15 100644 --- a/docs/source/python/parquet.rst +++ b/docs/source/python/parquet.rst @@ -32,7 +32,7 @@ performance data IO. Apache Arrow is an ideal in-memory transport layer for data that is being read or written with Parquet files. We have been concurrently developing the `C++ -implementation of +implementation of Apache Parquet `_, which includes a native, multithreaded C++ adapter to and from in-memory Arrow data. PyArrow includes Python bindings to this code, which thus enables reading diff --git a/docs/source/python/timestamps.rst b/docs/source/python/timestamps.rst index 64a2a354dddef..cecbd5b595bc7 100644 --- a/docs/source/python/timestamps.rst +++ b/docs/source/python/timestamps.rst @@ -51,8 +51,8 @@ This implies a few things when round-tripping timestamps: #. Timezone information is lost (all timestamps that result from converting from spark to arrow/pandas are "time zone naive"). #. Timestamps are truncated to microseconds. -#. The session time zone might have unintuitive impacts on - translation of timestamp values. +#. The session time zone might have unintuitive impacts on + translation of timestamp values. Spark to Pandas (through Apache Arrow) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -62,8 +62,8 @@ The following cases assume the Spark configuration :: - >>> pdf = pd.DataFrame({'naive': [datetime(2019, 1, 1, 0)], - ... 'aware': [Timestamp(year=2019, month=1, day=1, + >>> pdf = pd.DataFrame({'naive': [datetime(2019, 1, 1, 0)], + ... 'aware': [Timestamp(year=2019, month=1, day=1, ... nanosecond=500, tz=timezone(timedelta(hours=-8)))]}) >>> pdf naive aware @@ -77,7 +77,7 @@ The following cases assume the Spark configuration +-------------------+-------------------+ |2019-01-01 00:00:00|2019-01-01 08:00:00| +-------------------+-------------------+ - + Note that conversion of the aware timestamp is shifted to reflect the time assuming UTC (it represents the same instant in time). For naive timestamps, Spark treats them as being in the system local @@ -129,7 +129,7 @@ session time zone is still PST: |2019-01-01 00:00:00|2019-01-01 00:00:00| +-------------------+-------------------+ - + >>> pst_df.toPandas() naive aware 0 2019-01-01 2019-01-01 @@ -141,7 +141,7 @@ session time zone is still PST: aware 1 non-null datetime64[ns] dtypes: datetime64[ns](2) memory usage: 96.0 bytes - + Notice that, in addition to being a "time zone naive" timestamp, the 'aware' value will now differ when converting to an epoch offset. 
Spark does the conversion by first converting to the session time zone (or system local time zone if @@ -158,9 +158,9 @@ time: >>> (pst_df.toPandas()['aware'][0].timestamp()-pdf['aware'][0].timestamp())/3600 -8.0 -The same type of conversion happens with the data frame converted while -the session time zone was UTC. In this case both naive and aware -represent different instants in time (the naive instant is due to +The same type of conversion happens with the data frame converted while +the session time zone was UTC. In this case both naive and aware +represent different instants in time (the naive instant is due to the change in session time zone between creating data frames): :: @@ -179,9 +179,9 @@ the change in session time zone between creating data frames): Note that the surprising shift for aware doesn't happen when the session time zone is UTC (but the timestamps still become "time zone naive"): - + :: - + >>> spark.conf.set("spark.sql.session.timeZone", "UTC") >>> pst_df.show() +-------------------+-------------------+ @@ -189,7 +189,7 @@ still become "time zone naive"): +-------------------+-------------------+ |2019-01-01 08:00:00|2019-01-01 08:00:00| +-------------------+-------------------+ - + >>> pst_df.toPandas()['aware'][0] Timestamp('2019-01-01 08:00:00') >>> pdf['aware'][0] From 250291500b6a7d5d934901acef708cef2eb1dc08 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Wed, 1 May 2024 14:39:35 +0800 Subject: [PATCH 019/105] GH-41463: [C++] Skip TestConcurrentFillFromScalar for platforms without threading support (#41461) ### Rationale for this change See #41463 and https://github.com/apache/arrow/pull/40237#issuecomment-2084577090 ### What changes are included in this PR? Skip test for platforms that have no threading support. ### Are these changes tested? Change is test. ### Are there any user-facing changes? None. * GitHub Issue: #41463 Authored-by: Ruoxi Sun Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/array/array_test.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index af64908b59582..7e25ad61fa2ea 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -827,6 +827,9 @@ TEST_F(TestArray, TestFillFromScalar) { // GH-40069: Data-race when concurrent calling ArraySpan::FillFromScalar of the same // scalar instance. TEST_F(TestArray, TestConcurrentFillFromScalar) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif for (auto type : TestArrayUtilitiesAgainstTheseTypes()) { ARROW_SCOPED_TRACE("type = ", type->ToString()); for (auto seed : {0u, 0xdeadbeef, 42u}) { From 22f88fa4a8f5ac7250f1845aace5a78d20006ef2 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 1 May 2024 00:12:37 -0800 Subject: [PATCH 020/105] GH-41410: [C++][FS][Azure][Docs] Add AzureFileSystem to Filesystems API reference (#41411) ### Rationale for this change See https://github.com/apache/arrow/issues/41410. ### What changes are included in this PR? Just changes to filesystem.rst. ### Are these changes tested? Yes, locally. ### Are there any user-facing changes? These are those changes. 
* GitHub Issue: #41410 Authored-by: Bryce Mecum Signed-off-by: Sutou Kouhei --- cpp/src/arrow/filesystem/azurefs.h | 17 ++++++++--------- docs/source/cpp/api/filesystem.rst | 9 +++++++++ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 667b4e372ae59..b71a5ae73b2e9 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -141,18 +141,14 @@ struct ARROW_EXPORT AzureOptions { /// /// 1. abfs[s]://[:\@]\.blob.core.windows.net /// [/\[/\]] - /// 2. abfs[s]://\[:\]@\.dfs.core.windows.net - /// [/path] + /// 2. abfs[s]://\[:\]\@\.dfs.core.windows.net[/path] /// 3. abfs[s]://[\]@]\[\<:port\>] /// [/\[/path]] /// 4. abfs[s]://[\]@]\[/path] /// - /// 1. and 2. are compatible with the Azure Data Lake Storage Gen2 URIs: - /// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri - /// - /// 3. is for Azure Blob Storage compatible service including Azurite. - /// - /// 4. is a shorter version of 1. and 2. + /// (1) and (2) are compatible with the Azure Data Lake Storage Gen2 URIs + /// [1], (3) is for Azure Blob Storage compatible service including Azurite, + /// and (4) is a shorter version of (1) and (2). /// /// Note that there is no difference between abfs and abfss. HTTPS is /// used with abfs by default. You can force to use HTTP by specifying @@ -178,6 +174,9 @@ struct ARROW_EXPORT AzureOptions { /// AzureOptions::ConfigureClientSecretCredential() is called. /// * client_secret: You must specify "tenant_id" and "client_id" /// too. AzureOptions::ConfigureClientSecretCredential() is called. + /// + /// [1]: + /// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri static Result FromUri(const Uri& uri, std::string* out_path); static Result FromUri(const std::string& uri, std::string* out_path); @@ -226,7 +225,7 @@ struct ARROW_EXPORT AzureOptions { /// overwriting. /// - When you use the ListBlobs operation without specifying a delimiter, the results /// include both directories and blobs. If you choose to use a delimiter, use only a -/// forward slash (/) -- the only supported delimiter. +/// forward slash (/) \--- the only supported delimiter. /// - If you use the DeleteBlob API to delete a directory, that directory is deleted only /// if it's empty. This means that you can't use the Blob API delete directories /// recursively. diff --git a/docs/source/cpp/api/filesystem.rst b/docs/source/cpp/api/filesystem.rst index 02b12668327f2..599e9fedb60f9 100644 --- a/docs/source/cpp/api/filesystem.rst +++ b/docs/source/cpp/api/filesystem.rst @@ -97,3 +97,12 @@ Google Cloud Storage filesystem .. doxygenclass:: arrow::fs::GcsFileSystem :members: + +Azure filesystem +---------------- + +.. doxygenstruct:: arrow::fs::AzureOptions + :members: + +.. doxygenclass:: arrow::fs::AzureFileSystem + :members: From 281122c018df86601ca675f3941751ddc3a89b3d Mon Sep 17 00:00:00 2001 From: Hyunseok Seo Date: Thu, 2 May 2024 00:18:12 +0800 Subject: [PATCH 021/105] GH-41306: [C++] Check to avoid copying when NullBitmapBuffer is Null (#41452) ### Rationale for this change This PR addresses a bug with the `FixedSizeBinary` type where it does not cast to a `Binary` type after being sliced. When slicing occurs, the offset is modified. If the resulting sliced data structure does not contain any `null` values, the Null Bitmap Buffer may be set to `null`. 
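For illustration, here is a minimal PyArrow sketch of the failure mode described in this rationale; the values mirror the C++ test added in this PR, but the snippet itself is not part of the change and the exact behavior before the fix is an assumption based on the report:

```python
import pyarrow as pa

# Assumed reproduction sketch: a fixed-size binary array with no nulls carries a
# NULL validity (null bitmap) buffer; slicing it produces a non-zero offset, which
# is the combination the cast kernel mishandled.
arr = pa.array([b"foo", b"bar", b"baz", b"quu"], type=pa.binary(3))
sliced = arr.slice(1, 3)         # offset != 0, and no validity buffer was allocated
out = sliced.cast(pa.binary())   # crashed before this fix; with it, yields b"bar", b"baz", b"quu"
print(out)
```
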
Currently, when a `Cast` operation is attempted on such a data structure, the code erroneously tries to access the Null Bitmap Buffer even when it is `null`. This leads to an `EXC_BAD_ACCESS` error. This PR implements a fix to prevent this erroneous behavior by adding checks before accessing the Null Bitmap Buffer. ### What changes are included in this PR? - Add a null check for the Null Bitmap Buffer when casting from `FixedSizeBinary` to `Binary` to prevent access violations if the buffer is null. ### Are these changes tested? Yes ### Are there any user-facing changes? Yes (Pyarrow side) * GitHub Issue: #41306 Authored-by: Hyunseok Seo Signed-off-by: Weston Pace --- .../compute/kernels/scalar_cast_string.cc | 13 ++++++--- .../arrow/compute/kernels/scalar_cast_test.cc | 27 +++++++++++++++++++ 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index 3a8352a9b870f..dc3fe29a3dfae 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -340,10 +340,15 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou if (input.offset == output->offset) { output->buffers[0] = input.GetBuffer(0); } else { - ARROW_ASSIGN_OR_RAISE( - output->buffers[0], - arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, - input.offset, input.length)); + // When the offsets are different (e.g., due to slice operation), we need to check if + // the null bitmap buffer is not null before copying it. The null bitmap buffer can be + // null if the input array value does not contain any null value. + if (input.buffers[0].data != NULLPTR) { + ARROW_ASSIGN_OR_RAISE( + output->buffers[0], + arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, + input.offset, input.length)); + } } // This buffer is preallocated diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index af62b4da2caa5..a6d7f6097b59b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2196,6 +2196,33 @@ TEST(Cast, BinaryOrStringToFixedSizeBinary) { } } +TEST(Cast, FixedSizeBinaryToBinaryOrString) { + for (auto out_type : {utf8(), large_utf8(), binary(), large_binary()}) { + auto valid_input = ArrayFromJSON(fixed_size_binary(3), R"(["foo", null, "bar", + "baz", "quu"])"); + + CheckCast(valid_input, ArrayFromJSON(out_type, R"(["foo", null, "bar", "baz", + "quu"])")); + + auto empty_input = ArrayFromJSON(fixed_size_binary(3), "[]"); + CheckCast(empty_input, ArrayFromJSON(out_type, "[]")); + } +} + +TEST(Cast, FixedSizeBinaryToBinaryOrStringWithSlice) { + for (auto out_type : {utf8(), large_utf8(), binary(), large_binary()}) { + auto valid_input = ArrayFromJSON(fixed_size_binary(3), R"(["foo", null, "bar", + "baz", "quu"])"); + auto sliced = valid_input->Slice(1, 3); + CheckCast(sliced, ArrayFromJSON(out_type, R"([null, "bar", "baz"])")); + + auto valid_input_without_null = ArrayFromJSON(fixed_size_binary(3), R"(["foo", "bar", + "baz", "quu"])"); + auto sliced_without_null = valid_input_without_null->Slice(1, 3); + CheckCast(sliced_without_null, ArrayFromJSON(out_type, R"(["bar", "baz", "quu"])")); + } +} + TEST(Cast, IntToString) { for (auto string_type : {utf8(), large_utf8()}) { CheckCast(ArrayFromJSON(int8(), "[0, 1, 127, -128, null]"), From cc78c7a9bf17ceba7d538b30ddda008daeb1db85 Mon Sep 
17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 2 May 2024 05:54:20 +0900 Subject: [PATCH 022/105] MINOR: [JS] Bump memfs from 4.8.2 to 4.9.2 in /js (#41482) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [memfs](https://github.com/streamich/memfs) from 4.8.2 to 4.9.2.
Release notes

Sourced from memfs's releases.

v4.9.2

4.9.2 (2024-04-30)

Bug Fixes

v4.9.1

4.9.1 (2024-04-27)

Bug Fixes

  • 🐛 use latest json-pack implementation (de54ab5)
Changelog

Sourced from memfs's changelog.

4.9.2 (2024-04-30)

Bug Fixes

4.9.1 (2024-04-27)

Bug Fixes

  • 🐛 use latest json-pack implementation (de54ab5)

4.9.0 (2024-04-27)

Features

  • 🎸 define .scan() CRUD method (921e05d)
  • 🎸 implement .scan() in Node.js CRUD (3d973b7)
  • 🎸 implement .scan() method for FSA CRUD (a148fb8)
Commits
  • 0cc081d chore(release): 4.9.2 [skip ci]
  • a474a47 Merge pull request #1031 from streamich/bump-utils
  • 32cc4da fix: 🐛 bump json-pack
  • eea3b42 fix: 🐛 bump @​jsonjoy.com/util package
  • 7a38617 chore(deps): update peaceiris/actions-gh-pages action to v4 (#1027)
  • b198f40 ci: 🎡 add mirror to Gitlab workflow
  • 4619f16 chore(release): 4.9.1 [skip ci]
  • e5461ae Merge pull request #1028 from streamich/dependencies
  • 0dfd7bb docs: ✏️ describe memfs() helper in docs
  • 0509f15 chore: 🤖 remove /src/json-joy/ folder
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=memfs&package-manager=npm_and_yarn&previous-version=4.8.2&new-version=4.9.2)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- js/package.json | 2 +- js/yarn.lock | 53 +++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/js/package.json b/js/package.json index fee6b342dbd13..7def7986490d6 100644 --- a/js/package.json +++ b/js/package.json @@ -99,7 +99,7 @@ "ix": "5.0.0", "jest": "29.7.0", "jest-silent-reporter": "0.5.0", - "memfs": "4.8.2", + "memfs": "4.9.2", "mkdirp": "3.0.1", "multistream": "4.1.0", "regenerator-runtime": "0.14.1", diff --git a/js/yarn.lock b/js/yarn.lock index b74e4543d9d4e..9daed1af9dd69 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -936,6 +936,26 @@ "@jridgewell/resolve-uri" "^3.1.0" "@jridgewell/sourcemap-codec" "^1.4.14" +"@jsonjoy.com/base64@^1.1.1": + version "1.1.1" + resolved "https://registry.yarnpkg.com/@jsonjoy.com/base64/-/base64-1.1.1.tgz#a717fd8840f7bad49c7fe66cc65db8bcfc4c4dc5" + integrity sha512-LnFjVChaGY8cZVMwAIMjvA1XwQjZ/zIXHyh28IyJkyNkzof4Dkm1+KN9UIm3lHhREH4vs7XwZ0NpkZKnwOtEfg== + +"@jsonjoy.com/json-pack@^1.0.3": + version "1.0.3" + resolved "https://registry.yarnpkg.com/@jsonjoy.com/json-pack/-/json-pack-1.0.3.tgz#a68cbe3ccfd85d26cd763e4175fe90c9ee383d33" + integrity sha512-Q0SPAdmK6s5Fe3e1kcNvwNyk6e2+CxM8XZdGbf4abZG7nUO05KSie3/iX29loTBuY+75uVP6RixDSPVpotfzmQ== + dependencies: + "@jsonjoy.com/base64" "^1.1.1" + "@jsonjoy.com/util" "^1.1.2" + hyperdyperid "^1.2.0" + thingies "^1.20.0" + +"@jsonjoy.com/util@^1.1.2": + version "1.1.2" + resolved "https://registry.yarnpkg.com/@jsonjoy.com/util/-/util-1.1.2.tgz#5072c27ecdb16d1ed7a2d125a1d0ed8aba01d652" + integrity sha512-HOGa9wtE6LEz2I5mMQ2pMSjth85PmD71kPbsecs02nEUq3/Kw0wRK3gmZn5BCEB8mFLXByqPxjHgApoMwIPMKQ== + "@nodelib/fs.scandir@2.1.5": version "2.1.5" resolved "https://registry.yarnpkg.com/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz#7619c2eb21b25483f6d167548b4cfd5a7488c3d5" @@ -4018,6 +4038,11 @@ human-signals@^2.1.0: resolved "https://registry.yarnpkg.com/human-signals/-/human-signals-2.1.0.tgz#dc91fcba42e4d06e4abaed33b3e7a3c02f514ea0" integrity sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw== +hyperdyperid@^1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/hyperdyperid/-/hyperdyperid-1.2.0.tgz#59668d323ada92228d2a869d3e474d5a33b69e6b" + integrity sha512-Y93lCzHYgGWdrJ66yIktxiaGULYc6oGiABxhcO5AufBeOyoIdZF7bIfLaOrbM0iGIOXQQgxxRrFEnb+Y6w1n4A== + ignore@^5.2.0, ignore@^5.2.4, ignore@^5.3.1: version "5.3.1" resolved "https://registry.yarnpkg.com/ignore/-/ignore-5.3.1.tgz#5073e554cd42c5b33b394375f538b8593e34d4ef" @@ -5165,11 +5190,14 @@ matchdep@^2.0.0: resolve "^1.4.0" stack-trace "0.0.10" -memfs@4.8.2: - version "4.8.2" - resolved "https://registry.yarnpkg.com/memfs/-/memfs-4.8.2.tgz#9bb7c3e43647348451082557f05fb170b7442949" - integrity sha512-j4WKth315edViMBGkHW6NTF0QBjsTrcRDmYNcGsPq+ozMEyCCCIlX2d2mJ5wuh6iHvJ3FevUrr48v58YRqVdYg== +memfs@4.9.2: + version "4.9.2" + resolved "https://registry.yarnpkg.com/memfs/-/memfs-4.9.2.tgz#42e7b48207268dad8c9c48ea5d4952c5d3840433" + integrity sha512-f16coDZlTG1jskq3mxarwB+fGRrd0uXWt+o1WIhRfOwbXQZqUDsTVxQBFK9JjRQHblg8eAG2JSbprDXKjc7ijQ== dependencies: + "@jsonjoy.com/json-pack" "^1.0.3" + "@jsonjoy.com/util" "^1.1.2" + sonic-forest "^1.0.0" tslib "^2.0.0" memoizee@0.4.X: @@ -6386,6 +6414,13 @@ snapdragon@^0.8.1: source-map-resolve "^0.5.0" use "^3.1.0" +sonic-forest@^1.0.0: + version "1.0.2" + resolved 
"https://registry.yarnpkg.com/sonic-forest/-/sonic-forest-1.0.2.tgz#d80aa621d1cffe75a606ca44789ccff30f5b9ce6" + integrity sha512-2rICdwIJi5kVlehMUVtJeHn3ohh5YZV4pDv0P0c1M11cRz/gXNViItpM94HQwfvnXuzybpqK0LZJgTa3lEwtAw== + dependencies: + tree-dump "^1.0.0" + source-map-resolve@^0.5.0: version "0.5.3" resolved "https://registry.yarnpkg.com/source-map-resolve/-/source-map-resolve-0.5.3.tgz#190866bece7553e1f8f267a2ee82c606b5509a1a" @@ -6755,6 +6790,11 @@ textextensions@^3.2.0: resolved "https://registry.yarnpkg.com/textextensions/-/textextensions-3.3.0.tgz#03530d5287b86773c08b77458589148870cc71d3" integrity sha512-mk82dS8eRABNbeVJrEiN5/UMSCliINAuz8mkUwH4SwslkNP//gbEzlWNS5au0z5Dpx40SQxzqZevZkn+WYJ9Dw== +thingies@^1.20.0: + version "1.21.0" + resolved "https://registry.yarnpkg.com/thingies/-/thingies-1.21.0.tgz#e80fbe58fd6fdaaab8fad9b67bd0a5c943c445c1" + integrity sha512-hsqsJsFMsV+aD4s3CWKk85ep/3I9XzYV/IXaSouJMYIoDlgyi11cBhsqYe9/geRfB0YIikBQg6raRaM+nIMP9g== + through2-filter@^3.0.0: version "3.0.0" resolved "https://registry.yarnpkg.com/through2-filter/-/through2-filter-3.0.0.tgz#700e786df2367c2c88cd8aa5be4cf9c1e7831254" @@ -6866,6 +6906,11 @@ totalist@^3.0.0: resolved "https://registry.yarnpkg.com/totalist/-/totalist-3.0.1.tgz#ba3a3d600c915b1a97872348f79c127475f6acf8" integrity sha512-sf4i37nQ2LBx4m3wB74y+ubopq6W/dIzXg0FDGjsYnZHVa1Da8FH853wlL2gtUhg+xJXjfk3kUZS3BRoQeoQBQ== +tree-dump@^1.0.0: + version "1.0.1" + resolved "https://registry.yarnpkg.com/tree-dump/-/tree-dump-1.0.1.tgz#b448758da7495580e6b7830d6b7834fca4c45b96" + integrity sha512-WCkcRBVPSlHHq1dc/px9iOfqklvzCbdRwvlNfxGZsrHqf6aZttfPrd7DJTt6oR10dwUfpFFQeVTkPbBIZxX/YA== + trim-newlines@^4.0.2: version "4.1.1" resolved "https://registry.yarnpkg.com/trim-newlines/-/trim-newlines-4.1.1.tgz#28c88deb50ed10c7ba6dc2474421904a00139125" From 9ce7ab10fbb3937cdcb4800a791c06591523240b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 2 May 2024 05:55:57 +0900 Subject: [PATCH 023/105] MINOR: [JS] Bump rollup from 4.14.3 to 4.17.2 in /js (#41484) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [rollup](https://github.com/rollup/rollup) from 4.14.3 to 4.17.2.
Release notes

Sourced from rollup's releases.

v4.17.2

4.17.2

2024-04-30

Bug Fixes

  • Fix tree-shaking problems when using spread arguments (#5503)

Pull Requests

v4.17.1

4.17.1

2024-04-29

Bug Fixes

  • Prevent infinite recursions for certain constructor invocations (#5500)

Pull Requests

v4.17.0

4.17.0

2024-04-27

Features

  • Track function call arguments to optimize functions only called once or with the same literal values (re-release from 4.16.0) (#5483)

Bug Fixes

  • Reduce browser WASM size to a fraction by changing optimization settings (#5494)

Pull Requests

v4.16.4

... (truncated)

Changelog

Sourced from rollup's changelog.

4.17.2

2024-04-30

Bug Fixes

  • Fix tree-shaking problems when using spread arguments (#5503)

Pull Requests

4.17.1

2024-04-29

Bug Fixes

  • Prevent infinite recursions for certain constructor invocations (#5500)

Pull Requests

4.17.0

2024-04-27

Features

  • Track function call arguments to optimize functions only called once or with the same literal values (re-release from 4.16.0) (#5483)

Bug Fixes

  • Reduce browser WASM size to a fraction by changing optimization settings (#5494)

Pull Requests

4.16.4

2024-04-23

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=rollup&package-manager=npm_and_yarn&previous-version=4.14.3&new-version=4.17.2)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- js/package.json | 2 +- js/yarn.lock | 198 ++++++++++++++++++++++++------------------------ 2 files changed, 100 insertions(+), 100 deletions(-) diff --git a/js/package.json b/js/package.json index 7def7986490d6..8cab229521f79 100644 --- a/js/package.json +++ b/js/package.json @@ -103,7 +103,7 @@ "mkdirp": "3.0.1", "multistream": "4.1.0", "regenerator-runtime": "0.14.1", - "rollup": "4.14.3", + "rollup": "4.17.2", "rxjs": "7.8.1", "ts-jest": "29.1.2", "ts-node": "10.9.2", diff --git a/js/yarn.lock b/js/yarn.lock index 9daed1af9dd69..fe483ab8aca36 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -1020,85 +1020,85 @@ estree-walker "^2.0.2" picomatch "^2.3.1" -"@rollup/rollup-android-arm-eabi@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.14.3.tgz#bddf05c3387d02fac04b6b86b3a779337edfed75" - integrity sha512-X9alQ3XM6I9IlSlmC8ddAvMSyG1WuHk5oUnXGw+yUBs3BFoTizmG1La/Gr8fVJvDWAq+zlYTZ9DBgrlKRVY06g== - -"@rollup/rollup-android-arm64@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.14.3.tgz#b26bd09de58704c0a45e3375b76796f6eda825e4" - integrity sha512-eQK5JIi+POhFpzk+LnjKIy4Ks+pwJ+NXmPxOCSvOKSNRPONzKuUvWE+P9JxGZVxrtzm6BAYMaL50FFuPe0oWMQ== - -"@rollup/rollup-darwin-arm64@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.14.3.tgz#c5f3fd1aa285b6d33dda6e3f3ca395f8c37fd5ca" - integrity sha512-Od4vE6f6CTT53yM1jgcLqNfItTsLt5zE46fdPaEmeFHvPs5SjZYlLpHrSiHEKR1+HdRfxuzXHjDOIxQyC3ptBA== - -"@rollup/rollup-darwin-x64@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.14.3.tgz#8e4673734d7dc9d68f6d48e81246055cda0e840f" - integrity sha512-0IMAO21axJeNIrvS9lSe/PGthc8ZUS+zC53O0VhF5gMxfmcKAP4ESkKOCwEi6u2asUrt4mQv2rjY8QseIEb1aw== - -"@rollup/rollup-linux-arm-gnueabihf@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.14.3.tgz#53ed38eb13b58ababdb55a7f66f0538a7f85dcba" - integrity sha512-ge2DC7tHRHa3caVEoSbPRJpq7azhG+xYsd6u2MEnJ6XzPSzQsTKyXvh6iWjXRf7Rt9ykIUWHtl0Uz3T6yXPpKw== - -"@rollup/rollup-linux-arm-musleabihf@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.14.3.tgz#0706ee38330e267a5c9326956820f009cfb21fcd" - integrity sha512-ljcuiDI4V3ySuc7eSk4lQ9wU8J8r8KrOUvB2U+TtK0TiW6OFDmJ+DdIjjwZHIw9CNxzbmXY39wwpzYuFDwNXuw== - -"@rollup/rollup-linux-arm64-gnu@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.14.3.tgz#426fce7b8b242ac5abd48a10a5020f5a468c6cb4" - integrity sha512-Eci2us9VTHm1eSyn5/eEpaC7eP/mp5n46gTRB3Aar3BgSvDQGJZuicyq6TsH4HngNBgVqC5sDYxOzTExSU+NjA== - -"@rollup/rollup-linux-arm64-musl@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.14.3.tgz#65bf944530d759b50d7ffd00dfbdf4125a43406f" - integrity sha512-UrBoMLCq4E92/LCqlh+blpqMz5h1tJttPIniwUgOFJyjWI1qrtrDhhpHPuFxULlUmjFHfloWdixtDhSxJt5iKw== - -"@rollup/rollup-linux-powerpc64le-gnu@4.14.3": - version "4.14.3" - resolved 
"https://registry.yarnpkg.com/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.14.3.tgz#494ba3b31095e9a45df9c3f646d21400fb631a95" - integrity sha512-5aRjvsS8q1nWN8AoRfrq5+9IflC3P1leMoy4r2WjXyFqf3qcqsxRCfxtZIV58tCxd+Yv7WELPcO9mY9aeQyAmw== - -"@rollup/rollup-linux-riscv64-gnu@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.14.3.tgz#8b88ed0a40724cce04aa15374ebe5ba4092d679f" - integrity sha512-sk/Qh1j2/RJSX7FhEpJn8n0ndxy/uf0kI/9Zc4b1ELhqULVdTfN6HL31CDaTChiBAOgLcsJ1sgVZjWv8XNEsAQ== - -"@rollup/rollup-linux-s390x-gnu@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.14.3.tgz#09c9e5ec57a0f6ec3551272c860bb9a04b96d70f" - integrity sha512-jOO/PEaDitOmY9TgkxF/TQIjXySQe5KVYB57H/8LRP/ux0ZoO8cSHCX17asMSv3ruwslXW/TLBcxyaUzGRHcqg== - -"@rollup/rollup-linux-x64-gnu@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.14.3.tgz#197f27fd481ad9c861021d5cbbf21793922a631c" - integrity sha512-8ybV4Xjy59xLMyWo3GCfEGqtKV5M5gCSrZlxkPGvEPCGDLNla7v48S662HSGwRd6/2cSneMQWiv+QzcttLrrOA== - -"@rollup/rollup-linux-x64-musl@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.14.3.tgz#5cc0522f4942f2df625e9bfb6fb02c6580ffbce6" - integrity sha512-s+xf1I46trOY10OqAtZ5Rm6lzHre/UiLA1J2uOhCFXWkbZrJRkYBPO6FhvGfHmdtQ3Bx793MNa7LvoWFAm93bg== - -"@rollup/rollup-win32-arm64-msvc@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.14.3.tgz#a648122389d23a7543b261fba082e65fefefe4f6" - integrity sha512-+4h2WrGOYsOumDQ5S2sYNyhVfrue+9tc9XcLWLh+Kw3UOxAvrfOrSMFon60KspcDdytkNDh7K2Vs6eMaYImAZg== - -"@rollup/rollup-win32-ia32-msvc@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.14.3.tgz#34727b5c7953c35fc6e1ae4f770ad3a2025f8e03" - integrity sha512-T1l7y/bCeL/kUwh9OD4PQT4aM7Bq43vX05htPJJ46RTI4r5KNt6qJRzAfNfM+OYMNEVBWQzR2Gyk+FXLZfogGw== - -"@rollup/rollup-win32-x64-msvc@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.14.3.tgz#5b2fb4d8cd44c05deef8a7b0e6deb9ccb8939d18" - integrity sha512-/BypzV0H1y1HzgYpxqRaXGBRqfodgoBBCcsrujT6QRcakDQdfU+Lq9PENPh5jB4I44YWq+0C2eHsHya+nZY1sA== +"@rollup/rollup-android-arm-eabi@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.17.2.tgz#1a32112822660ee104c5dd3a7c595e26100d4c2d" + integrity sha512-NM0jFxY8bB8QLkoKxIQeObCaDlJKewVlIEkuyYKm5An1tdVZ966w2+MPQ2l8LBZLjR+SgyV+nRkTIunzOYBMLQ== + +"@rollup/rollup-android-arm64@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.17.2.tgz#5aeef206d65ff4db423f3a93f71af91b28662c5b" + integrity sha512-yeX/Usk7daNIVwkq2uGoq2BYJKZY1JfyLTaHO/jaiSwi/lsf8fTFoQW/n6IdAsx5tx+iotu2zCJwz8MxI6D/Bw== + +"@rollup/rollup-darwin-arm64@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.17.2.tgz#6b66aaf003c70454c292cd5f0236ebdc6ffbdf1a" + integrity sha512-kcMLpE6uCwls023+kknm71ug7MZOrtXo+y5p/tsg6jltpDtgQY1Eq5sGfHcQfb+lfuKwhBmEURDga9N0ol4YPw== + +"@rollup/rollup-darwin-x64@4.17.2": + version "4.17.2" + resolved 
"https://registry.yarnpkg.com/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.17.2.tgz#f64fc51ed12b19f883131ccbcea59fc68cbd6c0b" + integrity sha512-AtKwD0VEx0zWkL0ZjixEkp5tbNLzX+FCqGG1SvOu993HnSz4qDI6S4kGzubrEJAljpVkhRSlg5bzpV//E6ysTQ== + +"@rollup/rollup-linux-arm-gnueabihf@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.17.2.tgz#1a7641111be67c10111f7122d1e375d1226cbf14" + integrity sha512-3reX2fUHqN7sffBNqmEyMQVj/CKhIHZd4y631duy0hZqI8Qoqf6lTtmAKvJFYa6bhU95B1D0WgzHkmTg33In0A== + +"@rollup/rollup-linux-arm-musleabihf@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.17.2.tgz#c93fd632923e0fee25aacd2ae414288d0b7455bb" + integrity sha512-uSqpsp91mheRgw96xtyAGP9FW5ChctTFEoXP0r5FAzj/3ZRv3Uxjtc7taRQSaQM/q85KEKjKsZuiZM3GyUivRg== + +"@rollup/rollup-linux-arm64-gnu@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.17.2.tgz#fa531425dd21d058a630947527b4612d9d0b4a4a" + integrity sha512-EMMPHkiCRtE8Wdk3Qhtciq6BndLtstqZIroHiiGzB3C5LDJmIZcSzVtLRbwuXuUft1Cnv+9fxuDtDxz3k3EW2A== + +"@rollup/rollup-linux-arm64-musl@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.17.2.tgz#8acc16f095ceea5854caf7b07e73f7d1802ac5af" + integrity sha512-NMPylUUZ1i0z/xJUIx6VUhISZDRT+uTWpBcjdv0/zkp7b/bQDF+NfnfdzuTiB1G6HTodgoFa93hp0O1xl+/UbA== + +"@rollup/rollup-linux-powerpc64le-gnu@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.17.2.tgz#94e69a8499b5cf368911b83a44bb230782aeb571" + integrity sha512-T19My13y8uYXPw/L/k0JYaX1fJKFT/PWdXiHr8mTbXWxjVF1t+8Xl31DgBBvEKclw+1b00Chg0hxE2O7bTG7GQ== + +"@rollup/rollup-linux-riscv64-gnu@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.17.2.tgz#7ef1c781c7e59e85a6ce261cc95d7f1e0b56db0f" + integrity sha512-BOaNfthf3X3fOWAB+IJ9kxTgPmMqPPH5f5k2DcCsRrBIbWnaJCgX2ll77dV1TdSy9SaXTR5iDXRL8n7AnoP5cg== + +"@rollup/rollup-linux-s390x-gnu@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.17.2.tgz#f15775841c3232fca9b78cd25a7a0512c694b354" + integrity sha512-W0UP/x7bnn3xN2eYMql2T/+wpASLE5SjObXILTMPUBDB/Fg/FxC+gX4nvCfPBCbNhz51C+HcqQp2qQ4u25ok6g== + +"@rollup/rollup-linux-x64-gnu@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.17.2.tgz#b521d271798d037ad70c9f85dd97d25f8a52e811" + integrity sha512-Hy7pLwByUOuyaFC6mAr7m+oMC+V7qyifzs/nW2OJfC8H4hbCzOX07Ov0VFk/zP3kBsELWNFi7rJtgbKYsav9QQ== + +"@rollup/rollup-linux-x64-musl@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.17.2.tgz#9254019cc4baac35800991315d133cc9fd1bf385" + integrity sha512-h1+yTWeYbRdAyJ/jMiVw0l6fOOm/0D1vNLui9iPuqgRGnXA0u21gAqOyB5iHjlM9MMfNOm9RHCQ7zLIzT0x11Q== + +"@rollup/rollup-win32-arm64-msvc@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.17.2.tgz#27f65a89f6f52ee9426ec11e3571038e4671790f" + integrity sha512-tmdtXMfKAjy5+IQsVtDiCfqbynAQE/TQRpWdVataHmhMb9DCoJxp9vLcCBjEQWMiUYxO1QprH/HbY9ragCEFLA== + +"@rollup/rollup-win32-ia32-msvc@4.17.2": + version 
"4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.17.2.tgz#a2fbf8246ed0bb014f078ca34ae6b377a90cb411" + integrity sha512-7II/QCSTAHuE5vdZaQEwJq2ZACkBpQDOmQsE6D6XUbnBHW8IAhm4eTufL6msLJorzrHDFv3CF8oCA/hSIRuZeQ== + +"@rollup/rollup-win32-x64-msvc@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.17.2.tgz#5a2d08b81e8064b34242d5cc9973ef8dd1e60503" + integrity sha512-TGGO7v7qOq4CYmSBVEYpI1Y5xDuCEnbVC5Vth8mOsW0gDSzxNrVERPc790IGHsrT2dQSimgMr9Ub3Y1Jci5/8w== "@rollup/stream@3.0.1": version "3.0.1" @@ -6191,29 +6191,29 @@ rimraf@^3.0.2: dependencies: glob "^7.1.3" -rollup@4.14.3: - version "4.14.3" - resolved "https://registry.yarnpkg.com/rollup/-/rollup-4.14.3.tgz#bcbb7784b35826d3164346fa6d5aac95190d8ba9" - integrity sha512-ag5tTQKYsj1bhrFC9+OEWqb5O6VYgtQDO9hPDBMmIbePwhfSr+ExlcU741t8Dhw5DkPCQf6noz0jb36D6W9/hw== +rollup@4.17.2: + version "4.17.2" + resolved "https://registry.yarnpkg.com/rollup/-/rollup-4.17.2.tgz#26d1785d0144122277fdb20ab3a24729ae68301f" + integrity sha512-/9ClTJPByC0U4zNLowV1tMBe8yMEAxewtR3cUNX5BoEpGH3dQEWpJLr6CLp0fPdYRF/fzVOgvDb1zXuakwF5kQ== dependencies: "@types/estree" "1.0.5" optionalDependencies: - "@rollup/rollup-android-arm-eabi" "4.14.3" - "@rollup/rollup-android-arm64" "4.14.3" - "@rollup/rollup-darwin-arm64" "4.14.3" - "@rollup/rollup-darwin-x64" "4.14.3" - "@rollup/rollup-linux-arm-gnueabihf" "4.14.3" - "@rollup/rollup-linux-arm-musleabihf" "4.14.3" - "@rollup/rollup-linux-arm64-gnu" "4.14.3" - "@rollup/rollup-linux-arm64-musl" "4.14.3" - "@rollup/rollup-linux-powerpc64le-gnu" "4.14.3" - "@rollup/rollup-linux-riscv64-gnu" "4.14.3" - "@rollup/rollup-linux-s390x-gnu" "4.14.3" - "@rollup/rollup-linux-x64-gnu" "4.14.3" - "@rollup/rollup-linux-x64-musl" "4.14.3" - "@rollup/rollup-win32-arm64-msvc" "4.14.3" - "@rollup/rollup-win32-ia32-msvc" "4.14.3" - "@rollup/rollup-win32-x64-msvc" "4.14.3" + "@rollup/rollup-android-arm-eabi" "4.17.2" + "@rollup/rollup-android-arm64" "4.17.2" + "@rollup/rollup-darwin-arm64" "4.17.2" + "@rollup/rollup-darwin-x64" "4.17.2" + "@rollup/rollup-linux-arm-gnueabihf" "4.17.2" + "@rollup/rollup-linux-arm-musleabihf" "4.17.2" + "@rollup/rollup-linux-arm64-gnu" "4.17.2" + "@rollup/rollup-linux-arm64-musl" "4.17.2" + "@rollup/rollup-linux-powerpc64le-gnu" "4.17.2" + "@rollup/rollup-linux-riscv64-gnu" "4.17.2" + "@rollup/rollup-linux-s390x-gnu" "4.17.2" + "@rollup/rollup-linux-x64-gnu" "4.17.2" + "@rollup/rollup-linux-x64-musl" "4.17.2" + "@rollup/rollup-win32-arm64-msvc" "4.17.2" + "@rollup/rollup-win32-ia32-msvc" "4.17.2" + "@rollup/rollup-win32-x64-msvc" "4.17.2" fsevents "~2.3.2" run-parallel@^1.1.9: From 14c54bbfb7d9305e79a2c2d016c34a655773e5cb Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 1 May 2024 19:56:20 -0400 Subject: [PATCH 024/105] GH-41470: [C++] Reuse deduplication logic for direct registration (#41466) ### Rationale for this change As observed in https://github.com/apache/arrow/pull/41309 a crossbow job on mac is failing due to duplicate registration of a factory for the file:// scheme ### What changes are included in this PR? Deduplication of registered filesystem factories is applied to direct registration as well as when merging registries. ### Are these changes tested? No, we just need to verify that the problematic crossbow job is repaired. ### Are there any user-facing changes? 
No * GitHub Issue: #41470 Lead-authored-by: Benjamin Kietzman Co-authored-by: David Li Signed-off-by: David Li --- cpp/src/arrow/filesystem/filesystem.cc | 4 ++-- cpp/src/arrow/filesystem/localfs_test.cc | 7 ++++--- dev/tasks/java-jars/github.yml | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/filesystem/filesystem.cc b/cpp/src/arrow/filesystem/filesystem.cc index b79af08385c0c..284be685fa800 100644 --- a/cpp/src/arrow/filesystem/filesystem.cc +++ b/cpp/src/arrow/filesystem/filesystem.cc @@ -761,8 +761,8 @@ class FileSystemFactoryRegistry { RETURN_NOT_OK(CheckValid()); auto [it, success] = scheme_to_factory_.emplace( - std::move(scheme), Registered{std::move(factory), std::move(finalizer)}); - if (success) { + std::move(scheme), Registered{factory, std::move(finalizer)}); + if (success || (it->second.ok() && it->second->factory == factory)) { return Status::OK(); } diff --git a/cpp/src/arrow/filesystem/localfs_test.cc b/cpp/src/arrow/filesystem/localfs_test.cc index 1a20e44bc36e2..d68c992dff863 100644 --- a/cpp/src/arrow/filesystem/localfs_test.cc +++ b/cpp/src/arrow/filesystem/localfs_test.cc @@ -154,15 +154,16 @@ TEST(FileSystemFromUri, RuntimeRegisteredFactory) { EXPECT_THAT(FileSystemFromUri("slowfile2:///hey/yo", &path), Raises(StatusCode::Invalid)); - EXPECT_THAT(RegisterFileSystemFactory("slowfile2", {SlowFileSystemFactory, "", 0}), - Ok()); + EXPECT_THAT( + RegisterFileSystemFactory("slowfile2", {SlowFileSystemFactory, __FILE__, __LINE__}), + Ok()); ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFromUri("slowfile2:///hey/yo", &path)); EXPECT_EQ(path, "/hey/yo"); EXPECT_EQ(fs->type_name(), "slow"); EXPECT_THAT( - RegisterFileSystemFactory("slowfile2", {SlowFileSystemFactory, "", 0}), + RegisterFileSystemFactory("slowfile2", {SlowFileSystemFactory, __FILE__, __LINE__}), Raises(StatusCode::KeyError, testing::HasSubstr("Attempted to register factory for scheme 'slowfile2' " "but that scheme is already registered"))); diff --git a/dev/tasks/java-jars/github.yml b/dev/tasks/java-jars/github.yml index 0437ee7864979..eb9478ebaa6ef 100644 --- a/dev/tasks/java-jars/github.yml +++ b/dev/tasks/java-jars/github.yml @@ -80,7 +80,7 @@ jobs: fail-fast: false matrix: platform: - - { runs_on: ["macos-latest"], arch: "x86_64"} + - { runs_on: ["macos-13"], arch: "x86_64"} - { runs_on: ["macos-14"], arch: "aarch_64" } env: MACOSX_DEPLOYMENT_TARGET: "10.15" From 3c67091f93223f2d12f5a73d3e5bc51e7b389a00 Mon Sep 17 00:00:00 2001 From: Thomas A Caswell Date: Thu, 2 May 2024 08:18:21 -0400 Subject: [PATCH 025/105] GH-41491: [Python] remove special methods related to buffers in python <2.6 (#41492) ### Rationale for this change These methods are not actually used and will be removed from Cython in an upcoming release. Closes #41491 ### What changes are included in this PR? ### Are these changes tested? Trust CI ### Are there any user-facing changes? No, this code should never be actually used. 
* GitHub Issue: #41491 Authored-by: Thomas A Caswell Signed-off-by: Joris Van den Bossche --- python/pyarrow/io.pxi | 47 ++++++++++++------------------------------- 1 file changed, 13 insertions(+), 34 deletions(-) diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 7890bf4b2dd76..9e8026deb435c 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1446,27 +1446,6 @@ cdef class Buffer(_Weakrefable): buffer.strides = self.strides buffer.suboffsets = NULL - def __getsegcount__(self, Py_ssize_t *len_out): - if len_out != NULL: - len_out[0] = self.size - return 1 - - def __getreadbuffer__(self, Py_ssize_t idx, void **p): - if idx != 0: - raise SystemError("accessing nonexistent buffer segment") - if p != NULL: - p[0] = self.buffer.get().data() - return self.size - - def __getwritebuffer__(self, Py_ssize_t idx, void **p): - if not self.buffer.get().is_mutable(): - raise SystemError("trying to write an immutable buffer") - if idx != 0: - raise SystemError("accessing nonexistent buffer segment") - if p != NULL: - p[0] = self.buffer.get().data() - return self.size - cdef class ResizableBuffer(Buffer): """ @@ -2142,21 +2121,21 @@ cdef class CacheOptions(_Weakrefable): Parameters ---------- hole_size_limit : int, default 8KiB - The maximum distance in bytes between two consecutive ranges; beyond + The maximum distance in bytes between two consecutive ranges; beyond this value, ranges are not combined. range_size_limit : int, default 32MiB - The maximum size in bytes of a combined range; if combining two - consecutive ranges would produce a range of a size greater than this, + The maximum size in bytes of a combined range; if combining two + consecutive ranges would produce a range of a size greater than this, they are not combined lazy : bool, default True lazy = false: request all byte ranges when PreBuffer or WillNeed is called. - lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader - needs them. - lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the + lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader + needs them. + lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the range that is currently being read. prefetch_limit : int, default 0 - The maximum number of ranges to be prefetched. This is only used for - lazy cache to asynchronously read some ranges after reading the target + The maximum number of ranges to be prefetched. This is only used for + lazy cache to asynchronously read some ranges after reading the target range. """ @@ -2227,19 +2206,19 @@ cdef class CacheOptions(_Weakrefable): """ Create suiteable CacheOptions based on provided network metrics. - Typically this will be used with object storage solutions like Amazon S3, + Typically this will be used with object storage solutions like Amazon S3, Google Cloud Storage and Azure Blob Storage. Parameters ---------- time_to_first_byte_millis : int - Seek-time or Time-To-First-Byte (TTFB) in milliseconds, also called call - setup latency of a new read request. The value is a positive integer. + Seek-time or Time-To-First-Byte (TTFB) in milliseconds, also called call + setup latency of a new read request. The value is a positive integer. transfer_bandwidth_mib_per_sec : int - Data transfer Bandwidth (BW) in MiB/sec (per connection). The value is a positive + Data transfer Bandwidth (BW) in MiB/sec (per connection). The value is a positive integer. 
ideal_bandwidth_utilization_frac : int, default 0.9 - Transfer bandwidth utilization fraction (per connection) to maximize the net + Transfer bandwidth utilization fraction (per connection) to maximize the net data load. The value is a positive float less than 1. max_ideal_request_size_mib : int, default 64 The maximum single data request size (in MiB) to maximize the net data load. From 49bf3d9bf2ca266fcf63dca1e57bdb83c9559b72 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 3 May 2024 06:10:20 +0900 Subject: [PATCH 026/105] GH-41467: [CI][Release] Don't push conda-verify-rc image (#41468) ### Rationale for this change Because it uses ubuntu:20.04 image directly. We don't build our image for it. ### What changes are included in this PR? Don't push an image for `conda-verify-rc`. ### Are these changes tested? No. ### Are there any user-facing changes? No. * GitHub Issue: #41467 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- dev/tasks/verify-rc/github.linux.amd64.docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/tasks/verify-rc/github.linux.amd64.docker.yml b/dev/tasks/verify-rc/github.linux.amd64.docker.yml index 65b30b5c8d4df..7a28ba705dd50 100644 --- a/dev/tasks/verify-rc/github.linux.amd64.docker.yml +++ b/dev/tasks/verify-rc/github.linux.amd64.docker.yml @@ -43,7 +43,7 @@ jobs: -e TEST_{{ target|upper }}=1 \ {{ distro }}-verify-rc - {% if arrow.is_default_branch() %} + {% if arrow.is_default_branch() and distro != "conda" %} {{ macros.github_login_dockerhub()|indent }} - name: Push Docker Image shell: bash From 71e38fc8a9fdf102a5136793b738d7650ca053fa Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 06:50:42 +0900 Subject: [PATCH 027/105] MINOR: [JS] Bump @typescript-eslint/eslint-plugin from 7.7.0 to 7.8.0 in /js (#41485) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [@ typescript-eslint/eslint-plugin](https://github.com/typescript-eslint/typescript-eslint/tree/HEAD/packages/eslint-plugin) from 7.7.0 to 7.8.0.
Release notes

Sourced from @​typescript-eslint/eslint-plugin's releases.

v7.8.0

7.8.0 (2024-04-29)

🚀 Features

  • rule-tester: assert suggestion messages are unique (#8995)
  • typescript-estree: add maximumDefaultProjectFileMatchCount and wide allowDefaultProjectForFiles glob restrictions (#8925)

🩹 Fixes

  • eslint-plugin: [no-unsafe-argument] handle tagged templates (#8746)
  • eslint-plugin: [prefer-optional-chain] suggests optional chaining during strict null equality check (#8717)
  • eslint-plugin: [consistent-type-assertions] handle tagged templates (#8993)
  • eslint-plugin: [no-unsafe-return] handle union types (#9001)
  • eslint-plugin: [no-unused-vars] clear error report range (#8640)
  • utils: export ESLint backwards-compat functions (#8976)

❤️ Thank You

You can read about our versioning strategy and releases on our website.

v7.7.1

7.7.1 (2024-04-22)

🩹 Fixes

  • eslint-plugin: [no-unsafe-assignment] handle shorthand property assignment (#8800)
  • eslint-plugin: [explicit-function-return-type] fix checking wrong ancestor's return type (#8809)
  • eslint-plugin: [prefer-optional-chain] only look at left operand for requireNullish (#8559)
  • eslint-plugin: [no-for-in-array] refine report location (#8874)
  • eslint-plugin: [no-unnecessary-type-assertion] allow non-null assertion for void type (#8912)

❤️ Thank You

You can read about our versioning strategy and releases on our website.

Changelog

Sourced from @​typescript-eslint/eslint-plugin's changelog.

7.8.0 (2024-04-29)

🩹 Fixes

  • eslint-plugin: [no-unsafe-argument] handle tagged templates

  • eslint-plugin: [prefer-optional-chain] suggests optional chaining during strict null equality check

  • eslint-plugin: [consistent-type-assertions] handle tagged templates

  • eslint-plugin: [no-unsafe-return] handle union types

  • eslint-plugin: [no-unused-vars] clear error report range

❤️ Thank You

  • auvred
  • Josh Goldberg ✨
  • jsfm01
  • Kim Sang Du
  • YeonJuan

You can read about our versioning strategy and releases on our website.

7.7.1 (2024-04-22)

🩹 Fixes

  • eslint-plugin: [no-unsafe-assignment] handle shorthand property assignment

  • eslint-plugin: [explicit-function-return-type] fix checking wrong ancestor's return type

  • eslint-plugin: [prefer-optional-chain] only look at left operand for requireNullish

  • eslint-plugin: [no-for-in-array] refine report location

  • eslint-plugin: [no-unnecessary-type-assertion] allow non-null assertion for void type

❤️ Thank You

  • Abraham Guo
  • Kirk Waiblinger
  • YeonJuan

You can read about our versioning strategy and releases on our website.

Commits
  • ee677f6 chore(release): publish 7.8.0
  • 8127873 fix(eslint-plugin): [no-unused-vars] clear error report range (#8640)
  • 216d1b0 fix(eslint-plugin): [no-unsafe-return] handle union types (#9001)
  • 51d2193 fix(eslint-plugin): [consistent-type-assertions] handle tagged templates (#8993)
  • 4bed24d fix(eslint-plugin): [prefer-optional-chain] suggests optional chaining during...
  • b0f7aa4 fix(eslint-plugin): [no-unsafe-argument] handle tagged templates (#8746)
  • 219b841 chore: resolve lint issues on main branch (#8966)
  • 3e19436 chore(release): publish 7.7.1
  • b2552ca fix(eslint-plugin): [no-unnecessary-type-assertion] allow non-null assertion ...
  • fdeba42 fix(eslint-plugin): [no-for-in-array] refine report location (#8874)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=@ typescript-eslint/eslint-plugin&package-manager=npm_and_yarn&previous-version=7.7.0&new-version=7.8.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- js/package.json | 2 +- js/yarn.lock | 77 +++++++++++++++++++++++++++++++++++-------------- 2 files changed, 57 insertions(+), 22 deletions(-) diff --git a/js/package.json b/js/package.json index 8cab229521f79..e9590a188820f 100644 --- a/js/package.json +++ b/js/package.json @@ -72,7 +72,7 @@ "@types/glob": "8.1.0", "@types/jest": "29.5.12", "@types/multistream": "4.1.3", - "@typescript-eslint/eslint-plugin": "7.7.0", + "@typescript-eslint/eslint-plugin": "7.8.0", "@typescript-eslint/parser": "7.7.0", "async-done": "2.0.0", "benny": "3.7.1", diff --git a/js/yarn.lock b/js/yarn.lock index fe483ab8aca36..ab092675b4806 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -1436,16 +1436,16 @@ dependencies: "@types/yargs-parser" "*" -"@typescript-eslint/eslint-plugin@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/eslint-plugin/-/eslint-plugin-7.7.0.tgz#bf34a02f221811505b8bf2f31060c8560c1bb0a3" - integrity sha512-GJWR0YnfrKnsRoluVO3PRb9r5aMZriiMMM/RHj5nnTrBy1/wIgk76XCtCKcnXGjpZQJQRFtGV9/0JJ6n30uwpQ== +"@typescript-eslint/eslint-plugin@7.8.0": + version "7.8.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/eslint-plugin/-/eslint-plugin-7.8.0.tgz#c78e309fe967cb4de05b85cdc876fb95f8e01b6f" + integrity sha512-gFTT+ezJmkwutUPmB0skOj3GZJtlEGnlssems4AjkVweUPGj7jRwwqg0Hhg7++kPGJqKtTYx+R05Ftww372aIg== dependencies: "@eslint-community/regexpp" "^4.10.0" - "@typescript-eslint/scope-manager" "7.7.0" - "@typescript-eslint/type-utils" "7.7.0" - "@typescript-eslint/utils" "7.7.0" - "@typescript-eslint/visitor-keys" "7.7.0" + "@typescript-eslint/scope-manager" "7.8.0" + "@typescript-eslint/type-utils" "7.8.0" + "@typescript-eslint/utils" "7.8.0" + "@typescript-eslint/visitor-keys" "7.8.0" debug "^4.3.4" graphemer "^1.4.0" ignore "^5.3.1" @@ -1480,13 +1480,21 @@ "@typescript-eslint/types" "7.7.0" "@typescript-eslint/visitor-keys" "7.7.0" -"@typescript-eslint/type-utils@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/type-utils/-/type-utils-7.7.0.tgz#36792ff4209a781b058de61631a48df17bdefbc5" - integrity sha512-bOp3ejoRYrhAlnT/bozNQi3nio9tIgv3U5C0mVDdZC7cpcQEDZXvq8inrHYghLVwuNABRqrMW5tzAv88Vy77Sg== +"@typescript-eslint/scope-manager@7.8.0": + version "7.8.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/scope-manager/-/scope-manager-7.8.0.tgz#bb19096d11ec6b87fb6640d921df19b813e02047" + integrity sha512-viEmZ1LmwsGcnr85gIq+FCYI7nO90DVbE37/ll51hjv9aG+YZMb4WDE2fyWpUR4O/UrhGRpYXK/XajcGTk2B8g== dependencies: - "@typescript-eslint/typescript-estree" "7.7.0" - "@typescript-eslint/utils" "7.7.0" + "@typescript-eslint/types" "7.8.0" + "@typescript-eslint/visitor-keys" "7.8.0" + +"@typescript-eslint/type-utils@7.8.0": + version "7.8.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/type-utils/-/type-utils-7.8.0.tgz#9de166f182a6e4d1c5da76e94880e91831e3e26f" + integrity sha512-H70R3AefQDQpz9mGv13Uhi121FNMh+WEaRqcXTX09YEDky21km4dV1ZXJIp8QjXc4ZaVkXVdohvWDzbnbHDS+A== + dependencies: + "@typescript-eslint/typescript-estree" "7.8.0" + "@typescript-eslint/utils" "7.8.0" debug "^4.3.4" ts-api-utils "^1.3.0" @@ -1500,6 +1508,11 @@ resolved "https://registry.yarnpkg.com/@typescript-eslint/types/-/types-7.7.0.tgz#23af4d24bf9ce15d8d301236e3e3014143604f27" integrity sha512-G01YPZ1Bd2hn+KPpIbrAhEWOn5lQBrjxkzHkWvP6NucMXFtfXoevK82hzQdpfuQYuhkvFDeQYbzXCjR1z9Z03w== +"@typescript-eslint/types@7.8.0": + version "7.8.0" + 
resolved "https://registry.yarnpkg.com/@typescript-eslint/types/-/types-7.8.0.tgz#1fd2577b3ad883b769546e2d1ef379f929a7091d" + integrity sha512-wf0peJ+ZGlcH+2ZS23aJbOv+ztjeeP8uQ9GgwMJGVLx/Nj9CJt17GWgWWoSmoRVKAX2X+7fzEnAjxdvK2gqCLw== + "@typescript-eslint/typescript-estree@5.62.0": version "5.62.0" resolved "https://registry.yarnpkg.com/@typescript-eslint/typescript-estree/-/typescript-estree-5.62.0.tgz#7d17794b77fabcac615d6a48fb143330d962eb9b" @@ -1527,17 +1540,31 @@ semver "^7.6.0" ts-api-utils "^1.3.0" -"@typescript-eslint/utils@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/utils/-/utils-7.7.0.tgz#3d2b6606a60ac34f3c625facfb3b3ab7e126f58d" - integrity sha512-LKGAXMPQs8U/zMRFXDZOzmMKgFv3COlxUQ+2NMPhbqgVm6R1w+nU1i4836Pmxu9jZAuIeyySNrN/6Rc657ggig== +"@typescript-eslint/typescript-estree@7.8.0": + version "7.8.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/typescript-estree/-/typescript-estree-7.8.0.tgz#b028a9226860b66e623c1ee55cc2464b95d2987c" + integrity sha512-5pfUCOwK5yjPaJQNy44prjCwtr981dO8Qo9J9PwYXZ0MosgAbfEMB008dJ5sNo3+/BN6ytBPuSvXUg9SAqB0dg== + dependencies: + "@typescript-eslint/types" "7.8.0" + "@typescript-eslint/visitor-keys" "7.8.0" + debug "^4.3.4" + globby "^11.1.0" + is-glob "^4.0.3" + minimatch "^9.0.4" + semver "^7.6.0" + ts-api-utils "^1.3.0" + +"@typescript-eslint/utils@7.8.0": + version "7.8.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/utils/-/utils-7.8.0.tgz#57a79f9c0c0740ead2f622e444cfaeeb9fd047cd" + integrity sha512-L0yFqOCflVqXxiZyXrDr80lnahQfSOfc9ELAAZ75sqicqp2i36kEZZGuUymHNFoYOqxRT05up760b4iGsl02nQ== dependencies: "@eslint-community/eslint-utils" "^4.4.0" "@types/json-schema" "^7.0.15" "@types/semver" "^7.5.8" - "@typescript-eslint/scope-manager" "7.7.0" - "@typescript-eslint/types" "7.7.0" - "@typescript-eslint/typescript-estree" "7.7.0" + "@typescript-eslint/scope-manager" "7.8.0" + "@typescript-eslint/types" "7.8.0" + "@typescript-eslint/typescript-estree" "7.8.0" semver "^7.6.0" "@typescript-eslint/utils@^5.10.0": @@ -1570,6 +1597,14 @@ "@typescript-eslint/types" "7.7.0" eslint-visitor-keys "^3.4.3" +"@typescript-eslint/visitor-keys@7.8.0": + version "7.8.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/visitor-keys/-/visitor-keys-7.8.0.tgz#7285aab991da8bee411a42edbd5db760d22fdd91" + integrity sha512-q4/gibTNBQNA0lGyYQCmWRS5D15n8rXh4QjK3KV+MBPlTYHpfBUT3D3PaPR/HeNiI9W6R7FvlkcGhNyAoP+caA== + dependencies: + "@typescript-eslint/types" "7.8.0" + eslint-visitor-keys "^3.4.3" + "@ungap/structured-clone@^1.2.0": version "1.2.0" resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406" From 9749d7d653e1b106d0662624b22b2982b3ad0516 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 2 May 2024 22:02:00 -0300 Subject: [PATCH 028/105] GH-39798: [C++] Optimize Take for fixed-size types including nested fixed-size lists (#41297) ### Rationale for this change Introduce utilities for dealing with fixed-width types (including fixed-size lists of fixed-width types) generically. And use it for initial optimizations of `Take` and `Filter`. ### What changes are included in this PR? 
- [x] Introduce utilities for dealing with fixed-width types generically - [x] Use faster `Take` kernel on small power-of-2 byte widths of fixed-width types - [x] from `FSLTakeExec` (including FSLs of FSBs) - [x] from `FSBTakeExec` (done before this PR) - [x] ~Take on any fixed-width type~ (as a separate issue #41301) - [x] Use faster `Filter` kernel on both primitive and fixed-width types of any length - [x] from `FSLFilterExec` (including FSLs of FSBs) - [x] from `FSBFilterExec` (done before this PR) ### Are these changes tested? By existing and new tests. ### Are there any user-facing changes? Some functions added to the `arrow::util` namespace and documented inline. * GitHub Issue: #39798 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/CMakeLists.txt | 1 + .../vector_selection_filter_internal.cc | 30 +- .../kernels/vector_selection_internal.cc | 56 +++- .../kernels/vector_selection_internal.h | 7 +- .../kernels/vector_selection_take_internal.cc | 39 ++- .../compute/kernels/vector_selection_test.cc | 156 +++++++-- cpp/src/arrow/util/CMakeLists.txt | 1 + cpp/src/arrow/util/fixed_width_internal.cc | 226 +++++++++++++ cpp/src/arrow/util/fixed_width_internal.h | 307 ++++++++++++++++++ cpp/src/arrow/util/fixed_width_test.cc | 217 +++++++++++++ cpp/src/arrow/util/fixed_width_test_util.h | 203 ++++++++++++ 11 files changed, 1171 insertions(+), 72 deletions(-) create mode 100644 cpp/src/arrow/util/fixed_width_internal.cc create mode 100644 cpp/src/arrow/util/fixed_width_internal.h create mode 100644 cpp/src/arrow/util/fixed_width_test.cc create mode 100644 cpp/src/arrow/util/fixed_width_test_util.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 2ef82dd614f84..5d61112518f5e 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -501,6 +501,7 @@ set(ARROW_UTIL_SRCS util/decimal.cc util/delimiting.cc util/dict_util.cc + util/fixed_width_internal.cc util/float16.cc util/formatting.cc util/future.cc diff --git a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc index 8825d697fdf77..d5e5e5ad289ac 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc @@ -40,6 +40,7 @@ #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/fixed_width_internal.h" namespace arrow { @@ -158,9 +159,11 @@ class PrimitiveFilterImpl { PrimitiveFilterImpl(const ArraySpan& values, const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection, ArrayData* out_arr) - : byte_width_(values.type->byte_width()), + : byte_width_(util::FixedWidthInBytes(*values.type)), values_is_valid_(values.buffers[0].data), - values_data_(values.buffers[1].data), + // No offset applied for boolean because it's a bitmap + values_data_(kIsBoolean ? 
values.buffers[1].data + : util::OffsetPointerOfFixedWidthValues(values)), values_null_count_(values.null_count), values_offset_(values.offset), values_length_(values.length), @@ -169,17 +172,13 @@ class PrimitiveFilterImpl { if constexpr (kByteWidth >= 0 && !kIsBoolean) { DCHECK_EQ(kByteWidth, byte_width_); } - if constexpr (!kIsBoolean) { - // No offset applied for boolean because it's a bitmap - values_data_ += values.offset * byte_width(); - } + DCHECK_EQ(out_arr->offset, 0); if (out_arr->buffers[0] != nullptr) { // May be unallocated if neither filter nor values contain nulls out_is_valid_ = out_arr->buffers[0]->mutable_data(); } - out_data_ = out_arr->buffers[1]->mutable_data(); - DCHECK_EQ(out_arr->offset, 0); + out_data_ = util::MutableFixedWidthValuesPointer(out_arr); out_length_ = out_arr->length; out_position_ = 0; } @@ -416,7 +415,7 @@ class PrimitiveFilterImpl { out_position_ += length; } - constexpr int32_t byte_width() const { + constexpr int64_t byte_width() const { if constexpr (kByteWidth >= 0) { return kByteWidth; } else { @@ -425,7 +424,7 @@ class PrimitiveFilterImpl { } private: - int32_t byte_width_; + int64_t byte_width_; const uint8_t* values_is_valid_; const uint8_t* values_data_; int64_t values_null_count_; @@ -439,6 +438,8 @@ class PrimitiveFilterImpl { int64_t out_position_; }; +} // namespace + Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const ArraySpan& values = batch[0].array; const ArraySpan& filter = batch[1].array; @@ -468,9 +469,10 @@ Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult // validity bitmap. const bool allocate_validity = values.null_count != 0 || !filter_null_count_is_zero; - const int bit_width = values.type->bit_width(); - RETURN_NOT_OK(PreallocatePrimitiveArrayData(ctx, output_length, bit_width, - allocate_validity, out_arr)); + DCHECK(util::IsFixedWidthLike(values, /*force_null_count=*/false)); + const int64_t bit_width = util::FixedWidthInBits(*values.type); + RETURN_NOT_OK(util::internal::PreallocateFixedWidthArrayData( + ctx, output_length, /*source=*/values, allocate_validity, out_arr)); switch (bit_width) { case 1: @@ -505,6 +507,8 @@ Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult return Status::OK(); } +namespace { + // ---------------------------------------------------------------------- // Optimized filter for base binary types (32-bit and 64-bit) diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc index a0fe2808e3e4e..93cd5060348db 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc @@ -37,6 +37,7 @@ #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" +#include "arrow/util/fixed_width_internal.h" #include "arrow/util/int_util.h" #include "arrow/util/logging.h" #include "arrow/util/ree_util.h" @@ -65,24 +66,6 @@ void RegisterSelectionFunction(const std::string& name, FunctionDoc doc, DCHECK_OK(registry->AddFunction(std::move(func))); } -Status PreallocatePrimitiveArrayData(KernelContext* ctx, int64_t length, int bit_width, - bool allocate_validity, ArrayData* out) { - // Preallocate memory - out->length = length; - out->buffers.resize(2); - - if (allocate_validity) { - ARROW_ASSIGN_OR_RAISE(out->buffers[0], ctx->AllocateBitmap(length)); - } - if (bit_width == 1) { - 
ARROW_ASSIGN_OR_RAISE(out->buffers[1], ctx->AllocateBitmap(length)); - } else { - ARROW_ASSIGN_OR_RAISE(out->buffers[1], - ctx->Allocate(bit_util::BytesForBits(length * bit_width))); - } - return Status::OK(); -} - namespace { /// \brief Iterate over a REE filter, emitting ranges of a plain values array that @@ -909,6 +892,20 @@ Status LargeListFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult } Status FSLFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + const ArraySpan& values = batch[0].array; + + // If a FixedSizeList wraps a fixed-width type we can, in some cases, use + // PrimitiveFilterExec for a fixed-size list array. + if (util::IsFixedWidthLike(values, + /*force_null_count=*/true, + /*exclude_dictionary=*/true)) { + const auto byte_width = util::FixedWidthInBytes(*values.type); + // 0 is a valid byte width for FixedSizeList, but PrimitiveFilterExec + // might not handle it correctly. + if (byte_width > 0) { + return PrimitiveFilterExec(ctx, batch, out); + } + } return FilterExec(ctx, batch, out); } @@ -968,6 +965,29 @@ Status LargeListTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* } Status FSLTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + const ArraySpan& values = batch[0].array; + + // If a FixedSizeList wraps a fixed-width type we can, in some cases, use + // PrimitiveTakeExec for a fixed-size list array. + if (util::IsFixedWidthLike(values, + /*force_null_count=*/true, + /*exclude_dictionary=*/true)) { + const auto byte_width = util::FixedWidthInBytes(*values.type); + // Additionally, PrimitiveTakeExec is only implemented for specific byte widths. + // TODO(GH-41301): Extend PrimitiveTakeExec for any fixed-width type. + switch (byte_width) { + case 1: + case 2: + case 4: + case 8: + case 16: + case 32: + return PrimitiveTakeExec(ctx, batch, out); + default: + break; // fallback to TakeExec + } + } + return TakeExec(ctx, batch, out); } diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.h b/cpp/src/arrow/compute/kernels/vector_selection_internal.h index 95f3e51cd67e3..a169f4b38a2b8 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_internal.h +++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.h @@ -45,12 +45,6 @@ void RegisterSelectionFunction(const std::string& name, FunctionDoc doc, const FunctionOptions* default_options, FunctionRegistry* registry); -/// \brief Allocate an ArrayData for a primitive array with a given length and bit width -/// -/// \param[in] bit_width 1 or a multiple of 8 -Status PreallocatePrimitiveArrayData(KernelContext* ctx, int64_t length, int bit_width, - bool allocate_validity, ArrayData* out); - /// \brief Callback type for VisitPlainxREEFilterOutputSegments. /// /// position is the logical position in the values array relative to its offset. 
@@ -70,6 +64,7 @@ void VisitPlainxREEFilterOutputSegments( FilterOptions::NullSelectionBehavior null_selection, const EmitREEFilterSegment& emit_segment); +Status PrimitiveFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status ListFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status LargeListFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status FSLFilterExec(KernelContext*, const ExecSpan&, ExecResult*); diff --git a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc index 5cd3710828485..48a2de9936cd4 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc @@ -37,6 +37,7 @@ #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" +#include "arrow/util/fixed_width_internal.h" #include "arrow/util/int_util.h" #include "arrow/util/ree_util.h" @@ -323,7 +324,7 @@ namespace { using TakeState = OptionsWrapper; // ---------------------------------------------------------------------- -// Implement optimized take for primitive types from boolean to 1/2/4/8-byte +// Implement optimized take for primitive types from boolean to 1/2/4/8/16/32-byte // C-type based types. Use common implementation for every byte width and only // generate code for unsigned integer indices, since after boundschecking to // check for negative numbers in the indices we can safely reinterpret_cast @@ -333,16 +334,20 @@ using TakeState = OptionsWrapper; /// use the logical Arrow type but rather the physical C type. This way we /// only generate one take function for each byte width. /// -/// This function assumes that the indices have been boundschecked. +/// Also note that this function can also handle fixed-size-list arrays if +/// they fit the criteria described in fixed_width_internal.h, so use the +/// function defined in that file to access values and destination pointers +/// and DO NOT ASSUME `values.type()` is a primitive type. +/// +/// \pre the indices have been boundschecked template struct PrimitiveTakeImpl { static constexpr int kValueWidth = ValueWidthConstant::value; static void Exec(const ArraySpan& values, const ArraySpan& indices, ArrayData* out_arr) { - DCHECK_EQ(values.type->byte_width(), kValueWidth); - const auto* values_data = - values.GetValues(1, 0) + kValueWidth * values.offset; + DCHECK_EQ(util::FixedWidthInBytes(*values.type), kValueWidth); + const auto* values_data = util::OffsetPointerOfFixedWidthValues(values); const uint8_t* values_is_valid = values.buffers[0].data; auto values_offset = values.offset; @@ -350,16 +355,15 @@ struct PrimitiveTakeImpl { const uint8_t* indices_is_valid = indices.buffers[0].data; auto indices_offset = indices.offset; - auto out = out_arr->GetMutableValues(1, 0) + kValueWidth * out_arr->offset; + DCHECK_EQ(out_arr->offset, 0); + auto* out = util::MutableFixedWidthValuesPointer(out_arr); auto out_is_valid = out_arr->buffers[0]->mutable_data(); - auto out_offset = out_arr->offset; - DCHECK_EQ(out_offset, 0); // If either the values or indices have nulls, we preemptively zero out the // out validity bitmap so that we don't have to use ClearBit in each // iteration for nulls. 
if (values.null_count != 0 || indices.null_count != 0) { - bit_util::SetBitsTo(out_is_valid, out_offset, indices.length, false); + bit_util::SetBitsTo(out_is_valid, 0, indices.length, false); } auto WriteValue = [&](int64_t position) { @@ -386,7 +390,7 @@ struct PrimitiveTakeImpl { valid_count += block.popcount; if (block.popcount == block.length) { // Fastest path: neither values nor index nulls - bit_util::SetBitsTo(out_is_valid, out_offset + position, block.length, true); + bit_util::SetBitsTo(out_is_valid, position, block.length, true); for (int64_t i = 0; i < block.length; ++i) { WriteValue(position); ++position; @@ -396,7 +400,7 @@ struct PrimitiveTakeImpl { for (int64_t i = 0; i < block.length; ++i) { if (bit_util::GetBit(indices_is_valid, indices_offset + position)) { // index is not null - bit_util::SetBit(out_is_valid, out_offset + position); + bit_util::SetBit(out_is_valid, position); WriteValue(position); } else { WriteZero(position); @@ -416,7 +420,7 @@ struct PrimitiveTakeImpl { values_offset + indices_data[position])) { // value is not null WriteValue(position); - bit_util::SetBit(out_is_valid, out_offset + position); + bit_util::SetBit(out_is_valid, position); ++valid_count; } else { WriteZero(position); @@ -433,7 +437,7 @@ struct PrimitiveTakeImpl { values_offset + indices_data[position])) { // index is not null && value is not null WriteValue(position); - bit_util::SetBit(out_is_valid, out_offset + position); + bit_util::SetBit(out_is_valid, position); ++valid_count; } else { WriteZero(position); @@ -584,14 +588,17 @@ Status PrimitiveTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ArrayData* out_arr = out->array_data().get(); - const int bit_width = values.type->bit_width(); + DCHECK(util::IsFixedWidthLike(values, /*force_null_count=*/false, + /*exclude_dictionary=*/true)); + const int64_t bit_width = util::FixedWidthInBits(*values.type); // TODO: When neither values nor indices contain nulls, we can skip // allocating the validity bitmap altogether and save time and space. A // streamlined PrimitiveTakeImpl would need to be written that skips all // interactions with the output validity bitmap, though. 
- RETURN_NOT_OK(PreallocatePrimitiveArrayData(ctx, indices.length, bit_width, - /*allocate_validity=*/true, out_arr)); + RETURN_NOT_OK(util::internal::PreallocateFixedWidthArrayData( + ctx, indices.length, /*source=*/values, + /*allocate_validity=*/true, out_arr)); switch (bit_width) { case 1: TakeIndexDispatch(values, indices, out_arr); diff --git a/cpp/src/arrow/compute/kernels/vector_selection_test.cc b/cpp/src/arrow/compute/kernels/vector_selection_test.cc index ec94b328ea361..4c7d85b103f36 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_test.cc @@ -23,6 +23,7 @@ #include #include +#include "arrow/array/builder_nested.h" #include "arrow/array/concatenate.h" #include "arrow/chunked_array.h" #include "arrow/compute/api.h" @@ -32,6 +33,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/testing/util.h" +#include "arrow/util/fixed_width_test_util.h" #include "arrow/util/logging.h" namespace arrow { @@ -726,7 +728,37 @@ TEST_F(TestFilterKernelWithLargeList, FilterListInt32) { "[[1,2], null, null]"); } -class TestFilterKernelWithFixedSizeList : public TestFilterKernel {}; +class TestFilterKernelWithFixedSizeList : public TestFilterKernel { + protected: + std::vector> five_length_filters_ = { + ArrayFromJSON(boolean(), "[false, false, false, false, false]"), + ArrayFromJSON(boolean(), "[true, true, true, true, true]"), + ArrayFromJSON(boolean(), "[false, true, true, false, true]"), + ArrayFromJSON(boolean(), "[null, true, null, false, true]"), + }; + + void AssertFilterOnNestedLists(const std::shared_ptr& inner_type, + const std::vector& list_sizes) { + using NLG = ::arrow::util::internal::NestedListGenerator; + constexpr int64_t kLength = 5; + // Create two equivalent lists: one as a FixedSizeList and another as a List. + ASSERT_OK_AND_ASSIGN(auto fsl_list, + NLG::NestedFSLArray(inner_type, list_sizes, kLength)); + ASSERT_OK_AND_ASSIGN(auto list, + NLG::NestedListArray(inner_type, list_sizes, kLength)); + + ARROW_SCOPED_TRACE("CheckTakeOnNestedLists of type `", *fsl_list->type(), "`"); + + for (auto& filter : five_length_filters_) { + // Use the Filter on ListType as the reference implementation. 
+ ASSERT_OK_AND_ASSIGN(auto expected_list, + Filter(*list, *filter, /*options=*/emit_null_)); + ASSERT_OK_AND_ASSIGN(auto expected_fsl, Cast(expected_list, fsl_list->type())); + auto expected_fsl_array = expected_fsl.make_array(); + this->AssertFilter(fsl_list, filter, expected_fsl_array); + } + } +}; TEST_F(TestFilterKernelWithFixedSizeList, FilterFixedSizeListInt32) { std::string list_json = "[null, [1, null, 3], [4, 5, 6], [7, 8, null]]"; @@ -740,6 +772,33 @@ TEST_F(TestFilterKernelWithFixedSizeList, FilterFixedSizeListInt32) { "[[1, null, 3], [7, 8, null]]"); } +TEST_F(TestFilterKernelWithFixedSizeList, FilterFixedSizeListVarWidth) { + std::string list_json = + R"([["zero", "one", ""], ["two", "", "three"], ["four", "five", "six"], ["seven", "eight", ""]])"; + this->AssertFilter(fixed_size_list(utf8(), 3), list_json, "[0, 0, 0, 0]", "[]"); + this->AssertFilter(fixed_size_list(utf8(), 3), list_json, "[0, 1, 1, null]", + R"([["two", "", "three"], ["four", "five", "six"], null])"); + this->AssertFilter(fixed_size_list(utf8(), 3), list_json, "[0, 0, 1, null]", + R"([["four", "five", "six"], null])"); + this->AssertFilter(fixed_size_list(utf8(), 3), list_json, "[1, 1, 1, 1]", list_json); + this->AssertFilter(fixed_size_list(utf8(), 3), list_json, "[0, 1, 0, 1]", + R"([["two", "", "three"], ["seven", "eight", ""]])"); +} + +TEST_F(TestFilterKernelWithFixedSizeList, FilterFixedSizeListModuloNesting) { + using NLG = ::arrow::util::internal::NestedListGenerator; + const std::vector> value_types = { + int16(), + int32(), + int64(), + }; + NLG::VisitAllNestedListConfigurations( + value_types, [this](const std::shared_ptr& inner_type, + const std::vector& list_sizes) { + this->AssertFilterOnNestedLists(inner_type, list_sizes); + }); +} + class TestFilterKernelWithMap : public TestFilterKernel {}; TEST_F(TestFilterKernelWithMap, FilterMapStringToInt32) { @@ -1034,29 +1093,34 @@ Status TakeJSON(const std::shared_ptr& type, const std::string& values .Value(out); } +void DoCheckTake(const std::shared_ptr& values, + const std::shared_ptr& indices, + const std::shared_ptr& expected) { + AssertTakeArrays(values, indices, expected); + + // Check sliced values + ASSERT_OK_AND_ASSIGN(auto values_filler, MakeArrayOfNull(values->type(), 2)); + ASSERT_OK_AND_ASSIGN(auto values_sliced, + Concatenate({values_filler, values, values_filler})); + values_sliced = values_sliced->Slice(2, values->length()); + AssertTakeArrays(values_sliced, indices, expected); + + // Check sliced indices + ASSERT_OK_AND_ASSIGN(auto zero, MakeScalar(indices->type(), int8_t{0})); + ASSERT_OK_AND_ASSIGN(auto indices_filler, MakeArrayFromScalar(*zero, 3)); + ASSERT_OK_AND_ASSIGN(auto indices_sliced, + Concatenate({indices_filler, indices, indices_filler})); + indices_sliced = indices_sliced->Slice(3, indices->length()); + AssertTakeArrays(values, indices_sliced, expected); +} + void CheckTake(const std::shared_ptr& type, const std::string& values_json, const std::string& indices_json, const std::string& expected_json) { auto values = ArrayFromJSON(type, values_json); auto expected = ArrayFromJSON(type, expected_json); - for (auto index_type : {int8(), uint32()}) { auto indices = ArrayFromJSON(index_type, indices_json); - AssertTakeArrays(values, indices, expected); - - // Check sliced values - ASSERT_OK_AND_ASSIGN(auto values_filler, MakeArrayOfNull(type, 2)); - ASSERT_OK_AND_ASSIGN(auto values_sliced, - Concatenate({values_filler, values, values_filler})); - values_sliced = values_sliced->Slice(2, values->length()); - 
AssertTakeArrays(values_sliced, indices, expected); - - // Check sliced indices - ASSERT_OK_AND_ASSIGN(auto zero, MakeScalar(index_type, int8_t{0})); - ASSERT_OK_AND_ASSIGN(auto indices_filler, MakeArrayFromScalar(*zero, 3)); - ASSERT_OK_AND_ASSIGN(auto indices_sliced, - Concatenate({indices_filler, indices, indices_filler})); - indices_sliced = indices_sliced->Slice(3, indices->length()); - AssertTakeArrays(values, indices_sliced, expected); + DoCheckTake(values, indices, expected); } } @@ -1427,7 +1491,25 @@ TEST_F(TestTakeKernelWithLargeList, TakeLargeListInt32) { CheckTake(large_list(int32()), list_json, "[null, 1, 2, 0]", "[null, [1,2], null, []]"); } -class TestTakeKernelWithFixedSizeList : public TestTakeKernelTyped {}; +class TestTakeKernelWithFixedSizeList : public TestTakeKernelTyped { + protected: + void CheckTakeOnNestedLists(const std::shared_ptr& inner_type, + const std::vector& list_sizes, int64_t length) { + using NLG = ::arrow::util::internal::NestedListGenerator; + // Create two equivalent lists: one as a FixedSizeList and another as a List. + ASSERT_OK_AND_ASSIGN(auto fsl_list, + NLG::NestedFSLArray(inner_type, list_sizes, length)); + ASSERT_OK_AND_ASSIGN(auto list, NLG::NestedListArray(inner_type, list_sizes, length)); + + ARROW_SCOPED_TRACE("CheckTakeOnNestedLists of type `", *fsl_list->type(), "`"); + + auto indices = ArrayFromJSON(int64(), "[1, 2, 4]"); + // Use the Take on ListType as the reference implementation. + ASSERT_OK_AND_ASSIGN(auto expected_list, Take(*list, *indices)); + ASSERT_OK_AND_ASSIGN(auto expected_fsl, Cast(*expected_list, fsl_list->type())); + DoCheckTake(fsl_list, indices, expected_fsl); + } +}; TEST_F(TestTakeKernelWithFixedSizeList, TakeFixedSizeListInt32) { std::string list_json = "[null, [1, null, 3], [4, 5, 6], [7, 8, null]]"; @@ -1449,6 +1531,42 @@ TEST_F(TestTakeKernelWithFixedSizeList, TakeFixedSizeListInt32) { "[0, 1, 0]"); } +TEST_F(TestTakeKernelWithFixedSizeList, TakeFixedSizeListVarWidth) { + std::string list_json = + R"([["zero", "one", ""], ["two", "", "three"], ["four", "five", "six"], ["seven", "eight", ""]])"; + CheckTake(fixed_size_list(utf8(), 3), list_json, "[]", "[]"); + CheckTake(fixed_size_list(utf8(), 3), list_json, "[3, 2, 1]", + R"([["seven", "eight", ""], ["four", "five", "six"], ["two", "", "three"]])"); + CheckTake(fixed_size_list(utf8(), 3), list_json, "[null, 2, 0]", + R"([null, ["four", "five", "six"], ["zero", "one", ""]])"); + CheckTake(fixed_size_list(utf8(), 3), list_json, R"([null, null])", "[null, null]"); + CheckTake( + fixed_size_list(utf8(), 3), list_json, "[3, 0, 0,3]", + R"([["seven", "eight", ""], ["zero", "one", ""], ["zero", "one", ""], ["seven", "eight", ""]])"); + CheckTake(fixed_size_list(utf8(), 3), list_json, "[0, 1, 2, 3]", list_json); + CheckTake(fixed_size_list(utf8(), 3), list_json, "[2, 2, 2, 2, 2, 2, 1]", + R"([ + ["four", "five", "six"], ["four", "five", "six"], + ["four", "five", "six"], ["four", "five", "six"], + ["four", "five", "six"], ["four", "five", "six"], + ["two", "", "three"] + ])"); +} + +TEST_F(TestTakeKernelWithFixedSizeList, TakeFixedSizeListModuloNesting) { + using NLG = ::arrow::util::internal::NestedListGenerator; + const std::vector> value_types = { + int16(), + int32(), + int64(), + }; + NLG::VisitAllNestedListConfigurations( + value_types, [this](const std::shared_ptr& inner_type, + const std::vector& list_sizes) { + this->CheckTakeOnNestedLists(inner_type, list_sizes, /*length=*/5); + }); +} + class TestTakeKernelWithMap : public TestTakeKernelTyped {}; 
TEST_F(TestTakeKernelWithMap, TakeMapStringToInt32) { diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index e26efba28594b..087e4e3879e56 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -56,6 +56,7 @@ add_arrow_test(utility-test compression_test.cc decimal_test.cc float16_test.cc + fixed_width_test.cc formatting_util_test.cc key_value_metadata_test.cc hashing_test.cc diff --git a/cpp/src/arrow/util/fixed_width_internal.cc b/cpp/src/arrow/util/fixed_width_internal.cc new file mode 100644 index 0000000000000..164af3cff66b3 --- /dev/null +++ b/cpp/src/arrow/util/fixed_width_internal.cc @@ -0,0 +1,226 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "arrow/array/data.h" +#include "arrow/compute/kernel.h" +#include "arrow/result.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/fixed_width_internal.h" +#include "arrow/util/logging.h" +#include "arrow/util/small_vector.h" + +namespace arrow::util { + +using ::arrow::internal::checked_cast; + +bool IsFixedWidthLike(const ArraySpan& source, bool force_null_count, + bool exclude_dictionary) { + return IsFixedWidthLike(source, force_null_count, + [exclude_dictionary](const DataType& type) { + return !exclude_dictionary || type.id() != Type::DICTIONARY; + }); +} + +static int64_t FixedWidthInBytesFallback(const FixedSizeListType& fixed_size_list_type) { + auto* fsl = &fixed_size_list_type; + int64_t list_size = fsl->list_size(); + for (auto type = fsl->value_type().get();;) { + if (type->id() == Type::FIXED_SIZE_LIST) { + fsl = checked_cast(type); + list_size *= fsl->list_size(); + type = fsl->value_type().get(); + continue; + } + if (type->id() != Type::BOOL && is_fixed_width(type->id())) { + const int64_t flat_byte_width = list_size * type->byte_width(); + DCHECK_GE(flat_byte_width, 0); + return flat_byte_width; + } + break; + } + return -1; +} + +int64_t FixedWidthInBytes(const DataType& type) { + auto type_id = type.id(); + if (is_fixed_width(type_id)) { + const int32_t num_bits = type.bit_width(); + return (type_id == Type::BOOL) ? 
-1 : num_bits / 8; + } + if (type_id == Type::FIXED_SIZE_LIST) { + auto& fsl = ::arrow::internal::checked_cast(type); + return FixedWidthInBytesFallback(fsl); + } + return -1; +} + +int64_t FixedWidthInBits(const DataType& type) { + auto type_id = type.id(); + if (is_fixed_width(type_id)) { + return type.bit_width(); + } + const int64_t byte_width = FixedWidthInBytes(type); + if (ARROW_PREDICT_FALSE(byte_width < 0)) { + return -1; + } + return byte_width * 8; +} + +namespace internal { + +Status PreallocateFixedWidthArrayData(::arrow::compute::KernelContext* ctx, + int64_t length, const ArraySpan& source, + bool allocate_validity, ArrayData* out) { + DCHECK(!source.MayHaveNulls() || allocate_validity) + << "allocate_validity cannot be false if source may have nulls"; + DCHECK_EQ(source.type->id(), out->type->id()); + auto* type = source.type; + out->length = length; + if (type->id() == Type::FIXED_SIZE_LIST) { + out->buffers.resize(1); + out->child_data = {std::make_shared()}; + } else { + out->buffers.resize(2); + } + if (allocate_validity) { + ARROW_ASSIGN_OR_RAISE(out->buffers[0], ctx->AllocateBitmap(length)); + } + + if (type->id() == Type::BOOL) { + ARROW_ASSIGN_OR_RAISE(out->buffers[1], ctx->AllocateBitmap(length)); + return Status::OK(); + } + if (is_fixed_width(type->id())) { + if (type->id() == Type::DICTIONARY) { + return Status::NotImplemented( + "PreallocateFixedWidthArrayData: DICTIONARY type allocation: ", *type); + } + ARROW_ASSIGN_OR_RAISE(out->buffers[1], + ctx->Allocate(length * source.type->byte_width())); + return Status::OK(); + } + if (type->id() == Type::FIXED_SIZE_LIST) { + auto& fsl_type = checked_cast(*type); + auto& value_type = fsl_type.value_type(); + if (ARROW_PREDICT_FALSE(value_type->id() == Type::BOOL)) { + return Status::Invalid("PreallocateFixedWidthArrayData: Invalid type: ", fsl_type); + } + if (ARROW_PREDICT_FALSE(value_type->id() == Type::DICTIONARY)) { + return Status::NotImplemented( + "PreallocateFixedWidthArrayData: DICTIONARY type allocation: ", *type); + } + if (source.child_data[0].MayHaveNulls()) { + return Status::Invalid( + "PreallocateFixedWidthArrayData: " + "FixedSizeList may have null values in child array: ", + fsl_type); + } + auto* child_values = out->child_data[0].get(); + child_values->type = value_type; + return PreallocateFixedWidthArrayData(ctx, length * fsl_type.list_size(), + /*source=*/source.child_data[0], + /*allocate_validity=*/false, + /*out=*/child_values); + } + return Status::Invalid("PreallocateFixedWidthArrayData: Invalid type: ", *type); +} + +} // namespace internal + +/// \pre same as OffsetPointerOfFixedWidthValues +/// \pre source.type->id() != Type::BOOL +static const uint8_t* OffsetPointerOfFixedWidthValuesFallback(const ArraySpan& source) { + using OffsetAndListSize = std::pair; + auto get_offset = [](auto pair) { return pair.first; }; + auto get_list_size = [](auto pair) { return pair.second; }; + ::arrow::internal::SmallVector stack; + + DCHECK_NE(source.type->id(), Type::BOOL); + + int64_t list_size = 1; + auto* array = &source; + while (array->type->id() == Type::FIXED_SIZE_LIST) { + list_size *= checked_cast(array->type)->list_size(); + stack.emplace_back(array->offset, list_size); + array = &array->child_data[0]; + } + // Now that innermost values were reached, pop the stack and calculate the offset + // in bytes of the innermost values buffer by considering the offset at each + // level of nesting. 
+ DCHECK(array->type->id() != Type::BOOL && is_fixed_width(*array->type)); + DCHECK(array == &source || !array->MayHaveNulls()) + << "OffsetPointerOfFixedWidthValues: array is expected to be flat or have no " + "nulls in the arrays nested by FIXED_SIZE_LIST."; + int64_t value_width = array->type->byte_width(); + int64_t offset_in_bytes = array->offset * value_width; + for (auto it = stack.rbegin(); it != stack.rend(); ++it) { + value_width *= get_list_size(*it); + offset_in_bytes += get_offset(*it) * value_width; + } + return value_width < 0 ? nullptr : array->GetValues(1, offset_in_bytes); +} + +const uint8_t* OffsetPointerOfFixedWidthValues(const ArraySpan& source) { + auto type_id = source.type->id(); + if (is_fixed_width(type_id)) { + if (ARROW_PREDICT_FALSE(type_id == Type::BOOL)) { + // BOOL arrays are bit-packed, thus a byte-aligned pointer cannot be produced in the + // general case. Returning something for BOOL arrays that happen to byte-align + // because offset=0 would create too much confusion. + return nullptr; + } + return source.GetValues(1, 0) + source.offset * source.type->byte_width(); + } + return OffsetPointerOfFixedWidthValuesFallback(source); +} + +/// \brief Get the mutable pointer to the fixed-width values of an array +/// allocated by PreallocateFixedWidthArrayData. +/// +/// \pre mutable_array->offset and the offset of child array (if it's a +/// FixedSizeList) MUST be 0 (recursively). +/// \pre IsFixedWidthLike(ArraySpan(mutable_array)) or the more restrictive +/// is_fixed_width(*mutable_array->type) MUST be true +/// \return The mutable pointer to the fixed-width byte blocks of the array. If +/// pre-conditions are not satisfied, the return values is undefined. +uint8_t* MutableFixedWidthValuesPointer(ArrayData* mutable_array) { + auto type_id = mutable_array->type->id(); + if (type_id == Type::FIXED_SIZE_LIST) { + auto* array = mutable_array; + do { + DCHECK_EQ(array->offset, 0); + DCHECK_EQ(array->child_data.size(), 1) << array->type->ToString(true) << " part of " + << mutable_array->type->ToString(true); + array = array->child_data[0].get(); + } while (array->type->id() == Type::FIXED_SIZE_LIST); + DCHECK_EQ(array->offset, 0); + DCHECK(array->type->id() != Type::BOOL && is_fixed_width(*array->type)); + return array->GetMutableValues(1, 0); + } + DCHECK_EQ(mutable_array->offset, 0); + // BOOL is allowed here only because the offset is expected to be 0, + // so the byte-aligned pointer also points to the first *bit* of the buffer. + DCHECK(is_fixed_width(type_id)); + return mutable_array->GetMutableValues(1, 0); +} + +} // namespace arrow::util diff --git a/cpp/src/arrow/util/fixed_width_internal.h b/cpp/src/arrow/util/fixed_width_internal.h new file mode 100644 index 0000000000000..f6959485fbd01 --- /dev/null +++ b/cpp/src/arrow/util/fixed_width_internal.h @@ -0,0 +1,307 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/array/data.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" +#include "arrow/type_traits.h" + +namespace arrow::compute { +// XXX: remove dependency on compute::KernelContext +class KernelContext; +} // namespace arrow::compute + +namespace arrow::util { + +/// \brief Checks if the given array has a fixed-width type or if it's an array of +/// fixed-size list that can be flattened to an array of fixed-width values. +/// +/// Fixed-width types are the ones defined by the is_fixed_width() predicate in +/// type_traits.h. They are all the types that passes any of the following +/// predicates: +/// +/// - is_primitive() +/// - is_fixed_size_binary() +/// - is_dictionary() +/// +/// At least 3 types in this set require special care: +/// - `Type::BOOL` is fixed-width, but it's a 1-bit type and pointers to first bit +/// in boolean buffers are not always aligned to byte boundaries. +/// - `Type::DICTIONARY` is fixed-width because the indices are fixed-width, but the +/// dictionary values are not necessarily fixed-width and have to be managed +/// by separate operations. +/// - Type::FIXED_SIZE_BINARY unlike other fixed-width types, fixed-size binary +/// values are defined by a size attribute that is not known at compile time. +/// The other types have power-of-2 byte widths, while fixed-size binary can +/// have any byte width including 0. +/// +/// Additionally, we say that a type is "fixed-width like" if it's a fixed-width as +/// defined above, or if it's a fixed-size list (or nested fixed-size lists) and +/// the innermost type is fixed-width and the following restrictions also apply: +/// - The value type of the innermost fixed-size list is not BOOL (it has to be excluded +/// because a 1-bit type doesn't byte-align) +/// - Only the top-level array may have nulls, all the inner array have to be completely +/// free of nulls so we don't need to manage internal validity bitmaps. +/// +/// Take the following `fixed_size_list, 3>` array as an +/// example: +/// +/// [ +/// [[1, 2], [3, 4], [ 5, 6]], +/// null, +/// [[7, 8], [9, 10], [11, 12]] +/// ] +/// +/// in memory, it would look like: +/// +/// { +/// type: fixed_size_list, 3>, +/// length: 3, +/// null_count: 1, +/// offset: 0, +/// buffers: [ +/// 0: [0b00000101] +/// ], +/// child_data: [ +/// 0: { +/// type: fixed_size_list, +/// length: 9, +/// null_count: 0, +/// offset: 0, +/// buffers: [0: NULL], +/// child_data: [ +/// 0: { +/// type: int32, +/// length: 18, +/// null_count: 0, +/// offset: 0, +/// buffers: [ +/// 0: NULL, +/// 1: [ 1, 2, 3, 4, 5, 6, +/// 0, 0, 0, 0, 0, 0 +/// 7, 8, 9, 10, 11, 12 ] +/// ], +/// child_data: [] +/// } +/// ] +/// } +/// ] +/// } +/// +/// This layout fits the fixed-width like definition because the innermost type +/// is byte-aligned fixed-width (int32 = 4 bytes) and the internal arrays don't +/// have nulls. The validity bitmap is only needed at the top-level array. +/// +/// Writing to this array can be done in the same way writing to a flat fixed-width +/// array is done, by: +/// 1. 
Updating the validity bitmap at the top-level array if nulls are present. +/// 2. Updating a continuous fixed-width block of memory through a single pointer. +/// +/// The length of this block of memory is the product of the list sizes in the +/// `FixedSizeList` types and the byte width of the innermost fixed-width type: +/// +/// 3 * 2 * 4 = 24 bytes +/// +/// Writing the `[[1, 2], [3, 4], [5, 6]]` value at a given index can be done by +/// simply setting the validity bit to 1 and writing the 24-byte sequence of +/// integers `[1, 2, 3, 4, 5, 6]` to the memory block at `byte_ptr + index * 24`. +/// +/// The length of the top-level array fully defines the lengths that all the nested +/// arrays must have, which makes defining all the lengths as easy as defining the +/// length of the top-level array. +/// +/// length = 3 +/// child_data[0].length == 3 * 3 == 9 +/// child_data[0].child_data[0].length == 3 * 3 * 2 == 18 +/// +/// child_data[0].child_data[0].buffers[1].size() >= +/// (3 * (3 * 2 * sizeof(int32)) == 3 * 24 == 72) +/// +/// Dealing with offsets is a bit involved. Let's say the array described above has +/// the offsets 2, 5, and 7: +/// +/// { +/// type: fixed_size_list, 3>, +/// offset: 2, +/// ... +/// child_data: [ +/// 0: { +/// type: fixed_size_list, +/// offset: 5, +/// ... +/// child_data: [ +/// 0: { +/// type: int32, +/// offset: 7, +/// buffers: [ +/// 0: NULL, +/// 1: [ 1, 1, 1, 1, 1, 1, 1, // 7 values skipped +/// 0,1, 0,1, 0,1, 0,1, 0,1, // 5 [x,x] values skipped +/// +/// 0,0,0,0,0,1, // +/// 0,0,0,0,0,1, // 2 [[x,x], [x,x], [x,x]] values skipped +/// +/// 1, 2, 3, 4, 5, 6, // +/// 0, 0, 0, 0, 0, 0 // the actual values +/// 7, 8, 9, 10, 11, 12 // +/// ] +/// ], +/// } +/// ] +/// } +/// ] +/// } +/// +/// The offset of the innermost values buffer, in bytes, is calculated as: +/// +/// ((2 * 3) + (5 * 2) + 7) * sizeof(int32) = 29 * 4 bytes = 116 bytes +/// +/// In general, the formula to calculate the offset of the innermost values buffer is: +/// +/// ((off_0 * fsl_size_0) + (off_1 * fsl_size_1) + ... + innermost_off) +/// * sizeof(innermost_type) +/// +/// `OffsetPointerOfFixedWidthValues()` can calculate this byte offset and return the +/// pointer to the first relevant byte of the innermost values buffer. +/// +/// \param source The array to check +/// \param force_null_count If true, GetNullCount() is used instead of null_count +/// \param exclude_dictionary If true, DICTIONARY is excluded from the +/// is_fixed_width() types. Default: false. +ARROW_EXPORT bool IsFixedWidthLike(const ArraySpan& source, bool force_null_count = false, + bool exclude_dictionary = false); + +/// \brief Checks if the given array has a fixed-width type or if it's an array of +/// fixed-size list that can be flattened to an array of fixed-width values. +/// +/// This function is a more general version of +/// `IsFixedWidthLike(const ArraySpan&, bool)` that allows the caller to further +/// restrict the inner value types that should be considered fixed-width. +/// +/// \param source The array to check +/// \param force_null_count If true, GetNullCount() is used instead of null_count +/// \param extra_predicate A DataType predicate that can be used to further +/// restrict the types that are considered fixed-width +template +inline bool IsFixedWidthLike(const ArraySpan& source, bool force_null_count, + ExtraPred extra_predicate) { + const auto* type = source.type; + // BOOL is considered fixed-width if not nested under FIXED_SIZE_LIST. 
+ if (is_fixed_width(type->id()) && extra_predicate(*type)) { + return true; + } + if (type->id() == Type::FIXED_SIZE_LIST) { + // All the inner arrays must not contain any nulls. + const auto* values = &source.child_data[0]; + while ((force_null_count ? values->GetNullCount() : values->null_count) == 0) { + type = values->type; + if (type->id() == Type::FIXED_SIZE_LIST) { + values = &values->child_data[0]; + continue; + } + // BOOL has to be excluded because it's not byte-aligned. + return type->id() != Type::BOOL && is_fixed_width(type->id()) && + extra_predicate(*type); + } + } + return false; +} + +/// \brief Get the fixed-width in bytes of a type if it is a fixed-width like +/// type, but not BOOL. +/// +/// If the array is a FixedSizeList (of any level of nesting), the byte width of +/// the values is the product of all fixed-list sizes and the byte width of the +/// innermost fixed-width value type. +/// +/// IsFixedWidthLike(array) performs more checks than this function and should +/// be used to guarantee that, if type is not BOOL, this function will not return -1. +/// +/// NOTE: this function translates `DataType::bit_width()` to bytes differently from +/// `DataType::byte_width()`. `DataType::byte_width()` will return 0 for +/// BOOL, while this function will return `-1`. This is done because 0 is +/// a valid return value for FIXED_SIZE_LIST with size 0 or `FIXED_SIZE_BINARY` with +/// size 0. +/// +/// \pre The instance of the array where this type is from must pass +/// `IsFixedWidthLike(array)` and should not be BOOL. +/// \return The fixed-byte width of the values or -1 if the type is BOOL or not +/// fixed-width like. 0 is a valid return value as fixed-size-lists +/// and fixed-size-binary with size 0 are allowed. +ARROW_EXPORT int64_t FixedWidthInBytes(const DataType& type); + +/// \brief Get the fixed-width in bits of a type if it is a fixed-width like +/// type. +/// +/// \return The bit-width of the values or -1 +/// \see FixedWidthInBytes +ARROW_EXPORT int64_t FixedWidthInBits(const DataType& type); + +namespace internal { + +/// \brief Allocate an ArrayData for a type that is fixed-width like. +/// +/// This function performs the same checks performed by +/// `IsFixedWidthLike(source, false)`. If `source.type` is not a simple +/// fixed-width type, caller should make sure it passes the +/// `IsFixedWidthLike(source)` checks. That guarantees that it's possible to +/// allocate an array that can serve as a destination for a kernel that writes values +/// through a single pointer to fixed-width byte blocks. +/// +/// \param[in] length The length of the array to allocate (unrelated to the length of +/// the source array) +/// \param[in] source The source array that carries the type information and the +/// validity bitmaps that are relevant for the type validation +/// when the source is a FixedSizeList. +/// \see IsFixedWidthLike +ARROW_EXPORT Status PreallocateFixedWidthArrayData(::arrow::compute::KernelContext* ctx, + int64_t length, + const ArraySpan& source, + bool allocate_validity, + ArrayData* out); + +} // namespace internal + +/// \brief Get the pointer to the fixed-width values of a fixed-width like array. +/// +/// This function might return NULLPTR if the type of the array is BOOL or +/// if the pre-conditions listed are not satisfied. The converse is not true +/// (i.e. not getting NULLPTR doesn't guarantee that source is a fixed-width +/// like array). 
+/// +/// \pre `IsFixedWidthLike(source)` or the more restrictive +/// is_fixed_width(*mutable_array->type) SHOULD be true +/// \return The pointer to the fixed-width values of an array or NULLPTR +/// if pre-conditions are not satisfied. +ARROW_EXPORT const uint8_t* OffsetPointerOfFixedWidthValues(const ArraySpan& source); + +/// \brief Get the mutable pointer to the fixed-width values of an array +/// allocated by PreallocateFixedWidthArrayData. +/// +/// \pre mutable_array->offset and the offset of child array (if it's a +/// FixedSizeList) MUST be 0 (recursively). +/// \pre IsFixedWidthLike(ArraySpan(mutable_array)) or the more restrictive +/// is_fixed_width(*mutable_array->type) MUST be true +/// \return The mutable pointer to the fixed-width byte blocks of the array. If +/// pre-conditions are not satisfied, the return values is undefined. +ARROW_EXPORT uint8_t* MutableFixedWidthValuesPointer(ArrayData* mutable_array); + +} // namespace arrow::util diff --git a/cpp/src/arrow/util/fixed_width_test.cc b/cpp/src/arrow/util/fixed_width_test.cc new file mode 100644 index 0000000000000..2f05221ed6535 --- /dev/null +++ b/cpp/src/arrow/util/fixed_width_test.cc @@ -0,0 +1,217 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// #include +// #include + +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/data.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/fixed_width_internal.h" + +namespace arrow::util { + +namespace { +bool NotBool(const DataType& type) { return type.id() != Type::BOOL; } +bool NotInt32(const DataType& type) { return type.id() != Type::INT32; } +} // namespace + +class TestFixedWidth : public ::testing::Test { + protected: + std::shared_ptr bool_array_array_; + std::shared_ptr int_array_array_; + std::shared_ptr fsl_bool_array_; + std::shared_ptr fsl_int_array_; + std::shared_ptr fsl_int_nulls_array_; + std::shared_ptr fsl_int_inner_nulls_array_; + std::shared_ptr dict_string_array_; + + std::shared_ptr fsl(int32_t list_size, + const std::shared_ptr& value_type) { + return fixed_size_list(value_type, list_size); + } + + public: + void SetUp() override { + bool_array_array_ = ArrayFromJSON(boolean(), "[true, false, null]"); + int_array_array_ = ArrayFromJSON(int32(), "[1, 0, null]"); + fsl_bool_array_ = ArrayFromJSON(fsl(2, boolean()), "[[true, false]]"); + fsl_int_array_ = ArrayFromJSON(fsl(2, int32()), "[[1, 0], [2, 3]]"); + fsl_int_nulls_array_ = ArrayFromJSON(fsl(2, int32()), "[[1, 0], null, [1, 2]]"); + fsl_int_inner_nulls_array_ = + ArrayFromJSON(fsl(2, int32()), "[[1, 0], [2, 3], [null, 2]]"); + dict_string_array_ = + ArrayFromJSON(dictionary(int32(), utf8()), R"(["Alice", "Bob", "Alice"])"); + } +}; + +TEST_F(TestFixedWidth, IsFixedWidth) { + auto arr = ArraySpan{*bool_array_array_->data()}; + // force_null_count doesn't matter because nulls at the top-level + // of the array are allowed by IsFixedWidthLike. + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false)); + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/true)); + + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false, NotInt32)); + ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/false, NotBool)); + + arr = ArraySpan{*int_array_array_->data()}; + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false)); + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/true)); + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false, NotBool)); +} + +TEST_F(TestFixedWidth, IsFixedWidthLike) { + auto arr = ArraySpan{*fsl_bool_array_->data()}; + // bools wrapped by fixed-size-list are not fixed-width because the + // innermost data buffer is a bitmap and won't byte-align. + ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/false)); + ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/true)); + + arr = ArraySpan{*fsl_int_array_->data()}; + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false)); + arr.null_count = kUnknownNullCount; + // force_null_count=true isn't necessary because nulls at the top-level + // of the array are allowed by IsFixedWidthLike. + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false)); + + arr.child_data[0].null_count = kUnknownNullCount; + // inner nulls are not allowed by IsFixedWidthLike... + ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/false)); + // ...but forcing null counting at on every internal array increases + // the chances of IsFixedWidthLike returning true. + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/true)); + // Excluding INT32 from the internal array checks. 
+ ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/true, NotInt32)); + + arr = ArraySpan{*fsl_int_nulls_array_->data()}; + // Nulls at the top-level of the array are allowed by IsFixedWidthLike. + // + // TODO(GH-10157): ArrayFromJSON uses FixedSizeListBuilder which currently + // produces nulls on the child data if one of the list-typed elements is null. + // ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false)); + + arr = ArraySpan{*fsl_int_inner_nulls_array_->data()}; + // Inner nulls are not allowed by IsFixedWidthLike. + ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/true)); + + arr = ArraySpan{*dict_string_array_->data()}; + // Dictionaries are considered fixed-width by is_fixed_width(), but excluded + // by IsFixedWidthLike if exclude_dictionary=true. + ASSERT_TRUE(IsFixedWidthLike(arr)); + ASSERT_TRUE( + IsFixedWidthLike(arr, /*force_null_count=*/false, /*exclude_dictionary=*/false)); + ASSERT_FALSE( + IsFixedWidthLike(arr, /*force_null_count=*/false, /*exclude_dictionary=*/true)); +} + +TEST_F(TestFixedWidth, MeasureWidthInBytes) { + auto b = boolean(); + auto i8 = int8(); + auto i32 = int32(); + auto fsb = fixed_size_binary(3); + auto dict = dictionary(int32(), utf8()); + auto varlen = utf8(); + ASSERT_EQ(FixedWidthInBytes(*b), -1); + ASSERT_EQ(FixedWidthInBytes(*i8), 1); + ASSERT_EQ(FixedWidthInBytes(*i32), 4); + ASSERT_EQ(FixedWidthInBytes(*fsb), 3); + ASSERT_EQ(FixedWidthInBytes(*dict), 4); + + ASSERT_EQ(FixedWidthInBytes(*varlen), -1); + ASSERT_EQ(FixedWidthInBytes(*varlen), -1); + + ASSERT_EQ(FixedWidthInBytes(*fsl(0, b)), -1); + ASSERT_EQ(FixedWidthInBytes(*fsl(3, b)), -1); + ASSERT_EQ(FixedWidthInBytes(*fsl(5, b)), -1); + + ASSERT_EQ(FixedWidthInBytes(*fsl(0, i8)), 0); + ASSERT_EQ(FixedWidthInBytes(*fsl(3, i8)), 3); + ASSERT_EQ(FixedWidthInBytes(*fsl(5, i8)), 5); + ASSERT_EQ(FixedWidthInBytes(*fsl(0, i32)), 0); + ASSERT_EQ(FixedWidthInBytes(*fsl(3, i32)), 3 * 4); + ASSERT_EQ(FixedWidthInBytes(*fsl(5, i32)), 5 * 4); + ASSERT_EQ(FixedWidthInBytes(*fsl(5, fsb)), 5 * 3); + ASSERT_EQ(FixedWidthInBytes(*fsl(5, dict)), 5 * 4); + + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(0, i8))), 0); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(3, i8))), 2 * 3); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(5, i8))), 2 * 5); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(0, i32))), 0); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(3, i32))), 2 * 3 * 4); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(5, i32))), 2 * 5 * 4); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(0, fsb))), 0); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(3, fsb))), 2 * 3 * 3); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(5, fsb))), 2 * 5 * 3); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(0, dict))), 0); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(3, dict))), 2 * 3 * 4); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(5, dict))), 2 * 5 * 4); + + ASSERT_EQ(FixedWidthInBytes(*fsl(0, varlen)), -1); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, varlen)), -1); +} + +TEST_F(TestFixedWidth, MeasureWidthInBits) { + auto b = boolean(); + auto i8 = int8(); + auto i32 = int32(); + auto fsb = fixed_size_binary(3); + auto dict = dictionary(int32(), utf8()); + auto varlen = utf8(); + ASSERT_EQ(FixedWidthInBits(*b), 1); + ASSERT_EQ(FixedWidthInBits(*i8), 8); + ASSERT_EQ(FixedWidthInBits(*i32), 4 * 8); + ASSERT_EQ(FixedWidthInBits(*fsb), 3 * 8); + ASSERT_EQ(FixedWidthInBits(*dict), 4 * 8); + + ASSERT_EQ(FixedWidthInBits(*varlen), -1); + ASSERT_EQ(FixedWidthInBits(*varlen), -1); + + ASSERT_EQ(FixedWidthInBits(*fsl(0, b)), -1); + ASSERT_EQ(FixedWidthInBits(*fsl(3, 
b)), -1); + ASSERT_EQ(FixedWidthInBits(*fsl(5, b)), -1); + + ASSERT_EQ(FixedWidthInBits(*fsl(0, i8)), 0); + ASSERT_EQ(FixedWidthInBits(*fsl(3, i8)), 3 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(5, i8)), 5 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(0, i32)), 0); + ASSERT_EQ(FixedWidthInBits(*fsl(3, i32)), 4 * 3 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(5, i32)), 4 * 5 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(5, fsb)), 5 * 3 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(5, dict)), 5 * 4 * 8); + + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(0, i8))), 0); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(3, i8))), 2 * 3 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(5, i8))), 2 * 5 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(0, i32))), 0); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(3, i32))), 2 * 3 * 4 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(5, i32))), 2 * 5 * 4 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(0, fsb))), 0); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(3, fsb))), 2 * 3 * 3 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(5, fsb))), 2 * 5 * 3 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(0, dict))), 0); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(3, dict))), 2 * 3 * 4 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(5, dict))), 2 * 5 * 4 * 8); + + ASSERT_EQ(FixedWidthInBits(*fsl(0, varlen)), -1); + ASSERT_EQ(FixedWidthInBits(*fsl(2, varlen)), -1); +} + +} // namespace arrow::util diff --git a/cpp/src/arrow/util/fixed_width_test_util.h b/cpp/src/arrow/util/fixed_width_test_util.h new file mode 100644 index 0000000000000..ca141b7ca2c4d --- /dev/null +++ b/cpp/src/arrow/util/fixed_width_test_util.h @@ -0,0 +1,203 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/array/builder_primitive.h" +#include "arrow/builder.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" + +namespace arrow::util::internal { + +class NestedListGenerator { + public: + /// \brief Create a nested FixedSizeListType. + /// + /// \return `fixed_size_list(fixed_size_list(..., sizes[1]), sizes[0])` + static std::shared_ptr NestedFSLType( + const std::shared_ptr& inner_type, const std::vector& sizes) { + auto type = inner_type; + for (auto it = sizes.rbegin(); it != sizes.rend(); it++) { + type = fixed_size_list(std::move(type), *it); + } + return type; + } + + /// \brief Create a nested FixedListType. 
+ /// + /// \return `list(list(...))` + static std::shared_ptr NestedListType( + const std::shared_ptr& inner_type, size_t depth) { + auto list_type = list(inner_type); + for (size_t i = 1; i < depth; i++) { + list_type = list(std::move(list_type)); + } + return list_type; + } + + private: + template + static Status AppendNumeric(ArrayBuilder* builder, int64_t* next_value) { + using NumericBuilder = ::arrow::NumericBuilder; + using value_type = typename NumericBuilder::value_type; + auto* numeric_builder = ::arrow::internal::checked_cast(builder); + auto cast_next_value = + static_cast(*next_value % std::numeric_limits::max()); + RETURN_NOT_OK(numeric_builder->Append(cast_next_value)); + *next_value += 1; + return Status::OK(); + } + + // Append([...[[*next_inner_value++, *next_inner_value++, ...]]...]) + static Status AppendNestedList(ArrayBuilder* nested_builder, const int* list_sizes, + int64_t* next_inner_value) { + using ::arrow::internal::checked_cast; + ArrayBuilder* builder = nested_builder; + auto type = builder->type(); + if (type->id() == Type::FIXED_SIZE_LIST || type->id() == Type::LIST) { + const int list_size = *list_sizes; + if (type->id() == Type::FIXED_SIZE_LIST) { + auto* fsl_builder = checked_cast(builder); + assert(list_size == checked_cast(*type).list_size()); + RETURN_NOT_OK(fsl_builder->Append()); + builder = fsl_builder->value_builder(); + } else { // type->id() == Type::LIST) + auto* list_builder = checked_cast(builder); + RETURN_NOT_OK(list_builder->Append(/*is_valid=*/true, list_size)); + builder = list_builder->value_builder(); + } + list_sizes++; + for (int i = 0; i < list_size; i++) { + RETURN_NOT_OK(AppendNestedList(builder, list_sizes, next_inner_value)); + } + } else { + switch (type->id()) { + case Type::INT8: + RETURN_NOT_OK(AppendNumeric(builder, next_inner_value)); + break; + case Type::INT16: + RETURN_NOT_OK(AppendNumeric(builder, next_inner_value)); + break; + case Type::INT32: + RETURN_NOT_OK(AppendNumeric(builder, next_inner_value)); + break; + case Type::INT64: + RETURN_NOT_OK(AppendNumeric(builder, next_inner_value)); + break; + default: + return Status::NotImplemented("Unsupported type: ", *type); + } + } + return Status::OK(); + } + + static Result> NestedListArray( + ArrayBuilder* nested_builder, const std::vector& list_sizes, int64_t length) { + int64_t next_inner_value = 0; + for (int64_t i = 0; i < length; i++) { + RETURN_NOT_OK( + AppendNestedList(nested_builder, list_sizes.data(), &next_inner_value)); + } + return nested_builder->Finish(); + } + + public: + static Result> NestedFSLArray( + const std::shared_ptr& inner_type, const std::vector& list_sizes, + int64_t length) { + auto nested_type = NestedFSLType(inner_type, list_sizes); + ARROW_ASSIGN_OR_RAISE(auto builder, MakeBuilder(nested_type)); + return NestedListArray(builder.get(), list_sizes, length); + } + + static Result> NestedListArray( + const std::shared_ptr& inner_type, const std::vector& list_sizes, + int64_t length) { + auto nested_type = NestedListType(inner_type, list_sizes.size()); + ARROW_ASSIGN_OR_RAISE(auto builder, MakeBuilder(nested_type)); + return NestedListArray(builder.get(), list_sizes, length); + } + + /// \brief Generate all possible nested list configurations of depth 1 to max_depth. + /// + /// Each configuration consists of a single inner value type and a list of sizes. + /// Both can be used with NestedFSLArray and NestedListArray to generate test data. 
+ /// + /// The product of the list sizes and the size of the inner value type is always a power + /// of 2 no greater than max_power_of_2_size. For max_depth=3 and + /// max_power_of_2_size=32, this generates 108 configurations. + /// + /// \tparam Visit a function type with signature + /// void(const std::shared_ptr& inner_type, + /// const std::vector& list_sizes) + template + static void VisitAllNestedListConfigurations( + const std::vector>& inner_value_types, Visit&& visit, + int max_depth = 3, int max_power_of_2_size = 32) { + for (int depth = 1; depth <= max_depth; depth++) { + for (auto& type : inner_value_types) { + assert(is_fixed_width(*type)); + int value_width = type->byte_width(); + + std::vector list_sizes; // stack of list sizes + auto pop = [&]() { // pop the list_sizes stack + assert(!list_sizes.empty()); + value_width /= list_sizes.back(); + list_sizes.pop_back(); + }; + auto next = [&]() { // double the top of the stack + assert(!list_sizes.empty()); + value_width *= 2; + list_sizes.back() *= 2; + return value_width; + }; + auto push_1s = [&]() { // fill the stack with 1s + while (list_sizes.size() < static_cast(depth)) { + list_sizes.push_back(1); + } + }; + + // Loop invariants: + // value_width == product(list_sizes) * type->byte_width() + // value_width is a power-of-2 (1, 2, 4, 8, 16, max_power_of_2_size=32) + push_1s(); + do { + // for (auto x : list_sizes) printf("%d * ", x); + // printf("(%s) %d = %2d\n", type->name().c_str(), type->byte_width(), + // value_width); + visit(type, list_sizes); + // Advance to the next test case + while (!list_sizes.empty()) { + if (next() <= max_power_of_2_size) { + push_1s(); + break; + } + pop(); + } + } while (!list_sizes.empty()); + } + } + } +}; + +} // namespace arrow::util::internal From 0d8b3791cda042224427ccbb4fc2fc3ec0f27b61 Mon Sep 17 00:00:00 2001 From: Jacek Stania <38670505+janosik47@users.noreply.github.com> Date: Fri, 3 May 2024 07:24:06 +0100 Subject: [PATCH 029/105] GH-35888: [Java] Add FlightStatusCode.RESOURCE_EXHAUSTED (#41508) ### Rationale for this change Related to https://github.com/apache/arrow/issues/35888 Currently the gRPC Status.RESOURCE_EXHAUSTED exception/code is translated by the Java FlightServer into FlightStatusCode.INVALID_ARGUMENT and thrown to the client as gRPC INVALID_ARGUMENT exception. That may mislead the other party as the INVALID_ARGUMENT indicates an input parameters problem where in reality the backed server intention was rather 'back off and try later'. ### What changes are included in this PR? Add the FlightStatusCode.RESOURCE_EXHAUSTED code and make sure is translated from/to the gRPC Status.RESOURCE_EXHAUSTED ### Are these changes tested? Unit tests included to validate the RESOURCE_EXHAUSTED translation between flight and grpc codes. ### Are there any user-facing changes? No. Users may start seeing RESOURCE_EXHAUSTED instead of INVALID_ARGUMENT code. In both cases this is an exception seen on the client side so I am considering this as a _not breaking change to any public API_. Although, may have an influence in the client side flows if one decided to react conditionally on exception status code. 
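For illustration only (not part of this patch): a minimal client-side sketch of reacting conditionally to the new status code, as mentioned above. The helper class, method names, and back-off policy are hypothetical; only `FlightClient`, `FlightRuntimeException.status()`, and `FlightStatusCode.RESOURCE_EXHAUSTED` come from the Flight Java API.

```java
import org.apache.arrow.flight.FlightClient;
import org.apache.arrow.flight.FlightRuntimeException;
import org.apache.arrow.flight.FlightStatusCode;
import org.apache.arrow.flight.Ticket;

final class ResourceExhaustedAwareFetch {  // hypothetical helper, not part of this PR
  static void fetchWithBackoff(FlightClient client, Ticket ticket) throws InterruptedException {
    try {
      // Any Flight call that may now surface RESOURCE_EXHAUSTED instead of INVALID_ARGUMENT.
      client.getStream(ticket).next();
    } catch (FlightRuntimeException e) {
      if (e.status().code() == FlightStatusCode.RESOURCE_EXHAUSTED) {
        Thread.sleep(1_000);  // hypothetical fixed back-off before the caller retries
      } else {
        throw e;  // other codes keep their existing meaning
      }
    }
  }
}
```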
* GitHub Issue: #35888 Authored-by: Jacek Stania Signed-off-by: David Li --- .../org/apache/arrow/flight/CallStatus.java | 1 + .../apache/arrow/flight/FlightStatusCode.java | 5 +++++ .../apache/arrow/flight/grpc/StatusUtils.java | 4 +++- .../arrow/flight/grpc/TestStatusUtils.java | 22 +++++++++++++++++++ 4 files changed, 31 insertions(+), 1 deletion(-) diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/CallStatus.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/CallStatus.java index 991d0ed6a043b..8fc2002207e24 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/CallStatus.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/CallStatus.java @@ -49,6 +49,7 @@ public class CallStatus { public static final CallStatus UNAUTHORIZED = FlightStatusCode.UNAUTHORIZED.toStatus(); public static final CallStatus UNIMPLEMENTED = FlightStatusCode.UNIMPLEMENTED.toStatus(); public static final CallStatus UNAVAILABLE = FlightStatusCode.UNAVAILABLE.toStatus(); + public static final CallStatus RESOURCE_EXHAUSTED = FlightStatusCode.RESOURCE_EXHAUSTED.toStatus(); /** * Create a new status. diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStatusCode.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStatusCode.java index 3d96877ba02de..09a2c7afda106 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStatusCode.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStatusCode.java @@ -71,6 +71,11 @@ public enum FlightStatusCode { * should send this code only if it has not done any work. */ UNAVAILABLE, + /** + * Some resource has been exhausted, perhaps a per-user quota, or perhaps the entire file system is out of space. 
+ * (see: https://grpc.github.io/grpc/core/md_doc_statuscodes.html) + */ + RESOURCE_EXHAUSTED ; /** diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/grpc/StatusUtils.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/grpc/StatusUtils.java index 7f0dcf2da3f0d..a2d9a85aaa442 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/grpc/StatusUtils.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/grpc/StatusUtils.java @@ -74,6 +74,8 @@ public static Status.Code toGrpcStatusCode(FlightStatusCode code) { return Code.UNIMPLEMENTED; case UNAVAILABLE: return Code.UNAVAILABLE; + case RESOURCE_EXHAUSTED: + return Code.RESOURCE_EXHAUSTED; default: return Code.UNKNOWN; } @@ -101,7 +103,7 @@ public static FlightStatusCode fromGrpcStatusCode(Status.Code code) { case PERMISSION_DENIED: return FlightStatusCode.UNAUTHORIZED; case RESOURCE_EXHAUSTED: - return FlightStatusCode.INVALID_ARGUMENT; + return FlightStatusCode.RESOURCE_EXHAUSTED; case FAILED_PRECONDITION: return FlightStatusCode.INVALID_ARGUMENT; case ABORTED: diff --git a/java/flight/flight-core/src/test/java/org/apache/arrow/flight/grpc/TestStatusUtils.java b/java/flight/flight-core/src/test/java/org/apache/arrow/flight/grpc/TestStatusUtils.java index 9912a26ea340a..730ea30a2f598 100644 --- a/java/flight/flight-core/src/test/java/org/apache/arrow/flight/grpc/TestStatusUtils.java +++ b/java/flight/flight-core/src/test/java/org/apache/arrow/flight/grpc/TestStatusUtils.java @@ -48,4 +48,26 @@ public void testParseTrailers() { Assertions.assertTrue(callStatus.metadata().containsKey("content-type")); Assertions.assertEquals("text/html", callStatus.metadata().get("content-type")); } + + @Test + public void testGrpcResourceExhaustedTranslatedToFlightStatus() { + Status status = Status.RESOURCE_EXHAUSTED; + + CallStatus callStatus = StatusUtils.fromGrpcStatus(status); + Assertions.assertEquals(FlightStatusCode.RESOURCE_EXHAUSTED, callStatus.code()); + + FlightStatusCode flightStatusCode = StatusUtils.fromGrpcStatusCode(status.getCode()); + Assertions.assertEquals(FlightStatusCode.RESOURCE_EXHAUSTED, flightStatusCode); + } + + @Test + public void testFlightResourceExhaustedTranslatedToGrpcStatua() { + CallStatus callStatus = CallStatus.RESOURCE_EXHAUSTED; + + Status.Code grpcStatusCode = StatusUtils.toGrpcStatusCode(callStatus.code()); + Assertions.assertEquals(Status.RESOURCE_EXHAUSTED.getCode(), grpcStatusCode); + + Status grpcStatus = StatusUtils.toGrpcStatus(callStatus); + Assertions.assertEquals(Status.RESOURCE_EXHAUSTED.getCode(), grpcStatus.getCode()); + } } From 2eb47efc04707145edbf6a2ad8a750138a585682 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JB=20Onofr=C3=A9?= Date: Fri, 3 May 2024 08:44:41 +0200 Subject: [PATCH 030/105] MINOR: Increase the open-pull-requests-limit for dependabot (#41499) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change By default, dependabot opens a maximum of five pull requests for version updates. Once there are five open pull requests from dependabot, dependabot will not open any new requests until some of those open requests are merged or closed. With this change, dependabot can open up to 50 pull requests for Maven, and 10 pull requests for other systems. ### What changes are included in this PR? Update configuration for dependabot. ### Are these changes tested? Tested on other ASF projects 😄 ### Are there any user-facing changes? 
No Authored-by: JB Onofré Signed-off-by: David Li --- .github/dependabot.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index e96cb8d2eb1e3..7d9ff2f42e887 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -23,30 +23,35 @@ updates: interval: "weekly" commit-message: prefix: "MINOR: [CI] " + open-pull-requests-limit: 10 - package-ecosystem: "gomod" directory: "/go/" schedule: interval: "weekly" commit-message: prefix: "MINOR: [Go] " + open-pull-requests-limit: 10 - package-ecosystem: "maven" directory: "/java/" schedule: interval: "weekly" commit-message: prefix: "MINOR: [Java] " + open-pull-requests-limit: 10 - package-ecosystem: "npm" directory: "/js/" schedule: interval: "monthly" commit-message: prefix: "MINOR: [JS] " + open-pull-requests-limit: 10 - package-ecosystem: "nuget" directory: "/csharp/" schedule: interval: "weekly" commit-message: prefix: "MINOR: [C#] " + open-pull-requests-limit: 10 ignore: - dependency-name: "Microsoft.Extensions.*" update-types: From c0aade5f624e2ec64d5d8743df9b95c7eee50117 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Fri, 3 May 2024 12:16:02 +0200 Subject: [PATCH 031/105] GH-41462: [CI] Temporary pin azurite to v3.29.0 (#41501) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change install_azurite.sh is failing to install the latest version of Azurite and azure tests were failing. ### What changes are included in this PR? Temporarily pin azurite to v3.29.0 to unblock 16.1.0 release. A follow up issue is tracked here: https://github.com/apache/arrow/issues/41505 ### Are these changes tested? Yes via archery ### Are there any user-facing changes? No * GitHub Issue: #41462 Authored-by: Raúl Cumplido Signed-off-by: Raúl Cumplido --- ci/scripts/install_azurite.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ci/scripts/install_azurite.sh b/ci/scripts/install_azurite.sh index 2e7008360fdc3..dda5e99405b7f 100755 --- a/ci/scripts/install_azurite.sh +++ b/ci/scripts/install_azurite.sh @@ -19,17 +19,18 @@ set -e +# Pin azurite to 3.29.0 due to https://github.com/apache/arrow/issues/41505 case "$(uname)" in Darwin) - npm install -g azurite + npm install -g azurite@v3.29.0 which azurite ;; MINGW*) choco install nodejs.install - npm install -g azurite + npm install -g azurite@v3.29.0 ;; Linux) - npm install -g azurite + npm install -g azurite@v3.29.0 which azurite ;; esac From c8cf61c569886ff18c88e29c447a98ab6dedbd92 Mon Sep 17 00:00:00 2001 From: Sarah Gilmore <74676073+sgilmore10@users.noreply.github.com> Date: Fri, 3 May 2024 09:42:28 -0400 Subject: [PATCH 032/105] GH-41385: [CI][MATLAB][Packaging] Add support for MATLAB `R2024a` in CI and crossbow packaging workflows (#41504) ### Rationale for this change MATLAB `R2024a` is the latest available version of MATLAB as of April 2024. We are currently building against MATLAB `R2023a` in CI and for the crossbow packaging workflow. We should update the version of MATLAB we support to the latest available version. We previously created an issue to use `R2023b` (https://github.com/apache/arrow/issues/37809). However, `R2024a` has become publicly available since then. ### What changes are included in this PR? 1. Changed the `release` argument supplied to `matlab-actions/setup-matlab@ v2` to `R2024a` from `R2023a` in `.github/workflows/matlab.yml` and `dev/tasks/matlab/github.yml`. 2. 
Updated the script used to package the MLTBX file (`arrow/matlab/tools/packageMatlabInterface.m`) to specify the version of MATLAB currently running as the only compatible version of MATLAB for the interface. 3. Updated display tests to react to a change in how class names are formatted when an objected is displayed in the Command Window. ### Are these changes tested? Yes. Existing tests used. ### Are there any user-facing changes? There are no changes to the MATLAB Arrow Interface. However, the MATLAB release used to build, test, and package the interface has been upgraded. ### Future Work 1. We may want to consider building and packaging against multiple versions of MATLAB in parallel, rather than just the latest. This would require some more thought on how many releases back to support over time, however. 2. #41435 * GitHub Issue: #41385 Lead-authored-by: Sarah Gilmore Co-authored-by: Sarah Gilmore <74676073+sgilmore10@users.noreply.github.com> Co-authored-by: Sutou Kouhei Signed-off-by: Sarah Gilmore --- .github/workflows/matlab.yml | 6 +++--- dev/tasks/matlab/github.yml | 8 ++++---- .../+internal/+test/+display/makeLinkString.m | 4 ++-- .../+tabular/+internal/+display/getSchemaString.m | 2 +- matlab/tools/packageMatlabInterface.m | 15 ++++++++++++--- 5 files changed, 22 insertions(+), 13 deletions(-) diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml index aa3692e587961..8a0de8a365661 100644 --- a/.github/workflows/matlab.yml +++ b/.github/workflows/matlab.yml @@ -70,7 +70,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2023a + release: R2024a - name: Install ccache run: sudo apt-get install ccache - name: Setup ccache @@ -110,7 +110,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2023a + release: R2024a - name: Install ccache run: brew install ccache - name: Setup ccache @@ -148,7 +148,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2023a + release: R2024a - name: Download Timezone Database shell: bash run: ci/scripts/download_tz_database.sh diff --git a/dev/tasks/matlab/github.yml b/dev/tasks/matlab/github.yml index 13fa36b501125..7840fd176705c 100644 --- a/dev/tasks/matlab/github.yml +++ b/dev/tasks/matlab/github.yml @@ -31,7 +31,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v1 with: - release: R2023a + release: R2024a - name: Build MATLAB Interface env: {{ macros.github_set_sccache_envvars()|indent(8) }} @@ -68,7 +68,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v1 with: - release: R2023a + release: R2024a - name: Build MATLAB Interface env: {{ macros.github_set_sccache_envvars()|indent(8) }} @@ -103,7 +103,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v1 with: - release: R2023a + release: R2024a - name: Install sccache shell: bash run: arrow/ci/scripts/install_sccache.sh pc-windows-msvc $(pwd)/sccache @@ -149,7 +149,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v1 with: - release: R2023a + release: R2024a - name: Run commands env: MATLABPATH: arrow/matlab/tools diff --git a/matlab/src/matlab/+arrow/+internal/+test/+display/makeLinkString.m b/matlab/src/matlab/+arrow/+internal/+test/+display/makeLinkString.m index 79065ba1c8cfd..e99dd7d78488d 100644 --- a/matlab/src/matlab/+arrow/+internal/+test/+display/makeLinkString.m +++ b/matlab/src/matlab/+arrow/+internal/+test/+display/makeLinkString.m @@ -26,11 +26,11 @@ end if opts.BoldFont - link = 
compose("%s", ... opts.FullClassName, opts.ClassName); else - link = compose("%s", ... + link = compose("%s", ... opts.FullClassName, opts.ClassName); end end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+tabular/+internal/+display/getSchemaString.m b/matlab/src/matlab/+arrow/+tabular/+internal/+display/getSchemaString.m index 7da945ca993ef..724b4873c92e1 100644 --- a/matlab/src/matlab/+arrow/+tabular/+internal/+display/getSchemaString.m +++ b/matlab/src/matlab/+arrow/+tabular/+internal/+display/getSchemaString.m @@ -43,7 +43,7 @@ classNameAndIDs = strings([1 numel(typeIDs) * 2]); classNameAndIDs(1:2:end-1) = classNames; classNameAndIDs(2:2:end) = typeIDs; - typeIDs = compose("%s", classNameAndIDs); + typeIDs = compose("%s", classNameAndIDs); end text = names + ": " + typeIDs; diff --git a/matlab/tools/packageMatlabInterface.m b/matlab/tools/packageMatlabInterface.m index 55b4d4241a569..3d970002614ab 100644 --- a/matlab/tools/packageMatlabInterface.m +++ b/matlab/tools/packageMatlabInterface.m @@ -55,9 +55,18 @@ opts.SupportedPlatforms.Glnxa64 = true; opts.SupportedPlatforms.MatlabOnline = true; -% Interface is only qualified against R2023a at the moment -opts.MinimumMatlabRelease = "R2023a"; -opts.MaximumMatlabRelease = "R2023a"; +% MEX files use run-time libraries shipped with MATLAB (e.g. libmx, libmex, +% etc.). MEX files linked against earlier versions of MALTAB run-time libraries +% will most likely work on newer versions of MATLAB. However, this may not +% always be the case. +% +% For now, set the earliest and latest compatible releases of MATLAB to +% the release of MATLAB used to build and package the MATLAB Arrow Interface. +% +% See: https://www.mathworks.com/help/matlab/matlab_external/version-compatibility.html +currentRelease = matlabRelease.Release; +opts.MinimumMatlabRelease = currentRelease; +opts.MaximumMatlabRelease = currentRelease; opts.OutputFile = fullfile(outputFolder, compose("matlab-arrow-%s.mltbx", toolboxVersionRaw)); disp("Output File: " + opts.OutputFile); From 32916f8355858ccea91df402f67696953f9dd298 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 23:06:36 +0900 Subject: [PATCH 033/105] MINOR: [JS] Bump @swc/core from 1.4.14 to 1.4.17 in /js (#41519) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [@ swc/core](https://github.com/swc-project/swc) from 1.4.14 to 1.4.17.
Changelog

Sourced from @​swc/core's changelog.

[1.4.17] - 2024-04-23

Bug Fixes

  • (es) Ignore sourceMappingURL in string literals (#8879) (d7188cd)

  • (es/codegen) Use Str.raw for es5 (#8873) (c7a06b1)

  • (es/compat) Fix async generator (#8881) (063eabd)

  • (es/resolver) Prioritize jsc.paths by length in tsc resolver (#8875) (e22c368)

  • (html/codegen) Expand elements before which body isn’t elided (#8877) (5419a94)

[1.4.16] - 2024-04-18

Bug Fixes

  • (es/helpers) Fix resolving of usingCtx helper (#8874) (6e9d1a4)

[1.4.15] - 2024-04-17

Bug Fixes

  • (es/codegen) Fix ascii_only for identifiers (#8866) (2075a23)

  • (es/minifier) Remove raw of strings after modification (#8865) (740c0bb)

  • (es/parser) Fix span of BindingIdent (#8859) (fbd32fb)

  • (es/proposal) Update explicit resource management to match spec (#8860) (6d24076)

Features

  • (es/transforms) Allocate stacks dynamically (#8867) (a1c5415)

... (truncated)

Commits
  • 3311da7 chore: Publish 1.4.17 with swc_core v0.90.37
  • b1c22d5 chore: Improve publish script
  • fedf06f chore: Publish 1.4.17-nightly-20240423.3
  • 2c1e959 chore: Publish 1.4.17-nightly-20240423.2
  • 7b08d38 chore: Update bindings
  • 2fbb864 chore: Publish 1.4.17-nightly-20240423.1
  • 6d3c41d chore: Bump crates
  • c7a06b1 fix(es/codegen): Use Str.raw for es5 (#8873)
  • f5e50c2 chore: Bump crates
  • 063eabd fix(es/compat): Fix async generator (#8881)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=@ swc/core&package-manager=npm_and_yarn&previous-version=1.4.14&new-version=1.4.17)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- js/package.json | 2 +- js/yarn.lock | 128 ++++++++++++++++++++++++------------------------ 2 files changed, 65 insertions(+), 65 deletions(-) diff --git a/js/package.json b/js/package.json index e9590a188820f..08f2fe83d13e8 100644 --- a/js/package.json +++ b/js/package.json @@ -67,7 +67,7 @@ "@rollup/plugin-alias": "5.1.0", "@rollup/plugin-node-resolve": "15.2.3", "@rollup/stream": "3.0.1", - "@swc/core": "1.4.14", + "@swc/core": "1.4.17", "@types/benchmark": "2.1.5", "@types/glob": "8.1.0", "@types/jest": "29.5.12", diff --git a/js/yarn.lock b/js/yarn.lock index ab092675b4806..47674bd8b2168 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -1124,74 +1124,74 @@ dependencies: "@sinonjs/commons" "^3.0.0" -"@swc/core-darwin-arm64@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-darwin-arm64/-/core-darwin-arm64-1.4.14.tgz#de570252c3f155f55536f0d6bb8bafaec2e99616" - integrity sha512-8iPfLhYNspBl836YYsfv6ErXwDUqJ7IMieddV3Ey/t/97JAEAdNDUdtTKDtbyP0j/Ebyqyn+fKcqwSq7rAof0g== - -"@swc/core-darwin-x64@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-darwin-x64/-/core-darwin-x64-1.4.14.tgz#4eefbe129e416f4c400656742ab7f30e01aff02e" - integrity sha512-9CqSj8uRZ92cnlgAlVaWMaJJBdxtNvCzJxaGj5KuIseeG6Q0l1g+qk8JcU7h9dAsH9saHTNwNFBVGKQo0W0ujg== - -"@swc/core-linux-arm-gnueabihf@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-linux-arm-gnueabihf/-/core-linux-arm-gnueabihf-1.4.14.tgz#bea4b94c32bb25de2816126dac299655529ba7f3" - integrity sha512-mfd5JArPITTzMjcezH4DwMw+BdjBV1y25Khp8itEIpdih9ei+fvxOOrDYTN08b466NuE2dF2XuhKtRLA7fXArQ== - -"@swc/core-linux-arm64-gnu@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-linux-arm64-gnu/-/core-linux-arm64-gnu-1.4.14.tgz#52063214f4a14d6a0c3c6059ed9e7ba1062f6b46" - integrity sha512-3Lqlhlmy8MVRS9xTShMaPAp0oyUt0KFhDs4ixJsjdxKecE0NJSV/MInuDmrkij1C8/RQ2wySRlV9np5jK86oWw== - -"@swc/core-linux-arm64-musl@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-linux-arm64-musl/-/core-linux-arm64-musl-1.4.14.tgz#7e7deea7b1b3d0c9944cc8e9ba948fcc785158ea" - integrity sha512-n0YoCa64TUcJrbcXIHIHDWQjdUPdaXeMHNEu7yyBtOpm01oMGTKP3frsUXIABLBmAVWtKvqit4/W1KVKn5gJzg== - -"@swc/core-linux-x64-gnu@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-linux-x64-gnu/-/core-linux-x64-gnu-1.4.14.tgz#301133ea3ee347568886f2489837e991e96d44db" - integrity sha512-CGmlwLWbfG1dB4jZBJnp2IWlK5xBMNLjN7AR5kKA3sEpionoccEnChOEvfux1UdVJQjLRKuHNV9yGyqGBTpxfQ== - -"@swc/core-linux-x64-musl@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-linux-x64-musl/-/core-linux-x64-musl-1.4.14.tgz#86b8e987a814209cd0dd0f21cbc1134305dfffd5" - integrity sha512-xq4npk8YKYmNwmr8fbvF2KP3kUVdZYfXZMQnW425gP3/sn+yFQO8Nd0bGH40vOVQn41kEesSe0Z5O/JDor2TgQ== - -"@swc/core-win32-arm64-msvc@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-win32-arm64-msvc/-/core-win32-arm64-msvc-1.4.14.tgz#eb56b8977e3542665929c3963bd7dc18fe5b2556" - integrity sha512-imq0X+gU9uUe6FqzOQot5gpKoaC00aCUiN58NOzwp0QXEupn8CDuZpdBN93HiZswfLruu5jA1tsc15x6v9p0Yg== - -"@swc/core-win32-ia32-msvc@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-win32-ia32-msvc/-/core-win32-ia32-msvc-1.4.14.tgz#72e119038b9d8743b13bb933b8e192acd9f501f9" - integrity 
sha512-cH6QpXMw5D3t+lpx6SkErHrxN0yFzmQ0lgNAJxoDRiaAdDbqA6Col8UqUJwUS++Ul6aCWgNhCdiEYehPaoyDPA== - -"@swc/core-win32-x64-msvc@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-win32-x64-msvc/-/core-win32-x64-msvc-1.4.14.tgz#f5a3b1a241708b0628a07458e5bedbf67a1b9595" - integrity sha512-FmZ4Tby4wW65K/36BKzmuu7mlq7cW5XOxzvufaSNVvQ5PN4OodAlqPjToe029oma4Av+ykJiif64scMttyNAzg== - -"@swc/core@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core/-/core-1.4.14.tgz#8bad316c0119f626bb1b181ba7a988ef9d14e9cc" - integrity sha512-tHXg6OxboUsqa/L7DpsCcFnxhLkqN/ht5pCwav1HnvfthbiNIJypr86rNx4cUnQDJepETviSqBTIjxa7pSpGDQ== +"@swc/core-darwin-arm64@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-darwin-arm64/-/core-darwin-arm64-1.4.17.tgz#e62fa7f247bdd1c0c50a3f99722da4dd098c7c67" + integrity sha512-HVl+W4LezoqHBAYg2JCqR+s9ife9yPfgWSj37iIawLWzOmuuJ7jVdIB7Ee2B75bEisSEKyxRlTl6Y1Oq3owBgw== + +"@swc/core-darwin-x64@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-darwin-x64/-/core-darwin-x64-1.4.17.tgz#1145cbb7575e317204ed3a7d0274bd26fe9ffab6" + integrity sha512-WYRO9Fdzq4S/he8zjW5I95G1zcvyd9yyD3Tgi4/ic84P5XDlSMpBDpBLbr/dCPjmSg7aUXxNQqKqGkl6dQxYlA== + +"@swc/core-linux-arm-gnueabihf@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-linux-arm-gnueabihf/-/core-linux-arm-gnueabihf-1.4.17.tgz#7145b3ada5cf9b748eaacbc9a7c7037ba0fb26bb" + integrity sha512-cgbvpWOvtMH0XFjvwppUCR+Y+nf6QPaGu6AQ5hqCP+5Lv2zO5PG0RfasC4zBIjF53xgwEaaWmGP5/361P30X8Q== + +"@swc/core-linux-arm64-gnu@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-linux-arm64-gnu/-/core-linux-arm64-gnu-1.4.17.tgz#5c0833ef132af17bd3cbdf2253f35b57c0cf62bb" + integrity sha512-l7zHgaIY24cF9dyQ/FOWbmZDsEj2a9gRFbmgx2u19e3FzOPuOnaopFj0fRYXXKCmtdx+anD750iBIYnTR+pq/Q== + +"@swc/core-linux-arm64-musl@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-linux-arm64-musl/-/core-linux-arm64-musl-1.4.17.tgz#5bfe81eb23c905f04b669a7d2b060a147a263483" + integrity sha512-qhH4gr9gAlVk8MBtzXbzTP3BJyqbAfUOATGkyUtohh85fPXQYuzVlbExix3FZXTwFHNidGHY8C+ocscI7uDaYw== + +"@swc/core-linux-x64-gnu@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-linux-x64-gnu/-/core-linux-x64-gnu-1.4.17.tgz#a0c19bc9635e86ebd1c7f8e9e026503d1a1bf83d" + integrity sha512-vRDFATL1oN5oZMImkwbgSHEkp8xG1ofEASBypze01W1Tqto8t+yo6gsp69wzCZBlxldsvPpvFZW55Jq0Rn+UnA== + +"@swc/core-linux-x64-musl@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-linux-x64-musl/-/core-linux-x64-musl-1.4.17.tgz#2179b9536235a3b02a46997ddb1c178dfadf1667" + integrity sha512-zQNPXAXn3nmPqv54JVEN8k2JMEcMTQ6veVuU0p5O+A7KscJq+AGle/7ZQXzpXSfUCXlLMX4wvd+rwfGhh3J4cw== + +"@swc/core-win32-arm64-msvc@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-win32-arm64-msvc/-/core-win32-arm64-msvc-1.4.17.tgz#3004a431c836c6b16b4660ea2425dde467a8ee36" + integrity sha512-z86n7EhOwyzxwm+DLE5NoLkxCTme2lq7QZlDjbQyfCxOt6isWz8rkW5QowTX8w9Rdmk34ncrjSLvnHOeLY17+w== + +"@swc/core-win32-ia32-msvc@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-win32-ia32-msvc/-/core-win32-ia32-msvc-1.4.17.tgz#59155485d5307fb2a267e5acb215e0f440b6f48f" + integrity sha512-JBwuSTJIgiJJX6wtr4wmXbfvOswHFj223AumUrK544QV69k60FJ9q2adPW9Csk+a8wm1hLxq4HKa2K334UHJ/g== + +"@swc/core-win32-x64-msvc@1.4.17": + version "1.4.17" + resolved 
"https://registry.yarnpkg.com/@swc/core-win32-x64-msvc/-/core-win32-x64-msvc-1.4.17.tgz#b98f25fc277fb0e319f25f9fd00a82023662716b" + integrity sha512-jFkOnGQamtVDBm3MF5Kq1lgW8vx4Rm1UvJWRUfg+0gx7Uc3Jp3QMFeMNw/rDNQYRDYPG3yunCC+2463ycd5+dg== + +"@swc/core@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core/-/core-1.4.17.tgz#3ea4180fa5c54282b284006a6de1263ef1cf887f" + integrity sha512-tq+mdWvodMBNBBZbwFIMTVGYHe9N7zvEaycVVjfvAx20k1XozHbHhRv+9pEVFJjwRxLdXmtvFZd3QZHRAOpoNQ== dependencies: "@swc/counter" "^0.1.2" "@swc/types" "^0.1.5" optionalDependencies: - "@swc/core-darwin-arm64" "1.4.14" - "@swc/core-darwin-x64" "1.4.14" - "@swc/core-linux-arm-gnueabihf" "1.4.14" - "@swc/core-linux-arm64-gnu" "1.4.14" - "@swc/core-linux-arm64-musl" "1.4.14" - "@swc/core-linux-x64-gnu" "1.4.14" - "@swc/core-linux-x64-musl" "1.4.14" - "@swc/core-win32-arm64-msvc" "1.4.14" - "@swc/core-win32-ia32-msvc" "1.4.14" - "@swc/core-win32-x64-msvc" "1.4.14" + "@swc/core-darwin-arm64" "1.4.17" + "@swc/core-darwin-x64" "1.4.17" + "@swc/core-linux-arm-gnueabihf" "1.4.17" + "@swc/core-linux-arm64-gnu" "1.4.17" + "@swc/core-linux-arm64-musl" "1.4.17" + "@swc/core-linux-x64-gnu" "1.4.17" + "@swc/core-linux-x64-musl" "1.4.17" + "@swc/core-win32-arm64-msvc" "1.4.17" + "@swc/core-win32-ia32-msvc" "1.4.17" + "@swc/core-win32-x64-msvc" "1.4.17" "@swc/counter@^0.1.2", "@swc/counter@^0.1.3": version "0.1.3" From b102aebb0a9eb05e94544fd1f15f85f660fc89be Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 23:17:41 +0900 Subject: [PATCH 034/105] MINOR: [JS] Bump @typescript-eslint/parser from 7.7.0 to 7.8.0 in /js (#41522) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [@ typescript-eslint/parser](https://github.com/typescript-eslint/typescript-eslint/tree/HEAD/packages/parser) from 7.7.0 to 7.8.0.
Release notes

Sourced from @​typescript-eslint/parser's releases.

v7.8.0

7.8.0 (2024-04-29)

🚀 Features

  • rule-tester: assert suggestion messages are unique (#8995)
  • typescript-estree: add maximumDefaultProjectFileMatchCount and wide allowDefaultProjectForFiles glob restrictions (#8925)

🩹 Fixes

  • eslint-plugin: [no-unsafe-argument] handle tagged templates (#8746)
  • eslint-plugin: [prefer-optional-chain] suggests optional chaining during strict null equality check (#8717)
  • eslint-plugin: [consistent-type-assertions] handle tagged templates (#8993)
  • eslint-plugin: [no-unsafe-return] handle union types (#9001)
  • eslint-plugin: [no-unused-vars] clear error report range (#8640)
  • utils: export ESLint backwards-compat functions (#8976)

❤️ Thank You

You can read about our versioning strategy and releases on our website.

v7.7.1

7.7.1 (2024-04-22)

🩹 Fixes

  • eslint-plugin: [no-unsafe-assignment] handle shorthand property assignment (#8800)
  • eslint-plugin: [explicit-function-return-type] fix checking wrong ancestor's return type (#8809)
  • eslint-plugin: [prefer-optional-chain] only look at left operand for requireNullish (#8559)
  • eslint-plugin: [no-for-in-array] refine report location (#8874)
  • eslint-plugin: [no-unnecessary-type-assertion] allow non-null assertion for void type (#8912)

❤️ Thank You

You can read about our versioning strategy and releases on our website.

Changelog

Sourced from @​typescript-eslint/parser's changelog.

7.8.0 (2024-04-29)

This was a version bump only for parser to align it with other projects, there were no code changes.

You can read about our versioning strategy and releases on our website.

7.7.1 (2024-04-22)

This was a version bump only for parser to align it with other projects, there were no code changes.

You can read about our versioning strategy and releases on our website.

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=@ typescript-eslint/parser&package-manager=npm_and_yarn&previous-version=7.7.0&new-version=7.8.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- js/package.json | 2 +- js/yarn.lock | 53 +++++++++---------------------------------------- 2 files changed, 10 insertions(+), 45 deletions(-) diff --git a/js/package.json b/js/package.json index 08f2fe83d13e8..f6d3c70fca6aa 100644 --- a/js/package.json +++ b/js/package.json @@ -73,7 +73,7 @@ "@types/jest": "29.5.12", "@types/multistream": "4.1.3", "@typescript-eslint/eslint-plugin": "7.8.0", - "@typescript-eslint/parser": "7.7.0", + "@typescript-eslint/parser": "7.8.0", "async-done": "2.0.0", "benny": "3.7.1", "cross-env": "7.0.3", diff --git a/js/yarn.lock b/js/yarn.lock index 47674bd8b2168..9885be2ba0643 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -1453,15 +1453,15 @@ semver "^7.6.0" ts-api-utils "^1.3.0" -"@typescript-eslint/parser@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/parser/-/parser-7.7.0.tgz#6b1b3ce76c5de002c43af8ae933613b0f2b4bcc6" - integrity sha512-fNcDm3wSwVM8QYL4HKVBggdIPAy9Q41vcvC/GtDobw3c4ndVT3K6cqudUmjHPw8EAp4ufax0o58/xvWaP2FmTg== - dependencies: - "@typescript-eslint/scope-manager" "7.7.0" - "@typescript-eslint/types" "7.7.0" - "@typescript-eslint/typescript-estree" "7.7.0" - "@typescript-eslint/visitor-keys" "7.7.0" +"@typescript-eslint/parser@7.8.0": + version "7.8.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/parser/-/parser-7.8.0.tgz#1e1db30c8ab832caffee5f37e677dbcb9357ddc8" + integrity sha512-KgKQly1pv0l4ltcftP59uQZCi4HUYswCLbTqVZEJu7uLX8CTLyswqMLqLN+2QFz4jCptqWVV4SB7vdxcH2+0kQ== + dependencies: + "@typescript-eslint/scope-manager" "7.8.0" + "@typescript-eslint/types" "7.8.0" + "@typescript-eslint/typescript-estree" "7.8.0" + "@typescript-eslint/visitor-keys" "7.8.0" debug "^4.3.4" "@typescript-eslint/scope-manager@5.62.0": @@ -1472,14 +1472,6 @@ "@typescript-eslint/types" "5.62.0" "@typescript-eslint/visitor-keys" "5.62.0" -"@typescript-eslint/scope-manager@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/scope-manager/-/scope-manager-7.7.0.tgz#3f0db079b275bb8b0cb5be7613fb3130cfb5de77" - integrity sha512-/8INDn0YLInbe9Wt7dK4cXLDYp0fNHP5xKLHvZl3mOT5X17rK/YShXaiNmorl+/U4VKCVIjJnx4Ri5b0y+HClw== - dependencies: - "@typescript-eslint/types" "7.7.0" - "@typescript-eslint/visitor-keys" "7.7.0" - "@typescript-eslint/scope-manager@7.8.0": version "7.8.0" resolved "https://registry.yarnpkg.com/@typescript-eslint/scope-manager/-/scope-manager-7.8.0.tgz#bb19096d11ec6b87fb6640d921df19b813e02047" @@ -1503,11 +1495,6 @@ resolved "https://registry.yarnpkg.com/@typescript-eslint/types/-/types-5.62.0.tgz#258607e60effa309f067608931c3df6fed41fd2f" integrity sha512-87NVngcbVXUahrRTqIK27gD2t5Cu1yuCXxbLcFtCzZGlfyVWWh8mLHkoxzjsB6DDNnvdL+fW8MiwPEJyGJQDgQ== -"@typescript-eslint/types@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/types/-/types-7.7.0.tgz#23af4d24bf9ce15d8d301236e3e3014143604f27" - integrity sha512-G01YPZ1Bd2hn+KPpIbrAhEWOn5lQBrjxkzHkWvP6NucMXFtfXoevK82hzQdpfuQYuhkvFDeQYbzXCjR1z9Z03w== - "@typescript-eslint/types@7.8.0": version "7.8.0" resolved "https://registry.yarnpkg.com/@typescript-eslint/types/-/types-7.8.0.tgz#1fd2577b3ad883b769546e2d1ef379f929a7091d" @@ -1526,20 +1513,6 @@ semver "^7.3.7" tsutils "^3.21.0" -"@typescript-eslint/typescript-estree@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/typescript-estree/-/typescript-estree-7.7.0.tgz#b5dd6383b4c6a852d7b256a37af971e8982be97f" - 
integrity sha512-8p71HQPE6CbxIBy2kWHqM1KGrC07pk6RJn40n0DSc6bMOBBREZxSDJ+BmRzc8B5OdaMh1ty3mkuWRg4sCFiDQQ== - dependencies: - "@typescript-eslint/types" "7.7.0" - "@typescript-eslint/visitor-keys" "7.7.0" - debug "^4.3.4" - globby "^11.1.0" - is-glob "^4.0.3" - minimatch "^9.0.4" - semver "^7.6.0" - ts-api-utils "^1.3.0" - "@typescript-eslint/typescript-estree@7.8.0": version "7.8.0" resolved "https://registry.yarnpkg.com/@typescript-eslint/typescript-estree/-/typescript-estree-7.8.0.tgz#b028a9226860b66e623c1ee55cc2464b95d2987c" @@ -1589,14 +1562,6 @@ "@typescript-eslint/types" "5.62.0" eslint-visitor-keys "^3.3.0" -"@typescript-eslint/visitor-keys@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/visitor-keys/-/visitor-keys-7.7.0.tgz#950148cf1ac11562a2d903fdf7acf76714a2dc9e" - integrity sha512-h0WHOj8MhdhY8YWkzIF30R379y0NqyOHExI9N9KCzvmu05EgG4FumeYa3ccfKUSphyWkWQE1ybVrgz/Pbam6YA== - dependencies: - "@typescript-eslint/types" "7.7.0" - eslint-visitor-keys "^3.4.3" - "@typescript-eslint/visitor-keys@7.8.0": version "7.8.0" resolved "https://registry.yarnpkg.com/@typescript-eslint/visitor-keys/-/visitor-keys-7.8.0.tgz#7285aab991da8bee411a42edbd5db760d22fdd91" From 5959024e9c1094c096dee569c0a5016496b24b58 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 23:18:13 +0900 Subject: [PATCH 035/105] MINOR: [JS] Bump eslint-plugin-jest from 27.9.0 to 28.4.0 in /js (#41524) Bumps [eslint-plugin-jest](https://github.com/jest-community/eslint-plugin-jest) from 27.9.0 to 28.4.0.
Release notes

Sourced from eslint-plugin-jest's releases.

v28.4.0

28.4.0 (2024-05-03)

Features

  • valid-expect: supporting automatically fixing missing await in some cases (#1574) (a407098)

v28.3.0

28.3.0 (2024-04-27)

Features

  • prefer importing jest globals for specific types (#1568) (c464ae3)

v28.2.0

28.2.0 (2024-04-06)

Features

  • support providing aliases for @ jest/globals package (#1543) (744d4f6)

v28.1.1

28.1.1 (2024-04-06)

Bug Fixes

  • max-expects: properly reset counter when exiting a test case (#1550) (b4b7cbc)

v28.1.0

28.1.0 (2024-04-06)

Features

v28.0.0

28.0.0 (2024-04-06)

Bug Fixes

  • allow ESLint 9 as peer dependency (#1547) (3c5e167)
  • drop support for Node 19 (#1548) (c87e388)
  • no-large-snapshots: avoid instanceof RegExp check for ESLint v9 compatibility (#1542) (af4a9c9)

... (truncated)

Changelog

Sourced from eslint-plugin-jest's changelog.

28.4.0 (2024-05-03)

Features

  • valid-expect: supporting automatically fixing missing await in some cases (#1574) (a407098)

28.3.0 (2024-04-27)

Features

  • prefer importing jest globals for specific types (#1568) (c464ae3)

28.2.0 (2024-04-06)

Features

  • support providing aliases for @ jest/globals package (#1543) (744d4f6)

28.1.1 (2024-04-06)

Bug Fixes

  • max-expects: properly reset counter when exiting a test case (#1550) (b4b7cbc)

28.1.0 (2024-04-06)

Features

28.0.0 (2024-04-06)

Bug Fixes

  • allow ESLint 9 as peer dependency (#1547) (3c5e167)
  • drop support for Node 19 (#1548) (c87e388)
  • no-large-snapshots: avoid instanceof RegExp check for ESLint v9 compatibility (#1542) (af4a9c9)

Features

... (truncated)

Commits
  • 27f7e74 chore(release): 28.4.0 [skip ci]
  • a407098 feat(valid-expect): supporting automatically fixing missing await in some c...
  • f47cc3c refactor: remove unneeded as consts (#1578)
  • 6c1f921 refactor(prefer-lowercase-title): remove unneeded cast (#1577)
  • aac5f03 refactor(prefer-importing-jest-globals): use AST_NODE_TYPES constant instea...
  • df3202f chore(deps): update yarn to v3.8.2 (#1575)
  • 8001fe7 chore(deps): lock file maintenance
  • bd6b918 chore(release): 28.3.0 [skip ci]
  • c464ae3 feat: prefer importing jest globals for specific types (#1568)
  • 2f21f33 refactor(expect-expect): remove unneeded array (#1571)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=eslint-plugin-jest&package-manager=npm_and_yarn&previous-version=27.9.0&new-version=28.4.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- js/package.json | 2 +- js/yarn.lock | 115 +++++++++++++++++++++++------------------------- 2 files changed, 56 insertions(+), 61 deletions(-) diff --git a/js/package.json b/js/package.json index f6d3c70fca6aa..7ed0daddfada0 100644 --- a/js/package.json +++ b/js/package.json @@ -82,7 +82,7 @@ "esbuild": "0.20.2", "esbuild-plugin-alias": "0.2.1", "eslint": "8.57.0", - "eslint-plugin-jest": "27.9.0", + "eslint-plugin-jest": "28.4.0", "eslint-plugin-unicorn": "52.0.0", "esm": "https://github.com/jsg2021/esm/releases/download/v3.x.x-pr883/esm-3.x.x-pr883.tgz", "gulp": "4.0.2", diff --git a/js/yarn.lock b/js/yarn.lock index 9885be2ba0643..dbf79115d6412 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -1348,7 +1348,7 @@ expect "^29.0.0" pretty-format "^29.0.0" -"@types/json-schema@*", "@types/json-schema@^7.0.15", "@types/json-schema@^7.0.8", "@types/json-schema@^7.0.9": +"@types/json-schema@*", "@types/json-schema@^7.0.12", "@types/json-schema@^7.0.15", "@types/json-schema@^7.0.8": version "7.0.15" resolved "https://registry.yarnpkg.com/@types/json-schema/-/json-schema-7.0.15.tgz#596a1747233694d50f6ad8a7869fcb6f56cf5841" integrity sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA== @@ -1399,7 +1399,7 @@ resolved "https://registry.yarnpkg.com/@types/resolve/-/resolve-1.20.2.tgz#97d26e00cd4a0423b4af620abecf3e6f442b7975" integrity sha512-60BCwRFOZCQhDncwQdxxeOEEkbc5dIMccYLwbxsS4TUNeVECQ/pBJ0j09mrHOl/JJvpRPGwO9SvE4nR2Nb/a4Q== -"@types/semver@^7.3.12", "@types/semver@^7.5.8": +"@types/semver@^7.5.0", "@types/semver@^7.5.8": version "7.5.8" resolved "https://registry.yarnpkg.com/@types/semver/-/semver-7.5.8.tgz#8268a8c57a3e4abd25c165ecd36237db7948a55e" integrity sha512-I8EUhyrgfLrcTkzV3TSsGyl1tSuPrEDzr0yd5m90UgNxQkyDXULk3b6MlQqTCpZpNtWe1K0hzclnZkTcLBe2UQ== @@ -1464,13 +1464,13 @@ "@typescript-eslint/visitor-keys" "7.8.0" debug "^4.3.4" -"@typescript-eslint/scope-manager@5.62.0": - version "5.62.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/scope-manager/-/scope-manager-5.62.0.tgz#d9457ccc6a0b8d6b37d0eb252a23022478c5460c" - integrity sha512-VXuvVvZeQCQb5Zgf4HAxc04q5j+WrNAtNh9OwCsCgpKqESMTu3tF/jhZ3xG6T4NZwWl65Bg8KuS2uEvhSfLl0w== +"@typescript-eslint/scope-manager@6.21.0": + version "6.21.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/scope-manager/-/scope-manager-6.21.0.tgz#ea8a9bfc8f1504a6ac5d59a6df308d3a0630a2b1" + integrity sha512-OwLUIWZJry80O99zvqXVEioyniJMa+d2GrqpUTqi5/v5D5rOrppJVBPa0yKCblcigC0/aYAzxxqQ1B+DS2RYsg== dependencies: - "@typescript-eslint/types" "5.62.0" - "@typescript-eslint/visitor-keys" "5.62.0" + "@typescript-eslint/types" "6.21.0" + "@typescript-eslint/visitor-keys" "6.21.0" "@typescript-eslint/scope-manager@7.8.0": version "7.8.0" @@ -1490,28 +1490,29 @@ debug "^4.3.4" ts-api-utils "^1.3.0" -"@typescript-eslint/types@5.62.0": - version "5.62.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/types/-/types-5.62.0.tgz#258607e60effa309f067608931c3df6fed41fd2f" - integrity sha512-87NVngcbVXUahrRTqIK27gD2t5Cu1yuCXxbLcFtCzZGlfyVWWh8mLHkoxzjsB6DDNnvdL+fW8MiwPEJyGJQDgQ== +"@typescript-eslint/types@6.21.0": + version "6.21.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/types/-/types-6.21.0.tgz#205724c5123a8fef7ecd195075fa6e85bac3436d" + integrity sha512-1kFmZ1rOm5epu9NZEZm1kckCDGj5UJEf7P1kliH4LKu/RkwpsfqqGmY2OOcUs18lSlQBKLDYBOGxRVtrMN5lpg== 
"@typescript-eslint/types@7.8.0": version "7.8.0" resolved "https://registry.yarnpkg.com/@typescript-eslint/types/-/types-7.8.0.tgz#1fd2577b3ad883b769546e2d1ef379f929a7091d" integrity sha512-wf0peJ+ZGlcH+2ZS23aJbOv+ztjeeP8uQ9GgwMJGVLx/Nj9CJt17GWgWWoSmoRVKAX2X+7fzEnAjxdvK2gqCLw== -"@typescript-eslint/typescript-estree@5.62.0": - version "5.62.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/typescript-estree/-/typescript-estree-5.62.0.tgz#7d17794b77fabcac615d6a48fb143330d962eb9b" - integrity sha512-CmcQ6uY7b9y694lKdRB8FEel7JbU/40iSAPomu++SjLMntB+2Leay2LO6i8VnJk58MtE9/nQSFIH6jpyRWyYzA== +"@typescript-eslint/typescript-estree@6.21.0": + version "6.21.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/typescript-estree/-/typescript-estree-6.21.0.tgz#c47ae7901db3b8bddc3ecd73daff2d0895688c46" + integrity sha512-6npJTkZcO+y2/kr+z0hc4HwNfrrP4kNYh57ek7yCNlrBjWQ1Y0OS7jiZTkgumrvkX5HkEKXFZkkdFNkaW2wmUQ== dependencies: - "@typescript-eslint/types" "5.62.0" - "@typescript-eslint/visitor-keys" "5.62.0" + "@typescript-eslint/types" "6.21.0" + "@typescript-eslint/visitor-keys" "6.21.0" debug "^4.3.4" globby "^11.1.0" is-glob "^4.0.3" - semver "^7.3.7" - tsutils "^3.21.0" + minimatch "9.0.3" + semver "^7.5.4" + ts-api-utils "^1.0.1" "@typescript-eslint/typescript-estree@7.8.0": version "7.8.0" @@ -1540,27 +1541,26 @@ "@typescript-eslint/typescript-estree" "7.8.0" semver "^7.6.0" -"@typescript-eslint/utils@^5.10.0": - version "5.62.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/utils/-/utils-5.62.0.tgz#141e809c71636e4a75daa39faed2fb5f4b10df86" - integrity sha512-n8oxjeb5aIbPFEtmQxQYOLI0i9n5ySBEY/ZEHHZqKQSFnxio1rv6dthascc9dLuwrL0RC5mPCxB7vnAVGAYWAQ== +"@typescript-eslint/utils@^6.0.0": + version "6.21.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/utils/-/utils-6.21.0.tgz#4714e7a6b39e773c1c8e97ec587f520840cd8134" + integrity sha512-NfWVaC8HP9T8cbKQxHcsJBY5YE1O33+jpMwN45qzWWaPDZgLIbo12toGMWnmhvCpd3sIxkpDw3Wv1B3dYrbDQQ== dependencies: - "@eslint-community/eslint-utils" "^4.2.0" - "@types/json-schema" "^7.0.9" - "@types/semver" "^7.3.12" - "@typescript-eslint/scope-manager" "5.62.0" - "@typescript-eslint/types" "5.62.0" - "@typescript-eslint/typescript-estree" "5.62.0" - eslint-scope "^5.1.1" - semver "^7.3.7" - -"@typescript-eslint/visitor-keys@5.62.0": - version "5.62.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/visitor-keys/-/visitor-keys-5.62.0.tgz#2174011917ce582875954ffe2f6912d5931e353e" - integrity sha512-07ny+LHRzQXepkGg6w0mFY41fVUNBrL2Roj/++7V1txKugfjm/Ci/qSND03r2RhlJhJYMcTn9AhhSSqQp0Ysyw== - dependencies: - "@typescript-eslint/types" "5.62.0" - eslint-visitor-keys "^3.3.0" + "@eslint-community/eslint-utils" "^4.4.0" + "@types/json-schema" "^7.0.12" + "@types/semver" "^7.5.0" + "@typescript-eslint/scope-manager" "6.21.0" + "@typescript-eslint/types" "6.21.0" + "@typescript-eslint/typescript-estree" "6.21.0" + semver "^7.5.4" + +"@typescript-eslint/visitor-keys@6.21.0": + version "6.21.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/visitor-keys/-/visitor-keys-6.21.0.tgz#87a99d077aa507e20e238b11d56cc26ade45fe47" + integrity sha512-JJtkDduxLi9bivAB+cYOVMtbkqdPOhZ+ZI5LC47MIRrDV4Yn2o+ZnW10Nkmr28xRpSpdJ6Sm42Hjf2+REYXm0A== + dependencies: + "@typescript-eslint/types" "6.21.0" + eslint-visitor-keys "^3.4.1" "@typescript-eslint/visitor-keys@7.8.0": version "7.8.0" @@ -3050,12 +3050,12 @@ escape-string-regexp@^4.0.0: resolved 
"https://registry.yarnpkg.com/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz#14ba83a5d373e3d311e5afca29cf5bfad965bf34" integrity sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA== -eslint-plugin-jest@27.9.0: - version "27.9.0" - resolved "https://registry.yarnpkg.com/eslint-plugin-jest/-/eslint-plugin-jest-27.9.0.tgz#7c98a33605e1d8b8442ace092b60e9919730000b" - integrity sha512-QIT7FH7fNmd9n4se7FFKHbsLKGQiw885Ds6Y/sxKgCZ6natwCsXdgPOADnYVxN2QrRweF0FZWbJ6S7Rsn7llug== +eslint-plugin-jest@28.4.0: + version "28.4.0" + resolved "https://registry.yarnpkg.com/eslint-plugin-jest/-/eslint-plugin-jest-28.4.0.tgz#213be88f799a35ca9d63ce1a30081bb32b8da765" + integrity sha512-ORVHiFPC8RQxHLyQJ37MxNilK9k+cPzjHz65T8gAbpYZunGutXvKqwfM3WXBCvFDF1QBeYJJu9LB/i5cuXBs+g== dependencies: - "@typescript-eslint/utils" "^5.10.0" + "@typescript-eslint/utils" "^6.0.0" eslint-plugin-unicorn@52.0.0: version "52.0.0" @@ -3079,7 +3079,7 @@ eslint-plugin-unicorn@52.0.0: semver "^7.5.4" strip-indent "^3.0.0" -eslint-scope@5.1.1, eslint-scope@^5.1.1: +eslint-scope@5.1.1: version "5.1.1" resolved "https://registry.yarnpkg.com/eslint-scope/-/eslint-scope-5.1.1.tgz#e786e59a66cb92b3f6c1fb0d508aab174848f48c" integrity sha512-2NxwbF/hZ0KpepYN0cNbo+FN6XoK7GaHlQhgx/hIZl6Va0bF45RQOOwhLIy8lQDbuCiadSLCBnH2CFYquit5bw== @@ -5299,6 +5299,13 @@ min-indent@^1.0.0, min-indent@^1.0.1: resolved "https://registry.yarnpkg.com/min-indent/-/min-indent-1.0.1.tgz#a63f681673b30571fbe8bc25686ae746eefa9869" integrity sha512-I9jwMn07Sy/IwOj3zVkVik2JTvgpaykDZEigL6Rx6N9LbMywwUSMtxET+7lVoDLLd3O3IXwJwvuuns8UB/HeAg== +minimatch@9.0.3: + version "9.0.3" + resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-9.0.3.tgz#a6e00c3de44c3a542bfaae70abfc22420a6da825" + integrity sha512-RHiac9mvaRw0x3AYRgDC1CxAP7HTcNrrECeA8YYJeWnpo+2Q5CegtZjaotWTWxDG3UeGA1coE05iH1mPjT/2mg== + dependencies: + brace-expansion "^2.0.1" + minimatch@^3.0.4, minimatch@^3.0.5, minimatch@^3.1.1, minimatch@^3.1.2: version "3.1.2" resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.1.2.tgz#19cd194bfd3e428f049a70817c038d89ab4be35b" @@ -6278,7 +6285,7 @@ semver@^6.3.0, semver@^6.3.1: resolved "https://registry.yarnpkg.com/semver/-/semver-6.3.1.tgz#556d2ef8689146e46dcea4bfdd095f3434dffcb4" integrity sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA== -semver@^7.3.4, semver@^7.3.7, semver@^7.5.3, semver@^7.5.4, semver@^7.6.0: +semver@^7.3.4, semver@^7.5.3, semver@^7.5.4, semver@^7.6.0: version "7.6.0" resolved "https://registry.yarnpkg.com/semver/-/semver-7.6.0.tgz#1a46a4db4bffcccd97b743b5005c8325f23d4e2d" integrity sha512-EnwXhrlwXMk9gKu5/flx5sv/an57AkRplG3hTK68W7FRDN+k+OWBj65M7719OkA82XLBxrcX0KSHj+X5COhOVg== @@ -6916,7 +6923,7 @@ trim-newlines@^4.0.2: resolved "https://registry.yarnpkg.com/trim-newlines/-/trim-newlines-4.1.1.tgz#28c88deb50ed10c7ba6dc2474421904a00139125" integrity sha512-jRKj0n0jXWo6kh62nA5TEh3+4igKDXLvzBJcPpiizP7oOolUrYIxmVBG9TOtHYFHoddUk6YvAkGeGoSVTXfQXQ== -ts-api-utils@^1.3.0: +ts-api-utils@^1.0.1, ts-api-utils@^1.3.0: version "1.3.0" resolved "https://registry.yarnpkg.com/ts-api-utils/-/ts-api-utils-1.3.0.tgz#4b490e27129f1e8e686b45cc4ab63714dc60eea1" integrity sha512-UQMIo7pb8WRomKR1/+MFVLTroIvDVtMX3K6OUir8ynLyzB8Jeriont2bTAtmNPa1ekAgN7YPDyf6V+ygrdU+eQ== @@ -6954,23 +6961,11 @@ ts-node@10.9.2: v8-compile-cache-lib "^3.0.1" yn "3.1.1" -tslib@^1.8.1: - version "1.14.1" - resolved 
"https://registry.yarnpkg.com/tslib/-/tslib-1.14.1.tgz#cf2d38bdc34a134bcaf1091c41f6619e2f672d00" - integrity sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg== - tslib@^2.0.0, tslib@^2.1.0, tslib@^2.3.0, tslib@^2.4.0, tslib@^2.6.2: version "2.6.2" resolved "https://registry.yarnpkg.com/tslib/-/tslib-2.6.2.tgz#703ac29425e7b37cd6fd456e92404d46d1f3e4ae" integrity sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q== -tsutils@^3.21.0: - version "3.21.0" - resolved "https://registry.yarnpkg.com/tsutils/-/tsutils-3.21.0.tgz#b48717d394cea6c1e096983eed58e9d61715b623" - integrity sha512-mHKK3iUXL+3UF6xL5k0PEhKRUBKPBCv/+RkEOpjRWxxx27KKRBmmA60A9pgOUvMi8GKhRMPEmjBRPzs2W7O1OA== - dependencies: - tslib "^1.8.1" - type-check@^0.4.0, type-check@~0.4.0: version "0.4.0" resolved "https://registry.yarnpkg.com/type-check/-/type-check-0.4.0.tgz#07b8203bfa7056c0657050e3ccd2c37730bab8f1" From c8bf9753d7423c3020ac66b97caf2099cda35110 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 23:18:46 +0900 Subject: [PATCH 036/105] MINOR: [JS] Bump @types/node from 20.12.7 to 20.12.8 in /js (#41526) Bumps [@ types/node](https://github.com/DefinitelyTyped/DefinitelyTyped/tree/HEAD/types/node) from 20.12.7 to 20.12.8.
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=@ types/node&package-manager=npm_and_yarn&previous-version=20.12.7&new-version=20.12.8)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- js/yarn.lock | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/js/yarn.lock b/js/yarn.lock index dbf79115d6412..abde21f603382 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -1370,10 +1370,10 @@ dependencies: "@types/node" "*" -"@types/node@*": - version "20.12.3" - resolved "https://registry.yarnpkg.com/@types/node/-/node-20.12.3.tgz#d6658c2c7776c1cad93534bb45428195ed840c65" - integrity sha512-sD+ia2ubTeWrOu+YMF+MTAB7E+O7qsMqAbMfW7DG3K1URwhZ5hN1pLlRVGbf4wDFzSfikL05M17EyorS86jShw== +"@types/node@*", "@types/node@^20.12.7": + version "20.12.8" + resolved "https://registry.yarnpkg.com/@types/node/-/node-20.12.8.tgz#35897bf2bfe3469847ab04634636de09552e8256" + integrity sha512-NU0rJLJnshZWdE/097cdCBbyW1h4hEg0xpovcoAQYHl8dnEyp/NAOiE45pvc+Bd1Dt+2r94v2eGFpQJ4R7g+2w== dependencies: undici-types "~5.26.4" @@ -1382,13 +1382,6 @@ resolved "https://registry.yarnpkg.com/@types/node/-/node-13.13.52.tgz#03c13be70b9031baaed79481c0c0cfb0045e53f7" integrity sha512-s3nugnZumCC//n4moGGe6tkNMyYEdaDBitVjwPxXmR5lnMG5dHePinH2EdxkG3Rh1ghFHHixAG4NJhpJW1rthQ== -"@types/node@^20.12.7": - version "20.12.7" - resolved "https://registry.yarnpkg.com/@types/node/-/node-20.12.7.tgz#04080362fa3dd6c5822061aa3124f5c152cff384" - integrity sha512-wq0cICSkRLVaf3UGLMGItu/PtdY7oaXaI/RVU+xliKVOtRna3PRY57ZDfztpDL0n11vfymMUnXv8QwYCO7L1wg== - dependencies: - undici-types "~5.26.4" - "@types/normalize-package-data@^2.4.0": version "2.4.4" resolved "https://registry.yarnpkg.com/@types/normalize-package-data/-/normalize-package-data-2.4.4.tgz#56e2cc26c397c038fab0e3a917a12d5c5909e901" From cc9e65fb80db2e2d706b1776c52a88bb1c983533 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 07:58:31 -0700 Subject: [PATCH 037/105] MINOR: [C#] Bump Grpc.Tools from 2.62.0 to 2.63.0 in /csharp (#41523) Bumps [Grpc.Tools](https://github.com/grpc/grpc) from 2.62.0 to 2.63.0.
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=Grpc.Tools&package-manager=nuget&previous-version=2.62.0&new-version=2.63.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj | 2 +- csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj b/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj index 9a3cf190cc376..7314b8207fef6 100644 --- a/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj +++ b/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj @@ -5,7 +5,7 @@ - + diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index 04b8a7dc734f0..780da3ad39081 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -7,7 +7,7 @@ - + From 56437409d1f99852a6b9486b1620c3ed12ff3e5c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 12:36:02 -0400 Subject: [PATCH 038/105] MINOR: [Go] Bump github.com/hamba/avro/v2 from 2.20.1 to 2.21.1 in /go (#41512) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [github.com/hamba/avro/v2](https://github.com/hamba/avro) from 2.20.1 to 2.21.1.
Release notes

Sourced from github.com/hamba/avro/v2's releases.

v2.21.1

What's Changed

New Contributors

Full Changelog: https://github.com/hamba/avro/compare/v2.21.0...v2.21.1

v2.21.0

What's Changed

New Contributors

Full Changelog: https://github.com/hamba/avro/compare/v2.20.1...v2.21.0

Commits
  • 5dde47b fix: support 32bit builds (#390)
  • ad836ba chore: bump golangci/golangci-lint-action from 4 to 5 in the all group (#388)
  • e42dea1 fix: Union Decoder uses readInt (#387)
  • 2461d45 fix: reader int/long setting head > tail (#385)
  • 84f9b10 fix: readByte returns errors on unexpected EOF (#383)
  • 141e857 fix: reader returns errors on unexpected EOF (#382)
  • f138d7f fix: handle short read errors on arrays and maps (#379)
  • b43fe48 feat: add max slice alloc size config (#376)
  • 0b21284 Check for max allocation (#374)
  • 7a2eb5f feat: support slices for nullable unions (#372)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=github.com/hamba/avro/v2&package-manager=go_modules&previous-version=2.20.1&new-version=2.21.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Matt Topol --- go/go.mod | 2 +- go/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/go.mod b/go/go.mod index 79c3cc3981231..547127a470a61 100644 --- a/go/go.mod +++ b/go/go.mod @@ -47,7 +47,7 @@ require ( require ( github.com/google/uuid v1.6.0 - github.com/hamba/avro/v2 v2.20.1 + github.com/hamba/avro/v2 v2.21.1 github.com/substrait-io/substrait-go v0.4.2 github.com/tidwall/sjson v1.2.5 ) diff --git a/go/go.sum b/go/go.sum index e8c2fde15181a..29490b9da2ca3 100644 --- a/go/go.sum +++ b/go/go.sum @@ -43,8 +43,8 @@ github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26 h1:Xim43kblpZXfIBQsbu github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26/go.mod h1:dDKJzRmX4S37WGHujM7tX//fmj1uioxKzKxz3lo4HJo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/hamba/avro/v2 v2.20.1 h1:3WByQiVn7wT7d27WQq6pvBRC00FVOrniP6u67FLA/2E= -github.com/hamba/avro/v2 v2.20.1/go.mod h1:xHiKXbISpb3Ovc809XdzWow+XGTn+Oyf/F9aZbTLAig= +github.com/hamba/avro/v2 v2.21.1 h1:400/jTdLWQ3ib58y83VXlTJKijRouYQszY1SO0cMGt4= +github.com/hamba/avro/v2 v2.21.1/go.mod h1:ouJ4PkiAEP49u0lAtQyd5Gv04MehKj+7lXwD3zpLpY0= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= From 2b0647230536ffb2fd2d59af11acdb4674ed44c3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 12:36:25 -0400 Subject: [PATCH 039/105] MINOR: [Go] Bump google.golang.org/protobuf from 1.33.0 to 1.34.0 in /go (#41513) Bumps google.golang.org/protobuf from 1.33.0 to 1.34.0. [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=google.golang.org/protobuf&package-manager=go_modules&previous-version=1.33.0&new-version=1.34.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Matt Topol --- go/go.mod | 2 +- go/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/go.mod b/go/go.mod index 547127a470a61..35fd9b9915c0b 100644 --- a/go/go.mod +++ b/go/go.mod @@ -41,7 +41,7 @@ require ( golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 gonum.org/v1/gonum v0.15.0 google.golang.org/grpc v1.63.2 - google.golang.org/protobuf v1.33.0 + google.golang.org/protobuf v1.34.0 modernc.org/sqlite v1.29.6 ) diff --git a/go/go.sum b/go/go.sum index 29490b9da2ca3..bf33fed6c4c97 100644 --- a/go/go.sum +++ b/go/go.sum @@ -138,8 +138,8 @@ google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de h1: google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de/go.mod h1:H4O17MA/PE9BsGx3w+a+W2VOLLD1Qf7oJneAoU6WktY= google.golang.org/grpc v1.63.2 h1:MUeiw1B2maTVZthpU5xvASfTh3LDbxHd6IJ6QQVU+xM= google.golang.org/grpc v1.63.2/go.mod h1:WAX/8DgncnokcFUldAxq7GeB5DXHDbMF+lLvDomNkRA= -google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= -google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= +google.golang.org/protobuf v1.34.0 h1:Qo/qEd2RZPCf2nKuorzksSknv0d3ERwp1vFG38gSmH4= +google.golang.org/protobuf v1.34.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= From 493d456d8c6eff21659dc87a2dee32abd0be5ffd Mon Sep 17 00:00:00 2001 From: Sarah Gilmore <74676073+sgilmore10@users.noreply.github.com> Date: Fri, 3 May 2024 14:43:51 -0400 Subject: [PATCH 040/105] GH-41507: [MATLAB][CI] Pass `strict: true` to `matlab-actions/run-tests@v2` (#41530) ### Rationale for this change The MATLAB CI jobs should fail if any one of the unit tests issues a `warning`. Currently, the MATLAB CI jobs only fail if there is a verification failure. Passing the argument `strict: true` to `matlab-actions/run-tests@ v2` will ensure MATLAB jobs will fail if a test warning is issued. See the [`matlab-actions/run-tests@ v2` documentation](https://github.com/matlab-actions/run-tests/?tab=readme-ov-file#run-matlab-tests) for more details. ### What changes are included in this PR? 1. Pass `strict: true` argument to `matlab-actions/setup-matlab@ v2` ### Are these changes tested? N/A (relying on existing tests). ### Are there any user-facing changes? No. 
* GitHub Issue: #41507 Authored-by: Sarah Gilmore Signed-off-by: Sarah Gilmore --- .github/workflows/matlab.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml index 8a0de8a365661..2ae33d1e8d6c6 100644 --- a/.github/workflows/matlab.yml +++ b/.github/workflows/matlab.yml @@ -96,6 +96,7 @@ jobs: uses: matlab-actions/run-tests@v2 with: select-by-folder: matlab/test + strict: true macos: name: AMD64 macOS 12 MATLAB runs-on: macos-12 @@ -135,7 +136,8 @@ jobs: MATLABPATH: matlab/install/arrow_matlab uses: matlab-actions/run-tests@v2 with: - select-by-folder: matlab/test + select-by-folder: matlab/test + strict: true windows: name: AMD64 Windows 2022 MATLAB runs-on: windows-2022 @@ -181,4 +183,5 @@ jobs: MATLABPATH: matlab/install/arrow_matlab uses: matlab-actions/run-tests@v2 with: - select-by-folder: matlab/test + select-by-folder: matlab/test + strict: true From e7f5f810ac1235ee835a6ec62fb25d2f05e5d542 Mon Sep 17 00:00:00 2001 From: Sarah Gilmore <74676073+sgilmore10@users.noreply.github.com> Date: Fri, 3 May 2024 14:46:01 -0400 Subject: [PATCH 041/105] GH-41531: [MATLAB][Packaging] Bump `matlab-actions/setup-matlab` and `matlab-actions/run-command` from `v1` to `v2` in the `crossbow` job (#41532) ### Rationale for this change I noticed `dev/tasks/matlab/github.yml` is still using `matlab-actions/setup-matlab@ v1`, which is no longer supported. See this [log](https://github.com/ursacomputing/crossbow/actions/runs/8928507510/job/24524230971#step:4:11) file. We should use `matlab-actions/setup-matlab@ v2` instead. We should also use `matlab-actions/run-command@ v2` instead of `matlab-actions/run-command@ v1`. ### What changes are included in this PR? 1. Replaced `matlab-actions/setup-matlab@ v1` with `matlab-actions/setup-matlab@ v2` in `dev/tasks/matlab/github.yml` 2. Replaced `matlab-actions/run-command@ v1` with `matlab-actions/run-command@ v2` in `dev/tasks/matlab/github.yml` ### Are these changes tested? N/A (I will trigger a crossbow job to verify the packaging workflow works as expected still). ### Are there any user-facing changes? No. 
* GitHub Issue: #41531 Authored-by: Sarah Gilmore Signed-off-by: Sarah Gilmore --- dev/tasks/matlab/github.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dev/tasks/matlab/github.yml b/dev/tasks/matlab/github.yml index 7840fd176705c..963c85f6e11bf 100644 --- a/dev/tasks/matlab/github.yml +++ b/dev/tasks/matlab/github.yml @@ -29,7 +29,7 @@ jobs: - name: Install ninja-build run: sudo apt-get update && sudo apt-get install ninja-build - name: Install MATLAB - uses: matlab-actions/setup-matlab@v1 + uses: matlab-actions/setup-matlab@v2 with: release: R2024a - name: Build MATLAB Interface @@ -66,7 +66,7 @@ jobs: - name: Install ninja-build run: brew install ninja - name: Install MATLAB - uses: matlab-actions/setup-matlab@v1 + uses: matlab-actions/setup-matlab@v2 with: release: R2024a - name: Build MATLAB Interface @@ -101,7 +101,7 @@ jobs: steps: {{ macros.github_checkout_arrow()|indent }} - name: Install MATLAB - uses: matlab-actions/setup-matlab@v1 + uses: matlab-actions/setup-matlab@v2 with: release: R2024a - name: Install sccache @@ -147,7 +147,7 @@ jobs: cp arrow/LICENSE.txt arrow/matlab/install/arrow_matlab/LICENSE.txt cp arrow/NOTICE.txt arrow/matlab/install/arrow_matlab/NOTICE.txt - name: Install MATLAB - uses: matlab-actions/setup-matlab@v1 + uses: matlab-actions/setup-matlab@v2 with: release: R2024a - name: Run commands @@ -156,7 +156,7 @@ jobs: ARROW_MATLAB_TOOLBOX_FOLDER: arrow/matlab/install/arrow_matlab ARROW_MATLAB_TOOLBOX_OUTPUT_FOLDER: artifacts/matlab-dist ARROW_MATLAB_TOOLBOX_VERSION: {{ arrow.no_rc_version }} - uses: matlab-actions/run-command@v1 + uses: matlab-actions/run-command@v2 with: command: packageMatlabInterface {{ macros.github_upload_releases(["artifacts/matlab-dist/*.mltbx"])|indent }} From 7cd9c6fbd313c9afa0673d85415a84fd199951c9 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Fri, 3 May 2024 19:17:47 -0400 Subject: [PATCH 042/105] GH-41534: [Go] Fix mem leak importing 0 length C Array (#41535) ### What changes are included in this PR? If the `imp.alloc.bufCount` is 0, indicating we did not import any buffers from the provided C ArrowArray object, then we are free to not only call the release callback (which we already do) but also we need to free the temp ArrowArray we allocated to move the source to. This was uncovered by https://github.com/apache/arrow-adbc/pull/1808 * GitHub Issue: #41534 Authored-by: Matt Topol Signed-off-by: David Li --- go/arrow/cdata/cdata.go | 1 + 1 file changed, 1 insertion(+) diff --git a/go/arrow/cdata/cdata.go b/go/arrow/cdata/cdata.go index b86898277bf47..00d1f351eaf11 100644 --- a/go/arrow/cdata/cdata.go +++ b/go/arrow/cdata/cdata.go @@ -448,6 +448,7 @@ func (imp *cimporter) doImportArr(src *CArrowArray) error { defer func() { if imp.alloc.bufCount == 0 { C.ArrowArrayRelease(imp.arr) + C.free(unsafe.Pointer(imp.arr)) } }() From 2ba129f0a28e0ec93c7f383a64c54bef24e26f59 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 6 May 2024 06:56:08 +0900 Subject: [PATCH 043/105] MINOR: [JS] Bump @swc/helpers from 0.5.10 to 0.5.11 in /js (#41486) Bumps [@ swc/helpers](https://github.com/swc-project/swc) from 0.5.10 to 0.5.11.

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=@ swc/helpers&package-manager=npm_and_yarn&previous-version=0.5.10&new-version=0.5.11)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- js/yarn.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/js/yarn.lock b/js/yarn.lock index abde21f603382..eb7ed33520f0a 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -1199,9 +1199,9 @@ integrity sha512-e2BR4lsJkkRlKZ/qCHPw9ZaSxc0MVUd7gtbtaB7aMvHeJVYe8sOB8DBZkP2DtISHGSku9sCK6T6cnY0CtXrOCQ== "@swc/helpers@^0.5.10": - version "0.5.10" - resolved "https://registry.yarnpkg.com/@swc/helpers/-/helpers-0.5.10.tgz#5720082d007197cd85743dd599198097126a3f6e" - integrity sha512-CU+RF9FySljn7HVSkkjiB84hWkvTaI3rtLvF433+jRSBL2hMu3zX5bGhHS8C80SM++h4xy8hBSnUHFQHmRXSBw== + version "0.5.11" + resolved "https://registry.yarnpkg.com/@swc/helpers/-/helpers-0.5.11.tgz#5bab8c660a6e23c13b2d23fcd1ee44a2db1b0cb7" + integrity sha512-YNlnKRWF2sVojTpIyzwou9XoTNbzbzONwRhOoniEioF1AtaitTvVZblaQRrAzChWQ1bLYyYSWzM18y4WwgzJ+A== dependencies: tslib "^2.4.0" From 4cf44b4bc3ab053b03c937d3327d43c105790462 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JB=20Onofr=C3=A9?= Date: Mon, 6 May 2024 05:39:34 +0200 Subject: [PATCH 044/105] MINOR: add jbonofre in collaborators list (#41528) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Add `jbonofre` to the ASF collaborators list. ### What changes are included in this PR? Update `.asf.yaml`. ### Are these changes tested? NA ### Are there any user-facing changes? No Authored-by: JB Onofré Signed-off-by: Jacob Wujciak-Jens --- .asf.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.asf.yaml b/.asf.yaml index 40b961dc6e885..760a830ef98c7 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -23,6 +23,7 @@ github: - benibus - danepitkin - davisusanibar + - jbonofre - js8544 - vibhatha From d10ebf055a393c94a693097db1dca08ff86745bd Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Mon, 6 May 2024 09:28:22 -0400 Subject: [PATCH 045/105] MINOR: [R] fix no visible global function definition: left_join (#41542) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Followup to #41350, fixes a check NOTE that caused. ### What changes are included in this PR? `dplyr::` in two places. ### Are these changes tested? Check will be clean. ### Are there any user-facing changes? 
🙅 --- r/R/dplyr-mutate.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 880f7799e6316..72882b6afd964 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -84,12 +84,12 @@ mutate.arrow_dplyr_query <- function(.data, agg_query$aggregations <- mask$.aggregations agg_query <- collapse.arrow_dplyr_query(agg_query) if (length(grv)) { - out <- left_join(out, agg_query, by = grv) + out <- dplyr::left_join(out, agg_query, by = grv) } else { # If there are no group_by vars, add a scalar column to both and join on that agg_query$selected_columns[["..tempjoin"]] <- Expression$scalar(1L) out$selected_columns[["..tempjoin"]] <- Expression$scalar(1L) - out <- left_join(out, agg_query, by = "..tempjoin") + out <- dplyr::left_join(out, agg_query, by = "..tempjoin") } } From 3a54e68c69b028afe6d5fbb58eb0c4520dca1308 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 7 May 2024 04:40:51 +0800 Subject: [PATCH 046/105] MINOR: [Dev] Add zanmato1984 and ZhangHuiGui in collaborators list (#41544) ### Rationale for this change Recently zanmato1984 and ZhangHuiGui is active on arrow-compute and arrow-acero module, which lacks maintainer. The contributions can be seem below: * https://github.com/apache/arrow/commits?author=zanmato1984 * https://github.com/apache/arrow/commits?author=ZhangHuiGui I promote them as collaborators ### What changes are included in this PR? Changes `.asf.yaml` ### Are these changes tested? No ### Are there any user-facing changes? No Authored-by: mwish Signed-off-by: Sutou Kouhei --- .asf.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.asf.yaml b/.asf.yaml index 760a830ef98c7..1eb019fea9af1 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -26,6 +26,8 @@ github: - jbonofre - js8544 - vibhatha + - zanmato1984 + - ZhangHuiGui notifications: commits: commits@arrow.apache.org From 52321377cc9fbcb8678577f10232aea984a235f5 Mon Sep 17 00:00:00 2001 From: ZhangHuiGui <106943008+ZhangHuiGui@users.noreply.github.com> Date: Tue, 7 May 2024 05:13:44 -0400 Subject: [PATCH 047/105] GH-40997: [C++] Get null_bit_id according to are_cols_in_encoding_order in NullUpdateColumnToRow_avx2 (#40998) ### Rationale for this change Recently, we find that the compare internal's avx2 function NullUpdateColumnToRowImp_avx2 lost the are_cols_in_encoding_order check when get null_bit_id. It may cause grouper's compare result wrong(are_cols_in_encoding_order = true in grouper). ### What changes are included in this PR? Get `null_bit_id` according to `are_cols_in_encoding_order` in NullUpdateColumnToRow_avx2. ### Are there any user-facing changes? 
No Co-authored-by laotan332 Co-authored-by ZhangHuiGui <2689496754@ qq.com> * GitHub Issue: #40997 Lead-authored-by: ZhangHuiGui Co-authored-by: ZhangHuiGui <2689496754@qq.com> Signed-off-by: Antoine Pitrou --- cpp/src/arrow/compute/CMakeLists.txt | 3 +- cpp/src/arrow/compute/row/compare_internal.cc | 41 +++++------ cpp/src/arrow/compute/row/compare_internal.h | 25 ++++--- .../compute/row/compare_internal_avx2.cc | 20 +++--- cpp/src/arrow/compute/row/grouper_test.cc | 68 +++++++++++++++++++ cpp/src/arrow/compute/row/row_internal.cc | 3 +- 6 files changed, 116 insertions(+), 44 deletions(-) create mode 100644 cpp/src/arrow/compute/row/grouper_test.cc diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index badcf4f2f26ac..fb778be113029 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -90,7 +90,8 @@ add_arrow_test(internals_test light_array_test.cc registry_test.cc key_hash_test.cc - row/compare_test.cc) + row/compare_test.cc + row/grouper_test.cc) add_arrow_compute_test(expression_test SOURCES expression_test.cc) diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc index 078a8287c71c0..98aea9011266c 100644 --- a/cpp/src/arrow/compute/row/compare_internal.cc +++ b/cpp/src/arrow/compute/row/compare_internal.cc @@ -36,22 +36,22 @@ void KeyCompare::NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_com const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows, - uint8_t* match_bytevector, - bool are_cols_in_encoding_order) { + bool are_cols_in_encoding_order, + uint8_t* match_bytevector) { if (!rows.has_any_nulls(ctx) && !col.data(0)) { return; } uint32_t num_processed = 0; #if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { - num_processed = NullUpdateColumnToRow_avx2(use_selection, id_col, num_rows_to_compare, - sel_left_maybe_null, left_to_right_map, - ctx, col, rows, match_bytevector); + num_processed = NullUpdateColumnToRow_avx2( + use_selection, id_col, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, match_bytevector); } #endif - uint32_t null_bit_id = - are_cols_in_encoding_order ? id_col : rows.metadata().pos_after_encoding(id_col); + const uint32_t null_bit_id = + ColIdInEncodingOrder(rows, id_col, are_cols_in_encoding_order); if (!col.data(0)) { // Remove rows from the result for which the column value is a null @@ -363,10 +363,9 @@ void KeyCompare::CompareColumnsToRows( continue; } - uint32_t offset_within_row = rows.metadata().encoded_field_offset( - are_cols_in_encoding_order - ? static_cast(icol) - : rows.metadata().pos_after_encoding(static_cast(icol))); + uint32_t offset_within_row = + rows.metadata().encoded_field_offset(ColIdInEncodingOrder( + rows, static_cast(icol), are_cols_in_encoding_order)); if (col.metadata().is_fixed_length) { if (sel_left_maybe_null) { CompareBinaryColumnToRow( @@ -375,9 +374,8 @@ void KeyCompare::CompareColumnsToRows( is_first_column ? match_bytevector_A : match_bytevector_B); NullUpdateColumnToRow( static_cast(icol), num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, - is_first_column ? match_bytevector_A : match_bytevector_B, - are_cols_in_encoding_order); + left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, + is_first_column ? 
match_bytevector_A : match_bytevector_B); } else { // Version without using selection vector CompareBinaryColumnToRow( @@ -386,9 +384,8 @@ void KeyCompare::CompareColumnsToRows( is_first_column ? match_bytevector_A : match_bytevector_B); NullUpdateColumnToRow( static_cast(icol), num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, - is_first_column ? match_bytevector_A : match_bytevector_B, - are_cols_in_encoding_order); + left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, + is_first_column ? match_bytevector_A : match_bytevector_B); } if (!is_first_column) { AndByteVectors(ctx, num_rows_to_compare, match_bytevector_A, match_bytevector_B); @@ -414,9 +411,8 @@ void KeyCompare::CompareColumnsToRows( } NullUpdateColumnToRow( static_cast(icol), num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, - is_first_column ? match_bytevector_A : match_bytevector_B, - are_cols_in_encoding_order); + left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, + is_first_column ? match_bytevector_A : match_bytevector_B); } else { if (ivarbinary == 0) { CompareVarBinaryColumnToRow( @@ -429,9 +425,8 @@ void KeyCompare::CompareColumnsToRows( } NullUpdateColumnToRow( static_cast(icol), num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, - is_first_column ? match_bytevector_A : match_bytevector_B, - are_cols_in_encoding_order); + left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, + is_first_column ? match_bytevector_A : match_bytevector_B); } if (!is_first_column) { AndByteVectors(ctx, num_rows_to_compare, match_bytevector_A, match_bytevector_B); diff --git a/cpp/src/arrow/compute/row/compare_internal.h b/cpp/src/arrow/compute/row/compare_internal.h index b039ca97ff978..16002ee5184e9 100644 --- a/cpp/src/arrow/compute/row/compare_internal.h +++ b/cpp/src/arrow/compute/row/compare_internal.h @@ -43,13 +43,19 @@ class ARROW_EXPORT KeyCompare { uint8_t* out_match_bitvector_maybe_null = NULLPTR); private: + static uint32_t ColIdInEncodingOrder(const RowTableImpl& rows, uint32_t id_col, + bool are_cols_in_encoding_order) { + return are_cols_in_encoding_order ? 
id_col + : rows.metadata().pos_after_encoding(id_col); + } + template static void NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows, - uint8_t* match_bytevector, - bool are_cols_in_encoding_order); + bool are_cols_in_encoding_order, + uint8_t* match_bytevector); template static void CompareBinaryColumnToRowHelper( @@ -92,7 +98,8 @@ class ARROW_EXPORT KeyCompare { static uint32_t NullUpdateColumnToRowImp_avx2( uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, - const RowTableImpl& rows, uint8_t* match_bytevector); + const RowTableImpl& rows, bool are_cols_in_encoding_order, + uint8_t* match_bytevector); template static uint32_t CompareBinaryColumnToRowHelper_avx2( @@ -118,13 +125,11 @@ class ARROW_EXPORT KeyCompare { static uint32_t AndByteVectors_avx2(uint32_t num_elements, uint8_t* bytevector_A, const uint8_t* bytevector_B); - static uint32_t NullUpdateColumnToRow_avx2(bool use_selection, uint32_t id_col, - uint32_t num_rows_to_compare, - const uint16_t* sel_left_maybe_null, - const uint32_t* left_to_right_map, - LightContext* ctx, const KeyColumnArray& col, - const RowTableImpl& rows, - uint8_t* match_bytevector); + static uint32_t NullUpdateColumnToRow_avx2( + bool use_selection, uint32_t id_col, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows, + bool are_cols_in_encoding_order, uint8_t* match_bytevector); static uint32_t CompareBinaryColumnToRow_avx2( bool use_selection, uint32_t offset_within_row, uint32_t num_rows_to_compare, diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index ff407c51b83cb..18f656a2e458d 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -39,12 +39,14 @@ template uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, - const RowTableImpl& rows, uint8_t* match_bytevector) { + const RowTableImpl& rows, bool are_cols_in_encoding_order, + uint8_t* match_bytevector) { if (!rows.has_any_nulls(ctx) && !col.data(0)) { return num_rows_to_compare; } - uint32_t null_bit_id = rows.metadata().pos_after_encoding(id_col); + const uint32_t null_bit_id = + ColIdInEncodingOrder(rows, id_col, are_cols_in_encoding_order); if (!col.data(0)) { // Remove rows from the result for which the column value is a null @@ -569,7 +571,7 @@ uint32_t KeyCompare::NullUpdateColumnToRow_avx2( bool use_selection, uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows, - uint8_t* match_bytevector) { + bool are_cols_in_encoding_order, uint8_t* match_bytevector) { int64_t num_rows_safe = TailSkipForSIMD::FixBitAccess(sizeof(uint32_t), col.length(), col.bit_offset(0)); if (sel_left_maybe_null) { @@ -580,13 +582,13 @@ uint32_t KeyCompare::NullUpdateColumnToRow_avx2( } if (use_selection) { - return NullUpdateColumnToRowImp_avx2(id_col, num_rows_to_compare, - sel_left_maybe_null, 
left_to_right_map, - ctx, col, rows, match_bytevector); + return NullUpdateColumnToRowImp_avx2( + id_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, + rows, are_cols_in_encoding_order, match_bytevector); } else { - return NullUpdateColumnToRowImp_avx2(id_col, num_rows_to_compare, - sel_left_maybe_null, left_to_right_map, - ctx, col, rows, match_bytevector); + return NullUpdateColumnToRowImp_avx2( + id_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, + rows, are_cols_in_encoding_order, match_bytevector); } } diff --git a/cpp/src/arrow/compute/row/grouper_test.cc b/cpp/src/arrow/compute/row/grouper_test.cc new file mode 100644 index 0000000000000..1e853be5e4af7 --- /dev/null +++ b/cpp/src/arrow/compute/row/grouper_test.cc @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/compute/exec.h" +#include "arrow/compute/row/grouper.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" + +namespace arrow { +namespace compute { + +// Specialized case for GH-40997 +TEST(Grouper, ResortedColumnsWithLargeNullRows) { + const uint64_t num_rows = 1024; + + // construct random array with plenty of null values + const int32_t kSeed = 42; + const int32_t min = 0; + const int32_t max = 100; + const double null_probability = 0.3; + const double true_probability = 0.5; + auto rng = random::RandomArrayGenerator(kSeed); + auto b_arr = rng.Boolean(num_rows, true_probability, null_probability); + auto i32_arr = rng.Int32(num_rows, min, max, null_probability); + auto i64_arr = rng.Int64(num_rows, min, max * 10, null_probability); + + // construct batches with columns which will be resorted in the grouper make + std::vector exec_batches = {ExecBatch({i64_arr, i32_arr, b_arr}, num_rows), + ExecBatch({i32_arr, i64_arr, b_arr}, num_rows), + ExecBatch({i64_arr, b_arr, i32_arr}, num_rows), + ExecBatch({i32_arr, b_arr, i64_arr}, num_rows), + ExecBatch({b_arr, i32_arr, i64_arr}, num_rows), + ExecBatch({b_arr, i64_arr, i32_arr}, num_rows)}; + + const int num_batches = static_cast(exec_batches.size()); + std::vector group_num_vec; + group_num_vec.reserve(num_batches); + + for (const auto& exec_batch : exec_batches) { + ExecSpan span(exec_batch); + ASSERT_OK_AND_ASSIGN(auto grouper, Grouper::Make(span.GetTypes())); + ASSERT_OK_AND_ASSIGN(Datum group_ids, grouper->Consume(span)); + group_num_vec.emplace_back(grouper->num_groups()); + } + + for (int i = 1; i < num_batches; i++) { + ASSERT_EQ(group_num_vec[i - 1], group_num_vec[i]); + } +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/row/row_internal.cc b/cpp/src/arrow/compute/row/row_internal.cc index f6a62c09fcf24..469205e9b008d 100644 --- 
a/cpp/src/arrow/compute/row/row_internal.cc +++ b/cpp/src/arrow/compute/row/row_internal.cc @@ -66,7 +66,8 @@ void RowTableMetadata::FromColumnMetadataVector( // // Columns are sorted based on the size in bytes of their fixed-length part. // For the varying-length column, the fixed-length part is the 32-bit field storing - // cumulative length of varying-length fields. + // cumulative length of varying-length fields. This is to make the memory access of + // each individual column within the encoded row alignment-friendly. // // The rules are: // From b719408f4abd9921f35935fe0de771f3b856efd1 Mon Sep 17 00:00:00 2001 From: hemidark <67875833+hemidark@users.noreply.github.com> Date: Tue, 7 May 2024 02:44:48 -0700 Subject: [PATCH 048/105] GH-40560: [Python] RunEndEncodedArray.from_arrays: bugfix for Array arguments (#40560) (#41093) ### Rationale for this change The documentation suggests that `RunEndEncodedArray.from_arrays` takes two `Array` parameters, as would be expected of a `from_arrays` method. However, if given an `Array` instance for the `run_ends` parameter, it errors because `Array.__getitem__` returns a pyarrow scalar instead of a native Python integer. ### What changes are included in this PR? * Handle `Array` parameters for `run_ends` by unconditionally coercing the logical length to a pyarrow scalar, then to a Python native value. ### Are these change tested? Yes. Augmented the existing unit tests to test with `Array` as well as Python lists, and check that the data types of the `Array` instances correctly carry over to the data type of the `RunEndEncodedArray`. ### Are there any user-facing changes? Not apart from the bugfix; this was the minimum necessary change to make `Array` parameters work. `RunEndEncodedArray.from_arrays` continues to support e.g. python lists as before. 
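For illustration, here is a minimal sketch (not part of the patch itself) of the call pattern this fix enables, mirroring the new `check_run_end_encoded_from_typed_arrays` test: passing typed `Array` inputs instead of plain Python lists. It assumes a pyarrow build that includes this change.

```python
import pyarrow as pa

# Typed inputs, as the from_arrays() documentation suggests should work:
run_ends = pa.array([3, 5, 10, 19], type=pa.int32())
values = pa.array([1, 2, 1, 3], type=pa.int64())

# Before this fix, run_ends[-1] returned a pyarrow scalar rather than a
# Python int, so computing the logical length errored; the fix coerces it
# with scalar(...).as_py() before constructing the array.
ree = pa.RunEndEncodedArray.from_arrays(run_ends, values)

assert len(ree) == 19  # logical length is the last run end
assert ree.type == pa.run_end_encoded(pa.int32(), pa.int64())
```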
* GitHub Issue: #40560 Authored-by: Hemidark Signed-off-by: Joris Van den Bossche --- python/pyarrow/array.pxi | 2 +- python/pyarrow/tests/test_array.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 6a11b19ffcdf5..946c82b258241 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3984,7 +3984,7 @@ cdef class RunEndEncodedArray(Array): ------- RunEndEncodedArray """ - logical_length = run_ends[-1] if len(run_ends) > 0 else 0 + logical_length = scalar(run_ends[-1]).as_py() if len(run_ends) > 0 else 0 return RunEndEncodedArray._from_arrays(type, True, logical_length, run_ends, values, 0) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 6a190957879d3..3754daeb9b4bd 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3578,12 +3578,23 @@ def check_run_end_encoded_from_arrays_with_type(ree_type=None): check_run_end_encoded(ree_array, run_ends, values, 19, 4, 0) +def check_run_end_encoded_from_typed_arrays(ree_type): + run_ends = [3, 5, 10, 19] + values = [1, 2, 1, 3] + typed_run_ends = pa.array(run_ends, ree_type.run_end_type) + typed_values = pa.array(values, ree_type.value_type) + ree_array = pa.RunEndEncodedArray.from_arrays(typed_run_ends, typed_values) + assert ree_array.type == ree_type + check_run_end_encoded(ree_array, run_ends, values, 19, 4, 0) + + def test_run_end_encoded_from_arrays(): check_run_end_encoded_from_arrays_with_type() for run_end_type in [pa.int16(), pa.int32(), pa.int64()]: for value_type in [pa.uint32(), pa.int32(), pa.uint64(), pa.int64()]: ree_type = pa.run_end_encoded(run_end_type, value_type) check_run_end_encoded_from_arrays_with_type(ree_type) + check_run_end_encoded_from_typed_arrays(ree_type) def test_run_end_encoded_from_buffers(): From c79b6a593e21c10dc65e06a2717809ab83fd31db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 7 May 2024 14:54:10 +0200 Subject: [PATCH 049/105] GH-41566: [CI][Packaging] macOS wheel for Catalina fails to build on macOS arm64 (#41567) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Wheels for macOS catalina are failing ### What changes are included in this PR? Use macos-13 instead of (latest) ARM ### Are these changes tested? Yes, via archery ### Are there any user-facing changes? No * GitHub Issue: #41566 Authored-by: Raúl Cumplido Signed-off-by: Raúl Cumplido --- dev/tasks/tasks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 52a235c688eda..126b0fcb6f76a 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -409,7 +409,7 @@ tasks: arrow_jemalloc: "ON" python_version: "{{ python_version }}" macos_deployment_target: "{{ macos_version }}" - runs_on: "macos-latest" + runs_on: "macos-13" vcpkg_arch: "amd64" artifacts: - pyarrow-{no_rc_version}-{{ python_tag }}-{{ abi_tag }}-{{ platform_tag }}.whl From 03f8ae754ede16f118ccdba0abb593b1461024aa Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 7 May 2024 09:42:55 -0400 Subject: [PATCH 050/105] GH-41540: [R] Simplify arrow_eval() logic and bindings environments (#41537) ### Rationale for this change NSE is hard enough. I wanted to see if I could remove some layers of complexity. ### What changes are included in this PR? * There no longer are separate collections of `agg_funcs` and `nse_funcs`. 
Now that the aggregation functions return Expressions (https://github.com/apache/arrow/pull/41223), there's no reason to treat them separately. All bindings return Expressions now. * Both are removed and functions are just stored in `.cache$functions`. There was a note wondering why both `nse_funcs` and that needed to exist. They don't. * `arrow_mask()` no longer has an `aggregations` argument: agg functions are always present. * Because agg functions are always present, `filter` and `arrange` now have to check for whether the expressions passed to them contain aggregations--this is supported in regular dplyr but we have deferred supporting it here for now (see https://github.com/apache/arrow/pull/41350). If we decide we want to support it later, these checks are the entry points where we'd drop in the `left_join()` as in `mutate()`. * The logic of evaluating expresssions in `filter()` has been simplified. * Assorted other cleanups: `register_binding()` has two fewer arguments, for example, and the duplicate functions for referencing agg_funcs are gone. There is one more refactor I intend to pursue, and that's to rework abandon_ship and how arrow_eval does error handling, but I ~may~ will defer that to a followup. ### Are these changes tested? Yes, though I'll add some more for filter/aggregate in the followup since I'm reworking things there. ### Are there any user-facing changes? There are a couple of edge cases where the error message will change subtly. For example, if you supplied a comma-separated list of filter expressions, and more than one of them did not evaluate, previously you would be informed of all of the failures; now, we error on the first one. I don't think this is concerning. * GitHub Issue: #41540 --- r/R/dplyr-arrange.R | 8 ++ r/R/dplyr-eval.R | 17 +-- r/R/dplyr-filter.R | 54 +++------ r/R/dplyr-funcs-agg.R | 26 ++--- r/R/dplyr-funcs.R | 119 ++++---------------- r/R/dplyr-mutate.R | 2 +- r/R/dplyr-summarize.R | 2 +- r/R/udf.R | 7 +- r/man/register_binding.Rd | 45 +------- r/tests/testthat/test-dataset-dplyr.R | 2 +- r/tests/testthat/test-dplyr-filter.R | 9 +- r/tests/testthat/test-dplyr-funcs.R | 30 ++--- r/tests/testthat/test-dplyr-summarize.R | 28 ++--- r/tests/testthat/test-udf.R | 14 +-- r/vignettes/developers/writing_bindings.Rmd | 7 +- 15 files changed, 109 insertions(+), 261 deletions(-) diff --git a/r/R/dplyr-arrange.R b/r/R/dplyr-arrange.R index f91cd14211e0f..c8594c77df000 100644 --- a/r/R/dplyr-arrange.R +++ b/r/R/dplyr-arrange.R @@ -47,6 +47,14 @@ arrange.arrow_dplyr_query <- function(.data, ..., .by_group = FALSE) { msg <- paste("Expression", names(sorts)[i], "not supported in Arrow") return(abandon_ship(call, .data, msg)) } + if (length(mask$.aggregations)) { + # dplyr lets you arrange on e.g. x < mean(x), but we haven't implemented it. + # But we could, the same way it works in mutate() via join, if someone asks. + # Until then, just error. 
+ # TODO: add a test for this + msg <- paste("Expression", format_expr(expr), "not supported in arrange() in Arrow") + return(abandon_ship(call, .data, msg)) + } descs[i] <- x[["desc"]] } .data$arrange_vars <- c(sorts, .data$arrange_vars) diff --git a/r/R/dplyr-eval.R b/r/R/dplyr-eval.R index ff1619ce944d0..211c26cecce8c 100644 --- a/r/R/dplyr-eval.R +++ b/r/R/dplyr-eval.R @@ -121,24 +121,9 @@ arrow_not_supported <- function(msg) { } # Create a data mask for evaluating a dplyr expression -arrow_mask <- function(.data, aggregation = FALSE) { +arrow_mask <- function(.data) { f_env <- new_environment(.cache$functions) - if (aggregation) { - # Add the aggregation functions to the environment. - for (f in names(agg_funcs)) { - f_env[[f]] <- agg_funcs[[f]] - } - } else { - # Add functions that need to error hard and clear. - # Some R functions will still try to evaluate on an Expression - # and return NA with a warning :exploding_head: - fail <- function(...) stop("Not implemented") - for (f in c("mean", "sd")) { - f_env[[f]] <- fail - } - } - # Assign the schema to the expressions schema <- .data$.data$schema walk(.data$selected_columns, ~ (.$schema <- schema)) diff --git a/r/R/dplyr-filter.R b/r/R/dplyr-filter.R index d85fa16af2e71..69decbd76655f 100644 --- a/r/R/dplyr-filter.R +++ b/r/R/dplyr-filter.R @@ -35,48 +35,24 @@ filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) } # tidy-eval the filter expressions inside an Arrow data_mask - filters <- lapply(expanded_filters, arrow_eval, arrow_mask(out)) - bad_filters <- map_lgl(filters, ~ inherits(., "try-error")) - if (any(bad_filters)) { - # This is similar to abandon_ship() except that the filter eval is - # vectorized, and we apply filters that _did_ work before abandoning ship - # with the rest - expr_labs <- map_chr(expanded_filters[bad_filters], format_expr) - if (query_on_dataset(out)) { - # Abort. We don't want to auto-collect if this is a Dataset because that - # could blow up, too big. - stop( - "Filter expression not supported for Arrow Datasets: ", - oxford_paste(expr_labs, quote = FALSE), - "\nCall collect() first to pull data into R.", - call. = FALSE - ) - } else { - arrow_errors <- map2_chr( - filters[bad_filters], expr_labs, - handle_arrow_not_supported - ) - if (length(arrow_errors) == 1) { - msg <- paste0(arrow_errors, "; ") - } else { - msg <- paste0("* ", arrow_errors, "\n", collapse = "") - } - warning( - msg, "pulling data into R", - immediate. = TRUE, - call. = FALSE - ) - # Set any valid filters first, then collect and then apply the invalid ones in R - out <- dplyr::collect(set_filters(out, filters[!bad_filters])) - if (by$from_by) { - out <- dplyr::ungroup(out) - } - return(dplyr::filter(out, !!!expanded_filters[bad_filters], .by = {{ .by }})) + mask <- arrow_mask(out) + for (expr in expanded_filters) { + filt <- arrow_eval(expr, mask) + if (inherits(filt, "try-error")) { + msg <- handle_arrow_not_supported(filt, format_expr(expr)) + return(abandon_ship(match.call(), .data, msg)) + } + if (length(mask$.aggregations)) { + # dplyr lets you filter on e.g. x < mean(x), but we haven't implemented it. + # But we could, the same way it works in mutate() via join, if someone asks. + # Until then, just error. 
+ # TODO: add a test for this + msg <- paste("Expression", format_expr(expr), "not supported in filter() in Arrow") + return(abandon_ship(match.call(), .data, msg)) } + out <- set_filters(out, filt) } - out <- set_filters(out, filters) - if (by$from_by) { out$group_by_vars <- character() } diff --git a/r/R/dplyr-funcs-agg.R b/r/R/dplyr-funcs-agg.R index 9411ce5ce6faf..c0c4eb3089425 100644 --- a/r/R/dplyr-funcs-agg.R +++ b/r/R/dplyr-funcs-agg.R @@ -29,56 +29,56 @@ # you can use list_compute_functions("^hash_") register_bindings_aggregate <- function() { - register_binding_agg("base::sum", function(..., na.rm = FALSE) { + register_binding("base::sum", function(..., na.rm = FALSE) { set_agg( fun = "sum", data = ensure_one_arg(list2(...), "sum"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("base::prod", function(..., na.rm = FALSE) { + register_binding("base::prod", function(..., na.rm = FALSE) { set_agg( fun = "product", data = ensure_one_arg(list2(...), "prod"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("base::any", function(..., na.rm = FALSE) { + register_binding("base::any", function(..., na.rm = FALSE) { set_agg( fun = "any", data = ensure_one_arg(list2(...), "any"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("base::all", function(..., na.rm = FALSE) { + register_binding("base::all", function(..., na.rm = FALSE) { set_agg( fun = "all", data = ensure_one_arg(list2(...), "all"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("base::mean", function(x, na.rm = FALSE) { + register_binding("base::mean", function(x, na.rm = FALSE) { set_agg( fun = "mean", data = list(x), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("stats::sd", function(x, na.rm = FALSE, ddof = 1) { + register_binding("stats::sd", function(x, na.rm = FALSE, ddof = 1) { set_agg( fun = "stddev", data = list(x), options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof) ) }) - register_binding_agg("stats::var", function(x, na.rm = FALSE, ddof = 1) { + register_binding("stats::var", function(x, na.rm = FALSE, ddof = 1) { set_agg( fun = "variance", data = list(x), options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof) ) }) - register_binding_agg( + register_binding( "stats::quantile", function(x, probs, na.rm = FALSE) { if (length(probs) != 1) { @@ -103,7 +103,7 @@ register_bindings_aggregate <- function() { "approximate quantile (t-digest) is computed" ) ) - register_binding_agg( + register_binding( "stats::median", function(x, na.rm = FALSE) { # TODO: Bind to the Arrow function that returns an exact median and remove @@ -122,28 +122,28 @@ register_bindings_aggregate <- function() { }, notes = "approximate median (t-digest) is computed" ) - register_binding_agg("dplyr::n_distinct", function(..., na.rm = FALSE) { + register_binding("dplyr::n_distinct", function(..., na.rm = FALSE) { set_agg( fun = "count_distinct", data = ensure_one_arg(list2(...), "n_distinct"), options = list(na.rm = na.rm) ) }) - register_binding_agg("dplyr::n", function() { + register_binding("dplyr::n", function() { set_agg( fun = "count_all", data = list(), options = list() ) }) - register_binding_agg("base::min", function(..., na.rm = FALSE) { + register_binding("base::min", function(..., na.rm = FALSE) { set_agg( fun = "min", data = ensure_one_arg(list2(...), "min"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("base::max", function(..., 
na.rm = FALSE) { + register_binding("base::max", function(..., na.rm = FALSE) { set_agg( fun = "max", data = ensure_one_arg(list2(...), "max"), diff --git a/r/R/dplyr-funcs.R b/r/R/dplyr-funcs.R index abf2362d0107f..c0eb47e428b7f 100644 --- a/r/R/dplyr-funcs.R +++ b/r/R/dplyr-funcs.R @@ -22,8 +22,8 @@ NULL #' Register compute bindings #' -#' The `register_binding()` and `register_binding_agg()` functions -#' are used to populate a list of functions that operate on (and return) +#' `register_binding()` is used to populate a list of functions that operate on +#' (and return) #' Expressions. These are the basis for the `.data` mask inside dplyr methods. #' #' @section Writing bindings: @@ -40,26 +40,10 @@ NULL #' * Inside your function, you can call any other binding with `call_binding()`. #' #' @param fun_name A string containing a function name in the form `"function"` or -#' `"package::function"`. The package name is currently not used but -#' may be used in the future to allow these types of function calls. -#' @param fun A function or `NULL` to un-register a previous function. +#' `"package::function"`. +#' @param fun A function, or `NULL` to un-register a previous function. #' This function must accept `Expression` objects as arguments and return #' `Expression` objects instead of regular R objects. -#' @param agg_fun An aggregate function or `NULL` to un-register a previous -#' aggregate function. This function must accept `Expression` objects as -#' arguments and return a `list()` with components: -#' - `fun`: string function name -#' - `data`: list of 0 or more `Expression`s -#' - `options`: list of function options, as passed to call_function -#' @param update_cache Update .cache$functions at the time of registration. -#' the default is FALSE because the majority of usage is to register -#' bindings at package load, after which we create the cache once. The -#' reason why .cache$functions is needed in addition to nse_funcs for -#' non-aggregate functions could be revisited...it is currently used -#' as the data mask in mutate, filter, and aggregate (but not -#' summarise) because the data mask has to be a list. -#' @param registry An environment in which the functions should be -#' assigned. #' @param notes string for the docs: note any limitations or differences in #' behavior between the Arrow version and the R function. 
#' @return The previously registered binding or `NULL` if no previously @@ -67,12 +51,10 @@ NULL #' @keywords internal register_binding <- function(fun_name, fun, - registry = nse_funcs, - update_cache = FALSE, notes = character(0)) { unqualified_name <- sub("^.*?:{+}", "", fun_name) - previous_fun <- registry[[unqualified_name]] + previous_fun <- .cache$functions[[unqualified_name]] # if the unqualified name exists in the registry, warn if (!is.null(previous_fun) && !identical(fun, previous_fun)) { @@ -87,58 +69,25 @@ register_binding <- function(fun_name, # register both as `pkg::fun` and as `fun` if `qualified_name` is prefixed # unqualified_name and fun_name will be the same if not prefixed - registry[[unqualified_name]] <- fun - registry[[fun_name]] <- fun - + .cache$functions[[unqualified_name]] <- fun + .cache$functions[[fun_name]] <- fun .cache$docs[[fun_name]] <- notes - - if (update_cache) { - fun_cache <- .cache$functions - fun_cache[[unqualified_name]] <- fun - fun_cache[[fun_name]] <- fun - .cache$functions <- fun_cache - } - invisible(previous_fun) } -unregister_binding <- function(fun_name, registry = nse_funcs, - update_cache = FALSE) { +unregister_binding <- function(fun_name) { unqualified_name <- sub("^.*?:{+}", "", fun_name) - previous_fun <- registry[[unqualified_name]] + previous_fun <- .cache$functions[[unqualified_name]] - rm( - list = unique(c(fun_name, unqualified_name)), - envir = registry, - inherits = FALSE - ) - - if (update_cache) { - fun_cache <- .cache$functions - fun_cache[[unqualified_name]] <- NULL - fun_cache[[fun_name]] <- NULL - .cache$functions <- fun_cache - } + .cache$functions[[unqualified_name]] <- NULL + .cache$functions[[fun_name]] <- NULL invisible(previous_fun) } -#' @rdname register_binding -#' @keywords internal -register_binding_agg <- function(fun_name, - agg_fun, - registry = agg_funcs, - notes = character(0)) { - register_binding(fun_name, agg_fun, registry = registry, notes = notes) -} - # Supports functions and tests that call previously-defined bindings call_binding <- function(fun_name, ...) { - nse_funcs[[fun_name]](...) -} - -call_binding_agg <- function(fun_name, ...) { - agg_funcs[[fun_name]](...) + .cache$functions[[fun_name]](...) } create_binding_cache <- function() { @@ -147,7 +96,7 @@ create_binding_cache <- function() { # Register all available Arrow Compute functions, namespaced as arrow_fun. all_arrow_funs <- list_compute_functions() - arrow_funcs <- set_names( + .cache$functions <- set_names( lapply(all_arrow_funs, function(fun) { force(fun) function(...) Expression$create(fun, ...) 
@@ -155,7 +104,7 @@ create_binding_cache <- function() { paste0("arrow_", all_arrow_funs) ) - # Register bindings into nse_funcs and agg_funcs + # Register bindings into the cache register_bindings_array_function_map() register_bindings_aggregate() register_bindings_conditional() @@ -165,37 +114,17 @@ create_binding_cache <- function() { register_bindings_type() register_bindings_augmented() - # We only create the cache for nse_funcs and not agg_funcs - .cache$functions <- c(as.list(nse_funcs), arrow_funcs) -} - -# environments in the arrow namespace used in the above functions -nse_funcs <- new.env(parent = emptyenv()) -agg_funcs <- new.env(parent = emptyenv()) -.cache <- new.env(parent = emptyenv()) - -# we register 2 versions of the "::" binding - one for use with nse_funcs -# and another one for use with agg_funcs (registered in dplyr-funcs-agg.R) -nse_funcs[["::"]] <- function(lhs, rhs) { - lhs_name <- as.character(substitute(lhs)) - rhs_name <- as.character(substitute(rhs)) + .cache$functions[["::"]] <- function(lhs, rhs) { + lhs_name <- as.character(substitute(lhs)) + rhs_name <- as.character(substitute(rhs)) - fun_name <- paste0(lhs_name, "::", rhs_name) + fun_name <- paste0(lhs_name, "::", rhs_name) - # if we do not have a binding for pkg::fun, then fall back on to the - # regular pkg::fun function - nse_funcs[[fun_name]] %||% asNamespace(lhs_name)[[rhs_name]] + # if we do not have a binding for pkg::fun, then fall back on to the + # regular pkg::fun function + .cache$functions[[fun_name]] %||% asNamespace(lhs_name)[[rhs_name]] + } } -agg_funcs[["::"]] <- function(lhs, rhs) { - lhs_name <- as.character(substitute(lhs)) - rhs_name <- as.character(substitute(rhs)) - - fun_name <- paste0(lhs_name, "::", rhs_name) - - # if we do not have a binding for pkg::fun, then fall back on to the - # nse_funcs (useful when we have a regular function inside an aggregating one) - # and then, if searching nse_funcs fails too, fall back to the - # regular `pkg::fun()` function - agg_funcs[[fun_name]] %||% nse_funcs[[fun_name]] %||% asNamespace(lhs_name)[[rhs_name]] -} +# environment in the arrow namespace used in the above functions +.cache <- new.env(parent = emptyenv()) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 72882b6afd964..f0a8c005676df 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -48,7 +48,7 @@ mutate.arrow_dplyr_query <- function(.data, # Create a mask with aggregation functions in it # If there are any aggregations, we will need to compute them and # and join the results back in, for "window functions" like x - mean(x) - mask <- arrow_mask(out, aggregation = TRUE) + mask <- arrow_mask(out) # Evaluate the mutate expressions results <- list() for (i in seq_along(exprs)) { diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 56de14db6dd44..58ca849152a75 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -84,7 +84,7 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { # and the aggregation functions will pull out those terms and insert into # that list. # nolint end - mask <- arrow_mask(.data, aggregation = TRUE) + mask <- arrow_mask(.data) # We'll collect any transformations after the aggregation here. 
# summarize_eval() returns NULL when the outer expression is an aggregation, diff --git a/r/R/udf.R b/r/R/udf.R index 922095cceba6a..0415fbac3c9fc 100644 --- a/r/R/udf.R +++ b/r/R/udf.R @@ -95,12 +95,7 @@ register_scalar_function <- function(name, fun, in_type, out_type, body(binding_fun) <- expr_substitute(body(binding_fun), sym("name"), name) environment(binding_fun) <- asNamespace("arrow") - register_binding( - name, - binding_fun, - update_cache = TRUE - ) - + register_binding(name, binding_fun) invisible(NULL) } diff --git a/r/man/register_binding.Rd b/r/man/register_binding.Rd index d10cd733bbe9d..b84cde3b8993a 100644 --- a/r/man/register_binding.Rd +++ b/r/man/register_binding.Rd @@ -2,63 +2,28 @@ % Please edit documentation in R/dplyr-funcs.R \name{register_binding} \alias{register_binding} -\alias{register_binding_agg} \title{Register compute bindings} \usage{ -register_binding( - fun_name, - fun, - registry = nse_funcs, - update_cache = FALSE, - notes = character(0) -) - -register_binding_agg( - fun_name, - agg_fun, - registry = agg_funcs, - notes = character(0) -) +register_binding(fun_name, fun, notes = character(0)) } \arguments{ \item{fun_name}{A string containing a function name in the form \code{"function"} or -\code{"package::function"}. The package name is currently not used but -may be used in the future to allow these types of function calls.} +\code{"package::function"}.} -\item{fun}{A function or \code{NULL} to un-register a previous function. +\item{fun}{A function, or \code{NULL} to un-register a previous function. This function must accept \code{Expression} objects as arguments and return \code{Expression} objects instead of regular R objects.} -\item{registry}{An environment in which the functions should be -assigned.} - -\item{update_cache}{Update .cache$functions at the time of registration. -the default is FALSE because the majority of usage is to register -bindings at package load, after which we create the cache once. The -reason why .cache$functions is needed in addition to nse_funcs for -non-aggregate functions could be revisited...it is currently used -as the data mask in mutate, filter, and aggregate (but not -summarise) because the data mask has to be a list.} - \item{notes}{string for the docs: note any limitations or differences in behavior between the Arrow version and the R function.} - -\item{agg_fun}{An aggregate function or \code{NULL} to un-register a previous -aggregate function. This function must accept \code{Expression} objects as -arguments and return a \code{list()} with components: -\itemize{ -\item \code{fun}: string function name -\item \code{data}: list of 0 or more \code{Expression}s -\item \code{options}: list of function options, as passed to call_function -}} } \value{ The previously registered binding or \code{NULL} if no previously registered function existed. } \description{ -The \code{register_binding()} and \code{register_binding_agg()} functions -are used to populate a list of functions that operate on (and return) +\code{register_binding()} is used to populate a list of functions that operate on +(and return) Expressions. These are the basis for the \code{.data} mask inside dplyr methods. 
} \section{Writing bindings}{ diff --git a/r/tests/testthat/test-dataset-dplyr.R b/r/tests/testthat/test-dataset-dplyr.R index 1e36ea8bd4966..493eac328e5cd 100644 --- a/r/tests/testthat/test-dataset-dplyr.R +++ b/r/tests/testthat/test-dataset-dplyr.R @@ -325,7 +325,7 @@ test_that("dplyr method not implemented messages", { # This one is more nuanced expect_error( ds %>% filter(int > 6, dbl > max(dbl)), - "Filter expression not supported for Arrow Datasets: dbl > max(dbl)\nCall collect() first to pull data into R.", + "Expression dbl > max(dbl) not supported in filter() in Arrow\nCall collect() first to pull data into R.", fixed = TRUE ) }) diff --git a/r/tests/testthat/test-dplyr-filter.R b/r/tests/testthat/test-dplyr-filter.R index bf23685362a82..535bcb70c4cab 100644 --- a/r/tests/testthat/test-dplyr-filter.R +++ b/r/tests/testthat/test-dplyr-filter.R @@ -324,13 +324,14 @@ test_that("Filtering with unsupported functions", { filter( nchar(chr, type = "bytes", allowNA = TRUE) == 1, # bad, Arrow msg int > 2, # good - pnorm(dbl) > .99 # bad, opaque + pnorm(dbl) > .99 # bad, opaque, but we'll error on the first one before we get here ) %>% collect(), tbl, - warning = '\\* In nchar\\(chr, type = "bytes", allowNA = TRUE\\) == 1, allowNA = TRUE not supported in Arrow -\\* Expression pnorm\\(dbl\\) > 0.99 not supported in Arrow -pulling data into R' + warning = paste( + 'In nchar\\(chr, type = "bytes", allowNA = TRUE\\) == 1,', + "allowNA = TRUE not supported in Arrow; pulling data into R" + ) ) }) diff --git a/r/tests/testthat/test-dplyr-funcs.R b/r/tests/testthat/test-dplyr-funcs.R index 039604a85ee0c..48c5d730f8493 100644 --- a/r/tests/testthat/test-dplyr-funcs.R +++ b/r/tests/testthat/test-dplyr-funcs.R @@ -19,35 +19,25 @@ skip_on_cran() test_that("register_binding()/unregister_binding() works", { - fake_registry <- new.env(parent = emptyenv()) fun1 <- function() NULL fun2 <- function() "Hello" - expect_null(register_binding("some.pkg::some_fun", fun1, fake_registry)) - expect_identical(fake_registry$some_fun, fun1) - expect_identical(fake_registry$`some.pkg::some_fun`, fun1) + expect_null(register_binding("some.pkg::some_fun", fun1)) + expect_identical(.cache$functions$some_fun, fun1) + expect_identical(.cache$functions$`some.pkg::some_fun`, fun1) - expect_identical(unregister_binding("some.pkg::some_fun", fake_registry), fun1) - expect_false("some.pkg::some_fun" %in% names(fake_registry)) - expect_false("some_fun" %in% names(fake_registry)) + expect_identical(unregister_binding("some.pkg::some_fun"), fun1) + expect_false("some.pkg::some_fun" %in% names(.cache$functions)) + expect_false("some_fun" %in% names(.cache$functions)) - expect_null(register_binding("somePkg::some_fun", fun1, fake_registry)) - expect_identical(fake_registry$some_fun, fun1) + expect_null(register_binding("somePkg::some_fun", fun1)) + expect_identical(.cache$functions$some_fun, fun1) expect_warning( - register_binding("some.pkg2::some_fun", fun2, fake_registry), + register_binding("some.pkg2::some_fun", fun2), "A \"some_fun\" binding already exists in the registry and will be overwritten." 
) # No warning when an identical function is re-registered - expect_silent(register_binding("some.pkg2::some_fun", fun2, fake_registry)) -}) - -test_that("register_binding_agg() works", { - fake_registry <- new.env(parent = emptyenv()) - fun1 <- function() NULL - - expect_null(register_binding_agg("somePkg::some_fun", fun1, fake_registry)) - expect_identical(fake_registry$some_fun, fun1) - expect_identical(fake_registry$`somePkg::some_fun`, fun1) + expect_silent(register_binding("some.pkg2::some_fun", fun2)) }) diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index 87bb5e5fac959..a61ef95bee73d 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -337,20 +337,20 @@ test_that("Functions that take ... but we only accept a single arg", { ) # Now that we've demonstrated that the whole machinery works, let's test - # the agg_funcs directly - expect_error(call_binding_agg("n_distinct"), "n_distinct() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("sum"), "sum() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("prod"), "prod() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("any"), "any() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("all"), "all() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("min"), "min() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("max"), "max() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("n_distinct", 1, 2), "Multiple arguments to n_distinct()") - expect_error(call_binding_agg("sum", 1, 2), "Multiple arguments to sum") - expect_error(call_binding_agg("any", 1, 2), "Multiple arguments to any()") - expect_error(call_binding_agg("all", 1, 2), "Multiple arguments to all()") - expect_error(call_binding_agg("min", 1, 2), "Multiple arguments to min()") - expect_error(call_binding_agg("max", 1, 2), "Multiple arguments to max()") + # the agg funcs directly + expect_error(call_binding("n_distinct"), "n_distinct() with 0 arguments", fixed = TRUE) + expect_error(call_binding("sum"), "sum() with 0 arguments", fixed = TRUE) + expect_error(call_binding("prod"), "prod() with 0 arguments", fixed = TRUE) + expect_error(call_binding("any"), "any() with 0 arguments", fixed = TRUE) + expect_error(call_binding("all"), "all() with 0 arguments", fixed = TRUE) + expect_error(call_binding("min"), "min() with 0 arguments", fixed = TRUE) + expect_error(call_binding("max"), "max() with 0 arguments", fixed = TRUE) + expect_error(call_binding("n_distinct", 1, 2), "Multiple arguments to n_distinct()") + expect_error(call_binding("sum", 1, 2), "Multiple arguments to sum") + expect_error(call_binding("any", 1, 2), "Multiple arguments to any()") + expect_error(call_binding("all", 1, 2), "Multiple arguments to all()") + expect_error(call_binding("min", 1, 2), "Multiple arguments to min()") + expect_error(call_binding("max", 1, 2), "Multiple arguments to max()") }) test_that("median()", { diff --git a/r/tests/testthat/test-udf.R b/r/tests/testthat/test-udf.R index 0eb75b1dde6e5..8604dc610a435 100644 --- a/r/tests/testthat/test-udf.R +++ b/r/tests/testthat/test-udf.R @@ -90,7 +90,7 @@ test_that("register_scalar_function() adds a compute function to the registry", int32(), float64(), auto_convert = TRUE ) - on.exit(unregister_binding("times_32", update_cache = TRUE)) + on.exit(unregister_binding("times_32")) expect_true("times_32" %in% names(asNamespace("arrow")$.cache$functions)) 
expect_true("times_32" %in% list_compute_functions()) @@ -124,7 +124,7 @@ test_that("arrow_scalar_function() with bad return type errors", { int32(), float64() ) - on.exit(unregister_binding("times_32_bad_return_type_array", update_cache = TRUE)) + on.exit(unregister_binding("times_32_bad_return_type_array")) expect_error( call_function("times_32_bad_return_type_array", Array$create(1L)), @@ -137,7 +137,7 @@ test_that("arrow_scalar_function() with bad return type errors", { int32(), float64() ) - on.exit(unregister_binding("times_32_bad_return_type_scalar", update_cache = TRUE)) + on.exit(unregister_binding("times_32_bad_return_type_scalar")) expect_error( call_function("times_32_bad_return_type_scalar", Array$create(1L)), @@ -155,7 +155,7 @@ test_that("register_scalar_function() can register multiple kernels", { out_type = function(in_types) in_types[[1]], auto_convert = TRUE ) - on.exit(unregister_binding("times_32", update_cache = TRUE)) + on.exit(unregister_binding("times_32")) expect_equal( call_function("times_32", Scalar$create(1L, int32())), @@ -238,7 +238,7 @@ test_that("user-defined functions work during multi-threaded execution", { float64(), auto_convert = TRUE ) - on.exit(unregister_binding("times_32", update_cache = TRUE)) + on.exit(unregister_binding("times_32")) # check a regular collect() result <- open_dataset(tf_dataset) %>% @@ -271,7 +271,7 @@ test_that("nested exec plans can contain user-defined functions", { float64(), auto_convert = TRUE ) - on.exit(unregister_binding("times_32", update_cache = TRUE)) + on.exit(unregister_binding("times_32")) stream_plan_with_udf <- function() { record_batch(a = 1:1000) %>% @@ -310,7 +310,7 @@ test_that("head() on exec plan containing user-defined functions", { float64(), auto_convert = TRUE ) - on.exit(unregister_binding("times_32", update_cache = TRUE)) + on.exit(unregister_binding("times_32")) result <- record_batch(a = 1:1000) %>% dplyr::mutate(b = times_32(a)) %>% diff --git a/r/vignettes/developers/writing_bindings.Rmd b/r/vignettes/developers/writing_bindings.Rmd index 443211b3c2b5e..e1ed92105dbc3 100644 --- a/r/vignettes/developers/writing_bindings.Rmd +++ b/r/vignettes/developers/writing_bindings.Rmd @@ -145,11 +145,10 @@ test_that("startsWith behaves identically in dplyr and Arrow", { df <- tibble(x = c("Foo", "bar", "baz", "qux")) compare_dplyr_binding( .input %>% - filter(startsWith(x, "b")) %>% - collect(), + filter(startsWith(x, "b")) %>% + collect(), df ) - }) ``` @@ -197,7 +196,7 @@ As `startsWith()` requires options, direct mapping is not appropriate. If the function cannot be mapped directly, some extra work may be needed to ensure that calling the arrow version of the function results in the same result as calling the R version of the function. In this case, the function will need -adding to the `nse_funcs` function registry. Here is how this might look for +adding to the `.cache$functions` function registry. Here is how this might look for `startsWith()`: ```{r, eval = FALSE} From 9cf0ee722ba048f3f305b38e536fa726eff9c813 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 7 May 2024 23:56:02 +0800 Subject: [PATCH 051/105] GH-41562: [C++][Parquet] Decoding: Fix num_value handling in ByteStreamSplitDecoder (#41565) ### Rationale for this change This problem is raised from https://github.com/apache/arrow/pull/40094 . Original bug fixed here: https://github.com/apache/arrow/pull/34140 , but this is corrupt in https://github.com/apache/arrow/pull/40094 . ### What changes are included in this PR? 
Refine checking ### Are these changes tested? * [x] Will add ### Are there any user-facing changes? Bugfix * GitHub Issue: #41562 Authored-by: mwish Signed-off-by: Antoine Pitrou --- cpp/src/parquet/encoding.cc | 22 +++++++++++++++++----- cpp/src/parquet/encoding.h | 5 +++++ cpp/src/parquet/encoding_test.cc | 4 ++-- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 3da5c64ace5dd..05221568c8fa0 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3694,12 +3694,24 @@ class ByteStreamSplitDecoderBase : public DecoderImpl, ByteStreamSplitDecoderBase(const ColumnDescriptor* descr, int byte_width) : DecoderImpl(descr, Encoding::BYTE_STREAM_SPLIT), byte_width_(byte_width) {} - void SetData(int num_values, const uint8_t* data, int len) override { - if (static_cast(num_values) * byte_width_ != len) { - throw ParquetException("Data size (" + std::to_string(len) + - ") does not match number of values in BYTE_STREAM_SPLIT (" + - std::to_string(num_values) + ")"); + void SetData(int num_values, const uint8_t* data, int len) final { + // Check that the data size is consistent with the number of values + // The spec requires that the data size is a multiple of the number of values, + // see: https://github.com/apache/parquet-format/pull/192 . + // GH-41562: passed in `num_values` may include nulls, so we need to check and + // adjust the number of values. + if (static_cast(num_values) * byte_width_ < len) { + throw ParquetException( + "Data size (" + std::to_string(len) + + ") is too small for the number of values in in BYTE_STREAM_SPLIT (" + + std::to_string(num_values) + ")"); + } + if (len % byte_width_ != 0) { + throw ParquetException("ByteStreamSplit data size " + std::to_string(len) + + " not aligned with type " + TypeToString(DType::type_num) + + " and byte_width: " + std::to_string(byte_width_)); } + num_values = len / byte_width_; DecoderImpl::SetData(num_values, data, len); stride_ = num_values_; } diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 602009189595e..493c4044ddc1c 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -255,6 +255,11 @@ class Decoder { // Sets the data for a new page. This will be called multiple times on the same // decoder and should reset all internal state. + // + // `num_values` comes from the data page header, and may be greater than the number of + // physical values in the data buffer if there are some omitted (null) values. + // `len`, on the other hand, is the size in bytes of the data buffer and + // directly relates to the number of physical values. virtual void SetData(int num_values, const uint8_t* data, int len) = 0; // Returns the number of values left (for the last call to SetData()). 
This is diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index b91fcb0839cba..3c20b917f6994 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1383,7 +1383,7 @@ class TestByteStreamSplitEncoding : public TestEncodingBase { encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset); encode_buffer_ = encoder->FlushValues(); ASSERT_EQ(encode_buffer_->size(), physical_byte_width() * (num_values_ - null_count)); - decoder->SetData(num_values_ - null_count, encode_buffer_->data(), + decoder->SetData(num_values_, encode_buffer_->data(), static_cast(encode_buffer_->size())); auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, null_count, valid_bits, valid_bits_offset); @@ -1717,7 +1717,7 @@ class TestDeltaBitPackEncoding : public TestEncodingBase { for (size_t i = 0; i < kNumRoundTrips; ++i) { encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset); encode_buffer_ = encoder->FlushValues(); - decoder->SetData(num_values_ - null_count, encode_buffer_->data(), + decoder->SetData(num_values_, encode_buffer_->data(), static_cast(encode_buffer_->size())); auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, null_count, valid_bits, valid_bits_offset); From 51689a040cbe3dee8702cd899a33fa62e0616bf1 Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 8 May 2024 00:14:22 +0800 Subject: [PATCH 052/105] GH-41545: [C++][Parquet] Fix DeltaLengthByteArrayEncoder::EstimatedDataEncodedSize (#41546) ### Rationale for this change `DeltaLengthByteArrayEncoder::EstimatedDataEncodedSize` would return an wrong estimate when `Put(const Array&)` was called. ### What changes are included in this PR? Remove `encoded_size_` and uses `sink_.length()` as `encoded_size_`. ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * GitHub Issue: #41545 Authored-by: mwish Signed-off-by: Antoine Pitrou --- cpp/src/parquet/encoding.cc | 18 ++++++++++-------- cpp/src/parquet/encoding_test.cc | 9 +++++++++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 05221568c8fa0..004cb746b3a89 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -2740,13 +2740,12 @@ class DeltaLengthByteArrayEncoder : public EncoderImpl, : EncoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY, pool = ::arrow::default_memory_pool()), sink_(pool), - length_encoder_(nullptr, pool), - encoded_size_{0} {} + length_encoder_(nullptr, pool) {} std::shared_ptr FlushValues() override; int64_t EstimatedDataEncodedSize() override { - return encoded_size_ + length_encoder_.EstimatedDataEncodedSize(); + return sink_.length() + length_encoder_.EstimatedDataEncodedSize(); } using TypedEncoder::Put; @@ -2768,6 +2767,11 @@ class DeltaLengthByteArrayEncoder : public EncoderImpl, return Status::Invalid( "Parquet cannot store strings with size 2GB or more, got: ", view.size()); } + if (ARROW_PREDICT_FALSE( + view.size() + sink_.length() > + static_cast(std::numeric_limits::max()))) { + return Status::Invalid("excess expansion in DELTA_LENGTH_BYTE_ARRAY"); + } length_encoder_.Put({static_cast(view.length())}, 1); PARQUET_THROW_NOT_OK(sink_.Append(view.data(), view.length())); return Status::OK(); @@ -2777,7 +2781,6 @@ class DeltaLengthByteArrayEncoder : public EncoderImpl, ::arrow::BufferBuilder sink_; DeltaBitPackEncoder length_encoder_; - uint32_t encoded_size_; }; template @@ -2803,15 +2806,15 @@ void DeltaLengthByteArrayEncoder::Put(const T* src, int num_values) { const int batch_size = std::min(kBatchSize, num_values - idx); for (int j = 0; j < batch_size; ++j) { const int32_t len = src[idx + j].len; - if (AddWithOverflow(total_increment_size, len, &total_increment_size)) { + if (ARROW_PREDICT_FALSE( + AddWithOverflow(total_increment_size, len, &total_increment_size))) { throw ParquetException("excess expansion in DELTA_LENGTH_BYTE_ARRAY"); } lengths[j] = len; } length_encoder_.Put(lengths.data(), batch_size); } - - if (AddWithOverflow(encoded_size_, total_increment_size, &encoded_size_)) { + if (sink_.length() + total_increment_size > std::numeric_limits::max()) { throw ParquetException("excess expansion in DELTA_LENGTH_BYTE_ARRAY"); } PARQUET_THROW_NOT_OK(sink_.Reserve(total_increment_size)); @@ -2850,7 +2853,6 @@ std::shared_ptr DeltaLengthByteArrayEncoder::FlushValues() { std::shared_ptr buffer; PARQUET_THROW_NOT_OK(sink_.Finish(&buffer, true)); - encoded_size_ = 0; return buffer; } diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 3c20b917f6994..78bf26587e3fb 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -577,6 +577,11 @@ TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) { auto decoder = MakeTypedDecoder(Encoding::PLAIN); ASSERT_NO_THROW(encoder->Put(*values)); + // For Plain encoding, the estimated size should be at least the total byte size + auto& string_array = dynamic_cast(*values); + EXPECT_GE(encoder->EstimatedDataEncodedSize(), string_array.total_values_length()) + << "Estimated size should be at least the total byte size"; + auto buf = encoder->FlushValues(); int num_values = static_cast(values->length() - values->null_count()); @@ -2160,6 +2165,10 @@ TEST(DeltaLengthByteArrayEncodingAdHoc, ArrowBinaryDirectPut) { auto CheckSeed = [&](std::shared_ptr<::arrow::Array> 
values) { ASSERT_NO_THROW(encoder->Put(*values)); + auto* binary_array = checked_cast(values.get()); + // For DeltaLength encoding, the estimated size should be at least the total byte size + EXPECT_GE(encoder->EstimatedDataEncodedSize(), binary_array->total_values_length()) + << "Estimated size should be at least the total byte size"; auto buf = encoder->FlushValues(); int num_values = static_cast(values->length() - values->null_count()); From 53859262ea988f31ce33a469305251064b5a53b8 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 8 May 2024 09:52:57 +0800 Subject: [PATCH 053/105] GH-41431: [C++][Parquet][Dataset] Fix repeated scan on encrypted dataset (#41550) ### Rationale for this change When parquet dataset is reused to create multiple scanners, `FileMetaData` objects are cached to avoid parsing them again. However, these caused issues on encrypted files since internal file decryptors were no longer created by cached `FileMetaData` objects. ### What changes are included in this PR? Expose file_decryptor from FileMetaData and set it properly. ### Are these changes tested? Yes, modify the test to reproduce the issue and assure fixed. ### Are there any user-facing changes? No. * GitHub Issue: #41431 Authored-by: Gang Wu Signed-off-by: Gang Wu --- .../dataset/file_parquet_encryption_test.cc | 25 +++--- cpp/src/parquet/file_reader.cc | 83 ++++++++++--------- cpp/src/parquet/metadata.cc | 8 ++ cpp/src/parquet/metadata.h | 2 + 4 files changed, 70 insertions(+), 48 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet_encryption_test.cc b/cpp/src/arrow/dataset/file_parquet_encryption_test.cc index 307017fd67e06..0287d593d12d3 100644 --- a/cpp/src/arrow/dataset/file_parquet_encryption_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_encryption_test.cc @@ -148,17 +148,22 @@ class DatasetEncryptionTestBase : public ::testing::Test { FileSystemDatasetFactory::Make(file_system_, selector, file_format, factory_options)); - // Read dataset into table + // Create the dataset ASSERT_OK_AND_ASSIGN(auto dataset, dataset_factory->Finish()); - ASSERT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan()); - ASSERT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish()); - ASSERT_OK_AND_ASSIGN(auto read_table, scanner->ToTable()); - - // Verify the data was read correctly - ASSERT_OK_AND_ASSIGN(auto combined_table, read_table->CombineChunks()); - // Validate the table - ASSERT_OK(combined_table->ValidateFull()); - AssertTablesEqual(*combined_table, *table_); + + // Reuse the dataset above to scan it twice to make sure decryption works correctly. 
+ for (size_t i = 0; i < 2; ++i) { + // Read dataset into table + ASSERT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan()); + ASSERT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish()); + ASSERT_OK_AND_ASSIGN(auto read_table, scanner->ToTable()); + + // Verify the data was read correctly + ASSERT_OK_AND_ASSIGN(auto combined_table, read_table->CombineChunks()); + // Validate the table + ASSERT_OK(combined_table->ValidateFull()); + AssertTablesEqual(*combined_table, *table_); + } } protected: diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index b3dd1d6054ac8..8fcb0870ce4b6 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -215,16 +215,14 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source, int64_t source_size, FileMetaData* file_metadata, int row_group_number, ReaderProperties props, - std::shared_ptr prebuffered_column_chunks_bitmap, - std::shared_ptr file_decryptor = nullptr) + std::shared_ptr prebuffered_column_chunks_bitmap) : source_(std::move(source)), cached_source_(std::move(cached_source)), source_size_(source_size), file_metadata_(file_metadata), properties_(std::move(props)), row_group_ordinal_(row_group_number), - prebuffered_column_chunks_bitmap_(std::move(prebuffered_column_chunks_bitmap)), - file_decryptor_(std::move(file_decryptor)) { + prebuffered_column_chunks_bitmap_(std::move(prebuffered_column_chunks_bitmap)) { row_group_metadata_ = file_metadata->RowGroup(row_group_number); } @@ -263,10 +261,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { } // The column is encrypted - std::shared_ptr meta_decryptor = - GetColumnMetaDecryptor(crypto_metadata.get(), file_decryptor_.get()); - std::shared_ptr data_decryptor = - GetColumnDataDecryptor(crypto_metadata.get(), file_decryptor_.get()); + std::shared_ptr meta_decryptor = GetColumnMetaDecryptor( + crypto_metadata.get(), file_metadata_->file_decryptor().get()); + std::shared_ptr data_decryptor = GetColumnDataDecryptor( + crypto_metadata.get(), file_metadata_->file_decryptor().get()); ARROW_DCHECK_NE(meta_decryptor, nullptr); ARROW_DCHECK_NE(data_decryptor, nullptr); @@ -291,7 +289,6 @@ class SerializedRowGroup : public RowGroupReader::Contents { ReaderProperties properties_; int row_group_ordinal_; const std::shared_ptr prebuffered_column_chunks_bitmap_; - std::shared_ptr file_decryptor_; }; // ---------------------------------------------------------------------- @@ -316,7 +313,9 @@ class SerializedFile : public ParquetFileReader::Contents { } void Close() override { - if (file_decryptor_) file_decryptor_->WipeOutDecryptionKeys(); + if (file_metadata_ && file_metadata_->file_decryptor()) { + file_metadata_->file_decryptor()->WipeOutDecryptionKeys(); + } } std::shared_ptr GetRowGroup(int i) override { @@ -330,7 +329,7 @@ class SerializedFile : public ParquetFileReader::Contents { std::unique_ptr contents = std::make_unique( source_, cached_source_, source_size_, file_metadata_.get(), i, properties_, - std::move(prebuffered_column_chunks_bitmap), file_decryptor_); + std::move(prebuffered_column_chunks_bitmap)); return std::make_shared(std::move(contents)); } @@ -346,8 +345,9 @@ class SerializedFile : public ParquetFileReader::Contents { "forget to call ParquetFileReader::Open() first?"); } if (!page_index_reader_) { - page_index_reader_ = PageIndexReader::Make(source_.get(), file_metadata_, - properties_, file_decryptor_.get()); + page_index_reader_ = + 
PageIndexReader::Make(source_.get(), file_metadata_, properties_, + file_metadata_->file_decryptor().get()); } return page_index_reader_; } @@ -362,8 +362,8 @@ class SerializedFile : public ParquetFileReader::Contents { "forget to call ParquetFileReader::Open() first?"); } if (!bloom_filter_reader_) { - bloom_filter_reader_ = - BloomFilterReader::Make(source_, file_metadata_, properties_, file_decryptor_); + bloom_filter_reader_ = BloomFilterReader::Make(source_, file_metadata_, properties_, + file_metadata_->file_decryptor()); if (bloom_filter_reader_ == nullptr) { throw ParquetException("Cannot create BloomFilterReader"); } @@ -441,10 +441,12 @@ class SerializedFile : public ParquetFileReader::Contents { // Parse the footer depending on encryption type const bool is_encrypted_footer = memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == 0; + std::shared_ptr file_decryptor; if (is_encrypted_footer) { // Encrypted file with Encrypted footer. const std::pair read_size = - ParseMetaDataOfEncryptedFileWithEncryptedFooter(metadata_buffer, metadata_len); + ParseMetaDataOfEncryptedFileWithEncryptedFooter(metadata_buffer, metadata_len, + &file_decryptor); // Read the actual footer metadata_start = read_size.first; metadata_len = read_size.second; @@ -453,8 +455,8 @@ class SerializedFile : public ParquetFileReader::Contents { // Fall through } - const uint32_t read_metadata_len = - ParseUnencryptedFileMetadata(metadata_buffer, metadata_len); + const uint32_t read_metadata_len = ParseUnencryptedFileMetadata( + metadata_buffer, metadata_len, std::move(file_decryptor)); auto file_decryption_properties = properties_.file_decryption_properties().get(); if (is_encrypted_footer) { // Nothing else to do here. @@ -550,34 +552,37 @@ class SerializedFile : public ParquetFileReader::Contents { // Parse the footer depending on encryption type const bool is_encrypted_footer = memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == 0; + std::shared_ptr file_decryptor; if (is_encrypted_footer) { // Encrypted file with Encrypted footer. 
std::pair read_size; BEGIN_PARQUET_CATCH_EXCEPTIONS - read_size = - ParseMetaDataOfEncryptedFileWithEncryptedFooter(metadata_buffer, metadata_len); + read_size = ParseMetaDataOfEncryptedFileWithEncryptedFooter( + metadata_buffer, metadata_len, &file_decryptor); END_PARQUET_CATCH_EXCEPTIONS // Read the actual footer int64_t metadata_start = read_size.first; metadata_len = read_size.second; return source_->ReadAsync(metadata_start, metadata_len) - .Then([this, metadata_len, is_encrypted_footer]( + .Then([this, metadata_len, is_encrypted_footer, file_decryptor]( const std::shared_ptr<::arrow::Buffer>& metadata_buffer) { // Continue and read the file footer - return ParseMetaDataFinal(metadata_buffer, metadata_len, is_encrypted_footer); + return ParseMetaDataFinal(metadata_buffer, metadata_len, is_encrypted_footer, + file_decryptor); }); } return ParseMetaDataFinal(std::move(metadata_buffer), metadata_len, - is_encrypted_footer); + is_encrypted_footer, std::move(file_decryptor)); } // Continuation - ::arrow::Status ParseMetaDataFinal(std::shared_ptr<::arrow::Buffer> metadata_buffer, - uint32_t metadata_len, - const bool is_encrypted_footer) { + ::arrow::Status ParseMetaDataFinal( + std::shared_ptr<::arrow::Buffer> metadata_buffer, uint32_t metadata_len, + const bool is_encrypted_footer, + std::shared_ptr file_decryptor) { BEGIN_PARQUET_CATCH_EXCEPTIONS - const uint32_t read_metadata_len = - ParseUnencryptedFileMetadata(metadata_buffer, metadata_len); + const uint32_t read_metadata_len = ParseUnencryptedFileMetadata( + metadata_buffer, metadata_len, std::move(file_decryptor)); auto file_decryption_properties = properties_.file_decryption_properties().get(); if (is_encrypted_footer) { // Nothing else to do here. @@ -608,11 +613,11 @@ class SerializedFile : public ParquetFileReader::Contents { // Maps row group ordinal and prebuffer status of its column chunks in the form of a // bitmap buffer. 
std::unordered_map> prebuffered_column_chunks_; - std::shared_ptr file_decryptor_; // \return The true length of the metadata in bytes - uint32_t ParseUnencryptedFileMetadata(const std::shared_ptr& footer_buffer, - const uint32_t metadata_len); + uint32_t ParseUnencryptedFileMetadata( + const std::shared_ptr& footer_buffer, const uint32_t metadata_len, + std::shared_ptr file_decryptor); std::string HandleAadPrefix(FileDecryptionProperties* file_decryption_properties, EncryptionAlgorithm& algo); @@ -624,11 +629,13 @@ class SerializedFile : public ParquetFileReader::Contents { // \return The position and size of the actual footer std::pair ParseMetaDataOfEncryptedFileWithEncryptedFooter( - const std::shared_ptr& crypto_metadata_buffer, uint32_t footer_len); + const std::shared_ptr& crypto_metadata_buffer, uint32_t footer_len, + std::shared_ptr* file_decryptor); }; uint32_t SerializedFile::ParseUnencryptedFileMetadata( - const std::shared_ptr& metadata_buffer, const uint32_t metadata_len) { + const std::shared_ptr& metadata_buffer, const uint32_t metadata_len, + std::shared_ptr file_decryptor) { if (metadata_buffer->size() != metadata_len) { throw ParquetException("Failed reading metadata buffer (requested " + std::to_string(metadata_len) + " bytes but got " + @@ -637,7 +644,7 @@ uint32_t SerializedFile::ParseUnencryptedFileMetadata( uint32_t read_metadata_len = metadata_len; // The encrypted read path falls through to here, so pass in the decryptor file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len, - properties_, file_decryptor_); + properties_, std::move(file_decryptor)); return read_metadata_len; } @@ -645,7 +652,7 @@ std::pair SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter( const std::shared_ptr<::arrow::Buffer>& crypto_metadata_buffer, // both metadata & crypto metadata length - const uint32_t footer_len) { + const uint32_t footer_len, std::shared_ptr* file_decryptor) { // encryption with encrypted footer // Check if the footer_buffer contains the entire metadata if (crypto_metadata_buffer->size() != footer_len) { @@ -664,7 +671,7 @@ SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter( // Handle AAD prefix EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm(); std::string file_aad = HandleAadPrefix(file_decryption_properties, algo); - file_decryptor_ = std::make_shared( + *file_decryptor = std::make_shared( file_decryption_properties, file_aad, algo.algorithm, file_crypto_metadata->key_metadata(), properties_.memory_pool()); @@ -683,12 +690,12 @@ void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter( EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); // Handle AAD prefix std::string file_aad = HandleAadPrefix(file_decryption_properties, algo); - file_decryptor_ = std::make_shared( + auto file_decryptor = std::make_shared( file_decryption_properties, file_aad, algo.algorithm, file_metadata_->footer_signing_key_metadata(), properties_.memory_pool()); // set the InternalFileDecryptor in the metadata as well, as it's used // for signature verification and for ColumnChunkMetaData creation. 
- file_metadata_->set_file_decryptor(file_decryptor_); + file_metadata_->set_file_decryptor(std::move(file_decryptor)); if (file_decryption_properties->check_plaintext_footer_integrity()) { if (metadata_len - read_metadata_len != diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 3f101b5ae3ac6..b24883cdc160b 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -826,6 +826,10 @@ class FileMetaData::FileMetaDataImpl { file_decryptor_ = std::move(file_decryptor); } + const std::shared_ptr& file_decryptor() const { + return file_decryptor_; + } + private: friend FileMetaDataBuilder; uint32_t metadata_len_ = 0; @@ -947,6 +951,10 @@ void FileMetaData::set_file_decryptor( impl_->set_file_decryptor(std::move(file_decryptor)); } +const std::shared_ptr& FileMetaData::file_decryptor() const { + return impl_->file_decryptor(); +} + ParquetVersion::type FileMetaData::version() const { switch (impl_->version()) { case 1: diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 640b898024346..9fc30df58e0d3 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -399,12 +399,14 @@ class PARQUET_EXPORT FileMetaData { private: friend FileMetaDataBuilder; friend class SerializedFile; + friend class SerializedRowGroup; explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len, const ReaderProperties& properties, std::shared_ptr file_decryptor = NULLPTR); void set_file_decryptor(std::shared_ptr file_decryptor); + const std::shared_ptr& file_decryptor() const; // PIMPL Idiom FileMetaData(); From d83af8f749ee560c0b04d986ba2912e696e1cd68 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 8 May 2024 12:57:10 +0200 Subject: [PATCH 054/105] GH-38770: [C++][Python] RecordBatch.filter() segfaults if passed a ChunkedArray (#40971) ### Rationale for this change Filtering a record batch with a boolean mask in the form of a `ChunkedArray` results in a segmentation fault. ### What changes are included in this PR? In case chunked array is passed as a mask to filter record batch, the code path for `pa.Table.filter()` is taken resulting in a filtered table. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
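For illustration, a minimal sketch of the behavior this fixes (the batch contents and column name below are invented for the example; only the fact that `RecordBatch.filter()` now accepts a `ChunkedArray` mask comes from this change):

```python
import pyarrow as pa

batch = pa.record_batch([pa.array([1, 2, 3, 4])], names=["x"])
# A boolean mask that arrives as a ChunkedArray (e.g. computed from a
# Table column); previously passing it here could segfault.
mask = pa.chunked_array([[True, False], [None, True]])

# With this fix the chunked mask is handled like in Table.filter(),
# so the rows where the mask is true are returned and the null mask
# slot drops its row by default.
filtered = batch.filter(mask)
print(filtered.to_pydict())  # {'x': [1, 4]}
```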
* GitHub Issue: #38770 Authored-by: AlenkaF Signed-off-by: AlenkaF --- .../vector_selection_filter_internal.cc | 26 ++++++++++++++----- python/pyarrow/tests/test_compute.py | 5 ++++ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc index d5e5e5ad289ac..8d43c65668d4b 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc @@ -22,6 +22,7 @@ #include #include +#include "arrow/array/concatenate.h" #include "arrow/array/data.h" #include "arrow/buffer_builder.h" #include "arrow/chunked_array.h" @@ -928,12 +929,26 @@ Result> FilterRecordBatch(const RecordBatch& batch, return Status::Invalid("Filter inputs must all be the same length"); } - // Convert filter to selection vector/indices and use Take + // Fetch filter const auto& filter_opts = *static_cast(options); - ARROW_ASSIGN_OR_RAISE( - std::shared_ptr indices, - GetTakeIndices(*filter.array(), filter_opts.null_selection_behavior, - ctx->memory_pool())); + ArrayData filter_array; + switch (filter.kind()) { + case Datum::ARRAY: + filter_array = *filter.array(); + break; + case Datum::CHUNKED_ARRAY: { + ARROW_ASSIGN_OR_RAISE(auto combined, Concatenate(filter.chunked_array()->chunks())); + filter_array = *combined->data(); + break; + } + default: + return Status::TypeError("Filter should be array-like"); + } + + // Convert filter to selection vector/indices and use Take + ARROW_ASSIGN_OR_RAISE(std::shared_ptr indices, + GetTakeIndices(filter_array, filter_opts.null_selection_behavior, + ctx->memory_pool())); std::vector> columns(batch.num_columns()); for (int i = 0; i < batch.num_columns(); ++i) { ARROW_ASSIGN_OR_RAISE(Datum out, Take(batch.column(i)->data(), Datum(indices), @@ -1042,7 +1057,6 @@ class FilterMetaFunction : public MetaFunction { } if (args[0].kind() == Datum::RECORD_BATCH) { - auto values_batch = args[0].record_batch(); ARROW_ASSIGN_OR_RAISE( std::shared_ptr out_batch, FilterRecordBatch(*args[0].record_batch(), args[1], options, ctx)); diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 17cc546f834ca..d7dee1ad05e93 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -1345,6 +1345,11 @@ def test_filter_record_batch(): expected = pa.record_batch([pa.array(["a", "e"])], names=["a'"]) assert result.equals(expected) + # GH-38770: mask is chunked array + chunked_mask = pa.chunked_array([[True, False], [None], [False, True]]) + result = batch.filter(chunked_mask) + assert result.equals(expected) + result = batch.filter(mask, null_selection_behavior="emit_null") expected = pa.record_batch([pa.array(["a", None, "e"])], names=["a'"]) assert result.equals(expected) From e21952f969cd9d0906a86898f561088606447359 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 8 May 2024 13:47:21 +0200 Subject: [PATCH 055/105] GH-40750: [C++][Python] Map child Array constructed from keys and items shouldn't have offset (#40871) ### Rationale for this change When `MapArray` is constructed from `keys` and `items` array the offset of the list offsets is passed down to the struct child array which is not correct. ### What changes are included in this PR? This PR fixes this issue. ### Are these changes tested? Yes. ### Are there any user-facing changes? Shouldn't be. 
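To make the user-visible effect concrete, here is a rough sketch (the keys, items, and offset values are made up; the relevant part, taken from this change, is building a `MapArray` from an offsets array that carries a slice offset):

```python
import pyarrow as pa

keys = pa.array(["a", "b", "c", "d", "e"], pa.string())
items = pa.array([1, 2, 3, 4, 5], pa.int64())
offsets = pa.array([0, 1, 3, 5], pa.int32())

# offsets.slice(1) points at the same buffer but starts at offset 1.
# Before this fix, that slice offset leaked into the child struct array,
# so the resulting maps could be built from shifted key/item pairs.
from_sliced = pa.MapArray.from_arrays(offsets.slice(1), keys, items)
from_plain = pa.MapArray.from_arrays([1, 3, 5], keys, items)
assert from_sliced.equals(from_plain)  # holds with this fix
```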
* GitHub Issue: #40750 Authored-by: AlenkaF Signed-off-by: AlenkaF --- cpp/src/arrow/array/array_list_test.cc | 16 +++++++++++++++- cpp/src/arrow/array/array_nested.cc | 2 +- python/pyarrow/tests/test_array.py | 24 ++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index 18afcc90d71f8..e79ce6fe172b2 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -1287,7 +1287,7 @@ TEST_F(TestMapArray, ValidateErrorNullKey) { } TEST_F(TestMapArray, FromArrays) { - std::shared_ptr offsets1, offsets2, offsets3, offsets4, keys, items; + std::shared_ptr offsets1, offsets2, offsets3, offsets4, offsets5, keys, items; std::vector offsets_is_valid3 = {true, false, true, true}; std::vector offsets_is_valid4 = {true, true, false, true}; @@ -1342,6 +1342,20 @@ TEST_F(TestMapArray, FromArrays) { // Zero-length offsets ASSERT_RAISES(Invalid, MapArray::FromArrays(offsets1->Slice(0, 0), keys, items, pool_)); + // Offseted offsets + ASSERT_OK_AND_ASSIGN(auto map5, + MapArray::FromArrays(offsets1->Slice(1), keys, items, pool_)); + ASSERT_OK(map5->Validate()); + + AssertArraysEqual(*expected1.Slice(1), *map5); + + std::vector offset5_values = {2, 2, 6}; + ArrayFromVector(offset5_values, &offsets5); + ASSERT_OK_AND_ASSIGN(auto map6, MapArray::FromArrays(offsets5, keys, items, pool_)); + ASSERT_OK(map6->Validate()); + + AssertArraysEqual(*map5, *map6); + // Offsets not the right type ASSERT_RAISES(TypeError, MapArray::FromArrays(keys, offsets1, items, pool_)); diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 24e0dfb7081ac..1be771d8228d9 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -790,7 +790,7 @@ MapArray::MapArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& items, int64_t null_count, int64_t offset) { auto pair_data = ArrayData::Make(type->fields()[0]->type(), keys->data()->length, - {nullptr}, {keys->data(), items->data()}, 0, offset); + {nullptr}, {keys->data(), items->data()}, 0); auto map_data = ArrayData::Make(type, length, std::move(buffers), {pair_data}, null_count, offset); SetData(map_data); diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 3754daeb9b4bd..dbe29c5730758 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -1099,6 +1099,30 @@ def test_map_from_arrays(): with pytest.raises(ValueError): pa.MapArray.from_arrays(offsets, keys_with_null, items) + # Check if offset in offsets > 0 + offsets = pa.array(offsets, pa.int32()) + result = pa.MapArray.from_arrays(offsets.slice(1), keys, items) + expected = pa.MapArray.from_arrays([1, 3, 5], keys, items) + + assert result.equals(expected) + assert result.offset == 1 + assert expected.offset == 0 + + offsets = pa.array([0, 0, 0, 0, 0, 0], pa.int32()) + result = pa.MapArray.from_arrays( + offsets.slice(1), + pa.array([], pa.string()), + pa.array([], pa.string()), + ) + expected = pa.MapArray.from_arrays( + [0, 0, 0, 0, 0], + pa.array([], pa.string()), + pa.array([], pa.string()), + ) + assert result.equals(expected) + assert result.offset == 1 + assert expected.offset == 0 + def test_fixed_size_list_from_arrays(): values = pa.array(range(12), pa.int64()) From f462ec7e6b85aef3d84b777bc577441f4e10b214 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: 
Wed, 8 May 2024 10:54:12 -0400 Subject: [PATCH 056/105] MINOR: [Go] Bump golang.org/x/sys from 0.19.0 to 0.20.0 in /go (#41554) Bumps [golang.org/x/sys](https://github.com/golang/sys) from 0.19.0 to 0.20.0.
Commits:
  • 7d69d98 unix: extend support for z/OS
  • 7758090 cpu: add support for sve2 detection
  • 9a28524 windows: drop go version tags for unsupported versions
  • 27dc90b unix: update to Linux kernel 6.4
  • See full diff in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=golang.org/x/sys&package-manager=go_modules&previous-version=0.19.0&new-version=0.20.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Matt Topol --- go/go.mod | 2 +- go/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/go.mod b/go/go.mod index 35fd9b9915c0b..188e5c6180ff1 100644 --- a/go/go.mod +++ b/go/go.mod @@ -36,7 +36,7 @@ require ( github.com/zeebo/xxh3 v1.0.2 golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 golang.org/x/sync v0.7.0 - golang.org/x/sys v0.19.0 + golang.org/x/sys v0.20.0 golang.org/x/tools v0.20.0 golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 gonum.org/v1/gonum v0.15.0 diff --git a/go/go.sum b/go/go.sum index bf33fed6c4c97..998b3cd8bbcc5 100644 --- a/go/go.sum +++ b/go/go.sum @@ -124,8 +124,8 @@ golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= -golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/tools v0.20.0 h1:hz/CVckiOxybQvFw6h7b/q80NTr9IUQb4s1IIzW7KNY= From f6720276543844ad53dece91a9350b0a821e52d3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 8 May 2024 10:54:32 -0400 Subject: [PATCH 057/105] MINOR: [Go] Bump google.golang.org/protobuf from 1.34.0 to 1.34.1 in /go (#41553) Bumps google.golang.org/protobuf from 1.34.0 to 1.34.1. [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=google.golang.org/protobuf&package-manager=go_modules&previous-version=1.34.0&new-version=1.34.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Matt Topol --- go/go.mod | 2 +- go/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/go.mod b/go/go.mod index 188e5c6180ff1..7c14ddcf9e216 100644 --- a/go/go.mod +++ b/go/go.mod @@ -41,7 +41,7 @@ require ( golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 gonum.org/v1/gonum v0.15.0 google.golang.org/grpc v1.63.2 - google.golang.org/protobuf v1.34.0 + google.golang.org/protobuf v1.34.1 modernc.org/sqlite v1.29.6 ) diff --git a/go/go.sum b/go/go.sum index 998b3cd8bbcc5..70e3a533d03f3 100644 --- a/go/go.sum +++ b/go/go.sum @@ -138,8 +138,8 @@ google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de h1: google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de/go.mod h1:H4O17MA/PE9BsGx3w+a+W2VOLLD1Qf7oJneAoU6WktY= google.golang.org/grpc v1.63.2 h1:MUeiw1B2maTVZthpU5xvASfTh3LDbxHd6IJ6QQVU+xM= google.golang.org/grpc v1.63.2/go.mod h1:WAX/8DgncnokcFUldAxq7GeB5DXHDbMF+lLvDomNkRA= -google.golang.org/protobuf v1.34.0 h1:Qo/qEd2RZPCf2nKuorzksSknv0d3ERwp1vFG38gSmH4= -google.golang.org/protobuf v1.34.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= +google.golang.org/protobuf v1.34.1 h1:9ddQBjfCyZPOHPUiPxpYESBLc+T8P3E+Vo4IbKZgFWg= +google.golang.org/protobuf v1.34.1/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= From 304650145689291eb87db5dd58f7b9776bdfaacf Mon Sep 17 00:00:00 2001 From: Sarah Gilmore <74676073+sgilmore10@users.noreply.github.com> Date: Wed, 8 May 2024 11:42:41 -0400 Subject: [PATCH 058/105] GH-41435: [CI][MATLAB] Add job to build and test MATLAB Interface on `macos-14` (#41592) ### Rationale for this change Currently, the MATLAB interface is built and tested on `macos-12` - not `macos-14` - because the version of `mathworks/libmexclass` depends on used to not support `macos-14`. However, now that https://github.com/apache/arrow/issues/41400 is closed, the version of `mathworks/libmexclass` the MATLAB interface depends on works on `macos-14`, so we will be able to build and test the MATLAB interface on `macos-14`. **Note**: When adding support for ARM-based macOS builds, we discovered an issue with the way in which we package the MLTBX files for the MATLAB Interface to Arrow. Currently, we bundle all shared libraries for all platforms (.dll, .dylib, and .so) into one large "monolithic" MLTBX file. Unfortunately, putting all platform-specific files into one MLTBX file poses an issue when we support multiple ISAs (e.g. x86 and ARM) because builds for the same operating system with different ISAs will have the same shared library file names. In other words, we will have a library named libarrowproxy.dylib for both ARM and x86 macOS builds. Therefore, we are going to hold off on adding ARM-based macOS builds to the crossbow packaging workflow for now until we have a chance to properly explore alternative packaging approaches. For example, we may want to consider having platform-specific MLTBX files. However, we still think it is worthwhile to add CI support for `macos-14` in the meantime. ### What changes are included in this PR? 1. 
Added workflow to build and test the MATLAB interface on `macos-14` as well as `macos-12`. ### Are these changes tested? N/A. ### Are there any user-facing changes? No. ### Future Directions 1. Add crossbow packaging workflow on `macos-14` once we determine how to package the interface for both ARM-based and Intel-based mac ISAs. * GitHub Issue: #41435 Authored-by: Sarah Gilmore Signed-off-by: Sarah Gilmore --- .github/workflows/matlab.yml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml index 2ae33d1e8d6c6..ca8280927f4a5 100644 --- a/.github/workflows/matlab.yml +++ b/.github/workflows/matlab.yml @@ -98,9 +98,16 @@ jobs: select-by-folder: matlab/test strict: true macos: - name: AMD64 macOS 12 MATLAB - runs-on: macos-12 + name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} MATLAB + runs-on: macos-${{ matrix.macos-version }} if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + strategy: + matrix: + include: + - architecture: AMD64 + macos-version: "12" + - architecture: ARM64 + macos-version: "14" steps: - name: Check out repository uses: actions/checkout@v4 From 5252c6ce13694fa31dbcb2623d1629cd8fe53a47 Mon Sep 17 00:00:00 2001 From: Alex Shcherbakov Date: Wed, 8 May 2024 22:46:45 +0300 Subject: [PATCH 059/105] GH-41594: [Go] Support reading `date64` type & properly validate list-like types (#41595) This PR includes 2 fixes: 1. support reading `date64` columns (as write is supported) 2. properly validate list-like data types (list of unsupported is unsupported) ### Rationale for this change See #41594 ### What changes are included in this PR? 1. Added `date64` reading & conversion funcs similar to `date32` 2. Refactored date type validation ### Are these changes tested? a55cd5324d2c47932410b0c7a9c46075386645d2 ### Are there any user-facing changes? No. 
* GitHub Issue: #41594 Authored-by: candiduslynx Signed-off-by: Matt Topol --- go/arrow/csv/common.go | 40 ++++++++++------- go/arrow/csv/reader.go | 74 +++++++++++++------------------- go/arrow/csv/reader_test.go | 8 ++++ go/arrow/csv/testdata/header.csv | 8 ++-- go/arrow/csv/testdata/types.csv | 8 ++-- go/arrow/csv/transformer.go | 69 +++++++---------------------- 6 files changed, 86 insertions(+), 121 deletions(-) diff --git a/go/arrow/csv/common.go b/go/arrow/csv/common.go index 4455c8b782167..06fed69a77fe5 100644 --- a/go/arrow/csv/common.go +++ b/go/arrow/csv/common.go @@ -239,21 +239,31 @@ func WithStringsReplacer(replacer *strings.Replacer) Option { func validate(schema *arrow.Schema) { for i, f := range schema.Fields() { - switch ft := f.Type.(type) { - case *arrow.BooleanType: - case *arrow.Int8Type, *arrow.Int16Type, *arrow.Int32Type, *arrow.Int64Type: - case *arrow.Uint8Type, *arrow.Uint16Type, *arrow.Uint32Type, *arrow.Uint64Type: - case *arrow.Float16Type, *arrow.Float32Type, *arrow.Float64Type: - case *arrow.StringType, *arrow.LargeStringType: - case *arrow.TimestampType: - case *arrow.Date32Type, *arrow.Date64Type: - case *arrow.Decimal128Type, *arrow.Decimal256Type: - case *arrow.ListType, *arrow.LargeListType, *arrow.FixedSizeListType: - case *arrow.BinaryType, *arrow.LargeBinaryType, *arrow.FixedSizeBinaryType: - case arrow.ExtensionType: - case *arrow.NullType: - default: - panic(fmt.Errorf("arrow/csv: field %d (%s) has invalid data type %T", i, f.Name, ft)) + if !typeSupported(f.Type) { + panic(fmt.Errorf("arrow/csv: field %d (%s) has invalid data type %T", i, f.Name, f.Type)) } } } + +func typeSupported(dt arrow.DataType) bool { + switch dt := dt.(type) { + case *arrow.BooleanType: + case *arrow.Int8Type, *arrow.Int16Type, *arrow.Int32Type, *arrow.Int64Type: + case *arrow.Uint8Type, *arrow.Uint16Type, *arrow.Uint32Type, *arrow.Uint64Type: + case *arrow.Float16Type, *arrow.Float32Type, *arrow.Float64Type: + case *arrow.StringType, *arrow.LargeStringType: + case *arrow.TimestampType: + case *arrow.Date32Type, *arrow.Date64Type: + case *arrow.Decimal128Type, *arrow.Decimal256Type: + case *arrow.MapType: + return false + case arrow.ListLikeType: + return typeSupported(dt.Elem()) + case *arrow.BinaryType, *arrow.LargeBinaryType, *arrow.FixedSizeBinaryType: + case arrow.ExtensionType: + case *arrow.NullType: + default: + return false + } + return true +} diff --git a/go/arrow/csv/reader.go b/go/arrow/csv/reader.go index 18f1083e6a9dc..46591a9a5adee 100644 --- a/go/arrow/csv/reader.go +++ b/go/arrow/csv/reader.go @@ -474,6 +474,10 @@ func (r *Reader) initFieldConverter(bldr array.Builder) func(string) { return func(str string) { r.parseDate32(bldr, str) } + case *arrow.Date64Type: + return func(str string) { + r.parseDate64(bldr, str) + } case *arrow.Time32Type: return func(str string) { r.parseTime32(bldr, str, dt.Unit) @@ -486,17 +490,13 @@ func (r *Reader) initFieldConverter(bldr array.Builder) func(string) { return func(str string) { r.parseDecimal256(bldr, str, dt.Precision, dt.Scale) } - case *arrow.ListType: - return func(s string) { - r.parseList(bldr, s) - } - case *arrow.LargeListType: + case *arrow.FixedSizeListType: return func(s string) { - r.parseLargeList(bldr, s) + r.parseFixedSizeList(bldr.(*array.FixedSizeListBuilder), s, int(dt.Len())) } - case *arrow.FixedSizeListType: + case arrow.ListLikeType: return func(s string) { - r.parseFixedSizeList(bldr, s, int(dt.Len())) + r.parseListLike(bldr.(array.ListLikeBuilder), s) } case *arrow.BinaryType: return func(s 
string) { @@ -740,81 +740,67 @@ func (r *Reader) parseDate32(field array.Builder, str string) { field.(*array.Date32Builder).Append(arrow.Date32FromTime(tm)) } -func (r *Reader) parseTime32(field array.Builder, str string, unit arrow.TimeUnit) { +func (r *Reader) parseDate64(field array.Builder, str string) { if r.isNull(str) { field.AppendNull() return } - val, err := arrow.Time32FromString(str, unit) + tm, err := time.Parse("2006-01-02", str) if err != nil && r.err == nil { r.err = err field.AppendNull() return } - field.(*array.Time32Builder).Append(val) + field.(*array.Date64Builder).Append(arrow.Date64FromTime(tm)) } -func (r *Reader) parseDecimal128(field array.Builder, str string, prec, scale int32) { +func (r *Reader) parseTime32(field array.Builder, str string, unit arrow.TimeUnit) { if r.isNull(str) { field.AppendNull() return } - val, err := decimal128.FromString(str, prec, scale) + val, err := arrow.Time32FromString(str, unit) if err != nil && r.err == nil { r.err = err field.AppendNull() return } - field.(*array.Decimal128Builder).Append(val) + field.(*array.Time32Builder).Append(val) } -func (r *Reader) parseDecimal256(field array.Builder, str string, prec, scale int32) { +func (r *Reader) parseDecimal128(field array.Builder, str string, prec, scale int32) { if r.isNull(str) { field.AppendNull() return } - val, err := decimal256.FromString(str, prec, scale) + val, err := decimal128.FromString(str, prec, scale) if err != nil && r.err == nil { r.err = err field.AppendNull() return } - field.(*array.Decimal256Builder).Append(val) + field.(*array.Decimal128Builder).Append(val) } -func (r *Reader) parseList(field array.Builder, str string) { +func (r *Reader) parseDecimal256(field array.Builder, str string, prec, scale int32) { if r.isNull(str) { field.AppendNull() return } - if !(strings.HasPrefix(str, "{") && strings.HasSuffix(str, "}")) { - r.err = errors.New("invalid list format. 
should start with '{' and end with '}'") - return - } - str = strings.Trim(str, "{}") - listBldr := field.(*array.ListBuilder) - listBldr.Append(true) - if len(str) == 0 { - // we don't want to create the csv reader if we already know the - // string is empty - return - } - valueBldr := listBldr.ValueBuilder() - reader := csv.NewReader(strings.NewReader(str)) - items, err := reader.Read() - if err != nil { + + val, err := decimal256.FromString(str, prec, scale) + if err != nil && r.err == nil { r.err = err + field.AppendNull() return } - for _, str := range items { - r.initFieldConverter(valueBldr)(str) - } + field.(*array.Decimal256Builder).Append(val) } -func (r *Reader) parseLargeList(field array.Builder, str string) { +func (r *Reader) parseListLike(field array.ListLikeBuilder, str string) { if r.isNull(str) { field.AppendNull() return @@ -824,14 +810,13 @@ func (r *Reader) parseLargeList(field array.Builder, str string) { return } str = strings.Trim(str, "{}") - largeListBldr := field.(*array.LargeListBuilder) - largeListBldr.Append(true) + field.Append(true) if len(str) == 0 { // we don't want to create the csv reader if we already know the // string is empty return } - valueBldr := largeListBldr.ValueBuilder() + valueBldr := field.ValueBuilder() reader := csv.NewReader(strings.NewReader(str)) items, err := reader.Read() if err != nil { @@ -843,7 +828,7 @@ func (r *Reader) parseLargeList(field array.Builder, str string) { } } -func (r *Reader) parseFixedSizeList(field array.Builder, str string, n int) { +func (r *Reader) parseFixedSizeList(field *array.FixedSizeListBuilder, str string, n int) { if r.isNull(str) { field.AppendNull() return @@ -853,14 +838,13 @@ func (r *Reader) parseFixedSizeList(field array.Builder, str string, n int) { return } str = strings.Trim(str, "{}") - fixedSizeListBldr := field.(*array.FixedSizeListBuilder) - fixedSizeListBldr.Append(true) + field.Append(true) if len(str) == 0 { // we don't want to create the csv reader if we already know the // string is empty return } - valueBldr := fixedSizeListBldr.ValueBuilder() + valueBldr := field.ValueBuilder() reader := csv.NewReader(strings.NewReader(str)) items, err := reader.Read() if err != nil { diff --git a/go/arrow/csv/reader_test.go b/go/arrow/csv/reader_test.go index b6654dd1984ea..65453db015a7e 100644 --- a/go/arrow/csv/reader_test.go +++ b/go/arrow/csv/reader_test.go @@ -357,6 +357,8 @@ func testCSVReader(t *testing.T, filepath string, withHeader bool, stringsCanBeN {Name: "large_binary", Type: arrow.BinaryTypes.LargeBinary}, {Name: "fixed_size_binary", Type: &arrow.FixedSizeBinaryType{ByteWidth: 3}}, {Name: "uuid", Type: types.NewUUIDType()}, + {Name: "date32", Type: arrow.PrimitiveTypes.Date32}, + {Name: "date64", Type: arrow.PrimitiveTypes.Date64}, }, nil, ) @@ -420,6 +422,8 @@ rec[0]["binary"]: ["\x00\x01\x02"] rec[0]["large_binary"]: ["\x00\x01\x02"] rec[0]["fixed_size_binary"]: ["\x00\x01\x02"] rec[0]["uuid"]: ["00000000-0000-0000-0000-000000000001"] +rec[0]["date32"]: [19121] +rec[0]["date64"]: [1652054400000] rec[1]["bool"]: [false] rec[1]["i8"]: [-2] rec[1]["i16"]: [-2] @@ -442,6 +446,8 @@ rec[1]["binary"]: [(null)] rec[1]["large_binary"]: [(null)] rec[1]["fixed_size_binary"]: [(null)] rec[1]["uuid"]: ["00000000-0000-0000-0000-000000000002"] +rec[1]["date32"]: [19121] +rec[1]["date64"]: [1652054400000] rec[2]["bool"]: [(null)] rec[2]["i8"]: [(null)] rec[2]["i16"]: [(null)] @@ -464,6 +470,8 @@ rec[2]["binary"]: [(null)] rec[2]["large_binary"]: [(null)] rec[2]["fixed_size_binary"]: [(null)] 
rec[2]["uuid"]: [(null)] +rec[2]["date32"]: [(null)] +rec[2]["date64"]: [(null)] `, str1Value, str1Value, str2Value, str2Value) got, want := out.String(), want require.Equal(t, want, got) diff --git a/go/arrow/csv/testdata/header.csv b/go/arrow/csv/testdata/header.csv index 50be4f5e4daca..68ae18a499dee 100644 --- a/go/arrow/csv/testdata/header.csv +++ b/go/arrow/csv/testdata/header.csv @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # -bool;i8;i16;i32;i64;u8;u16;u32;u64;f16;f32;f64;str;large_str;ts;list(i64);large_list(i64);fixed_size_list(i64);binary;large_binary;fixed_size_binary;uuid -true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;1.1;str-1;str-1;2022-05-09T00:01:01;{1,2,3};{1,2,3};{1,2,3};AAEC;AAEC;AAEC;00000000-0000-0000-0000-000000000001 -false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;2.2;;;2022-05-09T23:59:59;{};{};{4,5,6};;;;00000000-0000-0000-0000-000000000002 -null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null \ No newline at end of file +bool;i8;i16;i32;i64;u8;u16;u32;u64;f16;f32;f64;str;large_str;ts;list(i64);large_list(i64);fixed_size_list(i64);binary;large_binary;fixed_size_binary;uuid;date32;date64 +true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;1.1;str-1;str-1;2022-05-09T00:01:01;{1,2,3};{1,2,3};{1,2,3};AAEC;AAEC;AAEC;00000000-0000-0000-0000-000000000001;2022-05-09;2022-05-09 +false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;2.2;;;2022-05-09T23:59:59;{};{};{4,5,6};;;;00000000-0000-0000-0000-000000000002;2022-05-09;2022-05-09 +null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null \ No newline at end of file diff --git a/go/arrow/csv/testdata/types.csv b/go/arrow/csv/testdata/types.csv index d32941f4b214d..91c0cf3b252b3 100644 --- a/go/arrow/csv/testdata/types.csv +++ b/go/arrow/csv/testdata/types.csv @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
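As a quick sanity check on the `date32`/`date64` expectations added to the test output above (`19121` and `1652054400000` for the input `2022-05-09`): Arrow's Date32 stores whole days since the UNIX epoch and Date64 stores milliseconds since the UNIX epoch. The patch itself is Go, but the arithmetic is language-independent; this small Python sketch (editorial illustration, not part of the patch) reproduces the expected values:

```python
from datetime import date

epoch = date(1970, 1, 1)
d = date(2022, 5, 9)

days = (d - epoch).days        # Arrow Date32: days since the UNIX epoch
millis = days * 86_400_000     # Arrow Date64: milliseconds since the UNIX epoch

print(days)    # 19121
print(millis)  # 1652054400000
```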
# -## supported types: bool;int8;int16;int32;int64;uint8;uint16;uint32;uint64;float16;float32;float64;string;large_string;timestamp;list(i64);large_list(i64);fixed_size_list(i64);binary;large_binary;fixed_size_binary;uuid -true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;1.1;str-1;str-1;2022-05-09T00:01:01;{1,2,3};{1,2,3};{1,2,3};AAEC;AAEC;AAEC;00000000-0000-0000-0000-000000000001 -false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;2.2;;;2022-05-09T23:59:59;{};{};{4,5,6};;;;00000000-0000-0000-0000-000000000002 -null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null \ No newline at end of file +## supported types: bool;int8;int16;int32;int64;uint8;uint16;uint32;uint64;float16;float32;float64;string;large_string;timestamp;list(i64);large_list(i64);fixed_size_list(i64);binary;large_binary;fixed_size_binary;uuid;date32;date64 +true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;1.1;str-1;str-1;2022-05-09T00:01:01;{1,2,3};{1,2,3};{1,2,3};AAEC;AAEC;AAEC;00000000-0000-0000-0000-000000000001;2022-05-09;2022-05-09 +false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;2.2;;;2022-05-09T23:59:59;{};{};{4,5,6};;;;00000000-0000-0000-0000-000000000002;2022-05-09;2022-05-09 +null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null \ No newline at end of file diff --git a/go/arrow/csv/transformer.go b/go/arrow/csv/transformer.go index 90c26ac981078..237437c0441e1 100644 --- a/go/arrow/csv/transformer.go +++ b/go/arrow/csv/transformer.go @@ -29,7 +29,7 @@ import ( "github.com/apache/arrow/go/v17/arrow/array" ) -func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array, stringsReplacer func(string)string) []string { +func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array, stringsReplacer func(string) string) []string { res := make([]string, col.Len()) switch typ.(type) { case *arrow.BooleanType: @@ -215,62 +215,25 @@ func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array, st res[i] = w.nullValue } } - case *arrow.ListType: - arr := col.(*array.List) - listVals, offsets := arr.ListValues(), arr.Offsets() - for i := 0; i < arr.Len(); i++ { - if arr.IsValid(i) { - list := array.NewSlice(listVals, int64(offsets[i]), int64(offsets[i+1])) - var b bytes.Buffer - b.Write([]byte{'{'}) - writer := csv.NewWriter(&b) - writer.Write(w.transformColToStringArr(list.DataType(), list, stringsReplacer)) - writer.Flush() - b.Truncate(b.Len() - 1) - b.Write([]byte{'}'}) - res[i] = b.String() - list.Release() - } else { - res[i] = w.nullValue - } - } - case *arrow.LargeListType: - arr := col.(*array.LargeList) - listVals, offsets := arr.ListValues(), arr.Offsets() - for i := 0; i < arr.Len(); i++ { - if arr.IsValid(i) { - list := array.NewSlice(listVals, int64(offsets[i]), int64(offsets[i+1])) - var b bytes.Buffer - b.Write([]byte{'{'}) - writer := csv.NewWriter(&b) - writer.Write(w.transformColToStringArr(list.DataType(), list, stringsReplacer)) - writer.Flush() - b.Truncate(b.Len() - 1) - b.Write([]byte{'}'}) - res[i] = b.String() - list.Release() - } else { - res[i] = w.nullValue - } - } - case *arrow.FixedSizeListType: - arr := col.(*array.FixedSizeList) + case arrow.ListLikeType: + arr := col.(array.ListLike) listVals := arr.ListValues() for i := 0; i < arr.Len(); i++ { - if arr.IsValid(i) { - list := array.NewSlice(listVals, int64((arr.Len()-1)*i), int64((arr.Len()-1)*(i+1))) - var b bytes.Buffer - b.Write([]byte{'{'}) - writer := csv.NewWriter(&b) - writer.Write(w.transformColToStringArr(list.DataType(), list, 
stringsReplacer)) - writer.Flush() - b.Truncate(b.Len() - 1) - b.Write([]byte{'}'}) - res[i] = b.String() - list.Release() - } else { + if arr.IsNull(i) { res[i] = w.nullValue + continue } + start, end := arr.ValueOffsets(i) + list := array.NewSlice(listVals, start, end) + var b bytes.Buffer + b.Write([]byte{'{'}) + writer := csv.NewWriter(&b) + writer.Write(w.transformColToStringArr(list.DataType(), list, stringsReplacer)) + writer.Flush() + b.Truncate(b.Len() - 1) + b.Write([]byte{'}'}) + res[i] = b.String() + list.Release() } case *arrow.BinaryType: arr := col.(*array.Binary) From 318d22adda3b66bd4a10fddc7789c8a13e4aa540 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Wed, 8 May 2024 22:05:23 +0200 Subject: [PATCH 060/105] MINOR: [Dev] Remove Dane from collaborators list (#41589) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Dane is a new committer. ### What changes are included in this PR? Remove the collaborator list. ### Are these changes tested? Not required. ### Are there any user-facing changes? No Authored-by: Raúl Cumplido Signed-off-by: Dane Pitkin --- .asf.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.asf.yaml b/.asf.yaml index 1eb019fea9af1..a1c6434587703 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -21,7 +21,6 @@ github: collaborators: - anjakefala - benibus - - danepitkin - davisusanibar - jbonofre - js8544 From 46e78160933d039991cedfabb9216dc4c861fb4b Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 9 May 2024 06:29:46 +0900 Subject: [PATCH 061/105] GH-41430: [Docs] Use sphinxcontrib-mermaid instead of generating images from .mmd (#41455) ### Rationale for this change This is for easy to maintain. ### What changes are included in this PR? * Install sphinxcontrib-mermaid * Install Chromium to generate SVG from .mmd * Use Debian instead of Ubuntu for building docs because Ubuntu provides Chromium only via snap * Use a normal user not root to build documents because Mermaid require additional `--no-sandbox` argument when we use root ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
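For context on the Sphinx side of this change: the pre-rendered `.mmd.svg` figures are removed and the `.mmd` sources are now rendered at doc-build time via `sphinxcontrib-mermaid`. A condensed sketch of the `docs/source/conf.py` configuration this relies on (the full hunks appear further down in this patch) looks like:

```python
# docs/source/conf.py (excerpt, as added by this patch)
extensions = [
    # ... existing Sphinx extensions ...
    'sphinxcontrib.mermaid',
]

# Render Mermaid diagrams to SVG at build time; this is what requires
# Chromium (and the --no-sandbox caveat for root) in the docs image.
mermaid_output_format = 'svg'
```

In the `.rst` sources, `.. figure:: ./Flight/DoGet.mmd.svg` blocks then become `.. mermaid:: ./Flight/DoGet.mmd` with a `:caption:` option, as shown in the `Flight.rst` and `FlightSql.rst` hunks below.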
* GitHub Issue: #41430 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .github/workflows/docs.yml | 13 ++-- .github/workflows/docs_light.yml | 2 +- ci/conda_env_sphinx.txt | 1 + ci/docker/linux-apt-docs.dockerfile | 60 ++++++++++++------- ci/scripts/cpp_build.sh | 13 ++-- ci/scripts/integration_arrow.sh | 2 + ci/scripts/java_build.sh | 13 +++- ci/scripts/java_cdata_integration.sh | 4 +- ci/scripts/js_build.sh | 19 ++++-- ci/scripts/js_test.sh | 3 +- ci/scripts/python_build.sh | 33 ++++++++-- ci/scripts/r_build.sh | 20 ++++++- dev/archery/archery/docker/core.py | 4 ++ .../archery/integration/tester_java.py | 16 +++-- dev/archery/archery/integration/tester_js.py | 8 ++- dev/tasks/tasks.yml | 12 ++-- docker-compose.yml | 39 +++++++----- docs/requirements.txt | 3 +- docs/source/conf.py | 8 ++- docs/source/format/Flight.rst | 20 +++---- docs/source/format/Flight/DoExchange.mmd | 3 - docs/source/format/Flight/DoExchange.mmd.svg | 1 - docs/source/format/Flight/DoGet.mmd | 3 - docs/source/format/Flight/DoGet.mmd.svg | 1 - docs/source/format/Flight/DoPut.mmd | 3 - docs/source/format/Flight/DoPut.mmd.svg | 1 - docs/source/format/Flight/PollFlightInfo.mmd | 3 - .../format/Flight/PollFlightInfo.mmd.svg | 1 - docs/source/format/FlightSql.rst | 20 +++---- .../format/FlightSql/CommandGetTables.mmd | 3 - .../format/FlightSql/CommandGetTables.mmd.svg | 1 - .../CommandPreparedStatementQuery.mmd | 3 - .../CommandPreparedStatementQuery.mmd.svg | 1 - .../FlightSql/CommandStatementIngest.mmd | 3 - .../FlightSql/CommandStatementIngest.mmd.svg | 1 - .../FlightSql/CommandStatementQuery.mmd | 3 - .../FlightSql/CommandStatementQuery.mmd.svg | 1 - 37 files changed, 210 insertions(+), 135 deletions(-) delete mode 100644 docs/source/format/Flight/DoExchange.mmd.svg delete mode 100644 docs/source/format/Flight/DoGet.mmd.svg delete mode 100644 docs/source/format/Flight/DoPut.mmd.svg delete mode 100644 docs/source/format/Flight/PollFlightInfo.mmd.svg delete mode 100644 docs/source/format/FlightSql/CommandGetTables.mmd.svg delete mode 100644 docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd.svg delete mode 100644 docs/source/format/FlightSql/CommandStatementIngest.mmd.svg delete mode 100644 docs/source/format/FlightSql/CommandStatementQuery.mmd.svg diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index fe49e275d908d..36a0dc014db8d 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -32,12 +32,12 @@ env: jobs: complete: - name: AMD64 Ubuntu 22.04 Complete Documentation + name: AMD64 Debian 12 Complete Documentation runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 150 env: - UBUNTU: "22.04" + JDK: 17 steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 @@ -50,8 +50,8 @@ jobs: uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0 with: path: .docker - key: ubuntu-docs-${{ hashFiles('cpp/**') }} - restore-keys: ubuntu-docs- + key: debian-docs-${{ hashFiles('cpp/**') }} + restore-keys: debian-docs- - name: Setup Python uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: @@ -62,7 +62,8 @@ jobs: env: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - run: archery docker run ubuntu-docs + JDK: 17 + run: archery docker run debian-docs - name: Docker Push if: >- success() && @@ -73,4 +74,4 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} 
ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} continue-on-error: true - run: archery docker push ubuntu-docs + run: archery docker push debian-docs diff --git a/.github/workflows/docs_light.yml b/.github/workflows/docs_light.yml index 376c87651d2d0..947e2ac21b83c 100644 --- a/.github/workflows/docs_light.yml +++ b/.github/workflows/docs_light.yml @@ -31,7 +31,7 @@ on: permissions: contents: read - + env: ARCHERY_DEBUG: 1 ARCHERY_USE_DOCKER_CLI: 1 diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index 83afa69a653a9..4665a32e24bbe 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -28,6 +28,7 @@ sphinx-design sphinx-copybutton sphinx-lint sphinxcontrib-jquery +sphinxcontrib-mermaid sphinx==6.2 # Requirement for doctest-cython # Needs upper pin of 0.3.0, see: diff --git a/ci/docker/linux-apt-docs.dockerfile b/ci/docker/linux-apt-docs.dockerfile index ec424b4e6eaa0..1c916840e071b 100644 --- a/ci/docker/linux-apt-docs.dockerfile +++ b/ci/docker/linux-apt-docs.dockerfile @@ -21,18 +21,34 @@ FROM ${base} ARG r=4.4 ARG jdk=8 -# See R install instructions at https://cloud.r-project.org/bin/linux/ubuntu/ +ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium + +# See R install instructions at https://cloud.r-project.org/bin/linux/ RUN apt-get update -y && \ apt-get install -y \ - dirmngr \ apt-transport-https \ - software-properties-common && \ - wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | \ - tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc && \ - add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu '$(lsb_release -cs)'-cran40/' && \ + dirmngr \ + gpg \ + lsb-release && \ + gpg --keyserver keyserver.ubuntu.com \ + --recv-key 95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7 && \ + gpg --export 95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7 | \ + gpg --no-default-keyring \ + --keyring /usr/share/keyrings/cran.gpg \ + --import - && \ + echo "deb [signed-by=/usr/share/keyrings/cran.gpg] https://cloud.r-project.org/bin/linux/$(lsb_release -is | tr 'A-Z' 'a-z') $(lsb_release -cs)-cran40/" | \ + tee /etc/apt/sources.list.d/cran.list && \ + if [ -f /etc/apt/sources.list.d/debian.sources ]; then \ + sed -i \ + -e 's/main$/main contrib non-free non-free-firmware/g' \ + /etc/apt/sources.list.d/debian.sources; \ + fi && \ + apt-get update -y && \ apt-get install -y --no-install-recommends \ autoconf-archive \ automake \ + chromium \ + chromium-sandbox \ curl \ doxygen \ gi-docgen \ @@ -48,6 +64,8 @@ RUN apt-get update -y && \ libxml2-dev \ meson \ ninja-build \ + nodejs \ + npm \ nvidia-cuda-toolkit \ openjdk-${jdk}-jdk-headless \ pandoc \ @@ -55,9 +73,12 @@ RUN apt-get update -y && \ r-base=${r}* \ rsync \ ruby-dev \ + sudo \ wget && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* + rm -rf /var/lib/apt/lists/* && \ + PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \ + npm install -g yarn @mermaid-js/mermaid-cli ENV JAVA_HOME=/usr/lib/jvm/java-${jdk}-openjdk-amd64 @@ -68,20 +89,6 @@ RUN /arrow/ci/scripts/util_download_apache.sh \ ENV PATH=/opt/apache-maven-${maven}/bin:$PATH RUN mvn -version -ARG node=16 -RUN apt-get purge -y npm && \ - apt-get autoremove -y --purge && \ - wget -q -O - https://deb.nodesource.com/setup_${node}.x | bash - && \ - apt-get install -y nodejs && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - npm install -g yarn - -COPY docs/requirements.txt /arrow/docs/ -RUN python3 -m venv ${ARROW_PYTHON_VENV} && \ - . 
${ARROW_PYTHON_VENV}/bin/activate && \ - pip install -r arrow/docs/requirements.txt - COPY c_glib/Gemfile /arrow/c_glib/ RUN gem install --no-document bundler && \ bundle install --gemfile /arrow/c_glib/Gemfile @@ -98,6 +105,17 @@ COPY r/DESCRIPTION /arrow/r/ RUN /arrow/ci/scripts/r_deps.sh /arrow && \ R -e "install.packages('pkgdown')" +RUN useradd --user-group --create-home --groups audio,video arrow +RUN echo "arrow ALL=(ALL:ALL) NOPASSWD:ALL" | \ + EDITOR=tee visudo -f /etc/sudoers.d/arrow +USER arrow + +COPY docs/requirements.txt /arrow/docs/ +RUN sudo chown -R arrow: ${ARROW_PYTHON_VENV} && \ + python3 -m venv ${ARROW_PYTHON_VENV} && \ + . ${ARROW_PYTHON_VENV}/bin/activate && \ + pip install -r arrow/docs/requirements.txt + ENV ARROW_ACERO=ON \ ARROW_AZURE=OFF \ ARROW_BUILD_STATIC=OFF \ diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index e28ceae8801f0..ceeab2455bef6 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -229,12 +229,17 @@ find . -name "*.o" -delete popd if [ -x "$(command -v ldconfig)" ]; then - ldconfig ${ARROW_HOME}/${CMAKE_INSTALL_LIBDIR:-lib} + if [ -x "$(command -v sudo)" ]; then + SUDO=sudo + else + SUDO= + fi + ${SUDO} ldconfig ${ARROW_HOME}/${CMAKE_INSTALL_LIBDIR:-lib} fi if [ "${ARROW_USE_CCACHE}" == "ON" ]; then - echo -e "===\n=== ccache statistics after build\n===" - ccache -sv 2>/dev/null || ccache -s + echo -e "===\n=== ccache statistics after build\n===" + ccache -sv 2>/dev/null || ccache -s fi if command -v sccache &> /dev/null; then @@ -244,6 +249,6 @@ fi if [ "${BUILD_DOCS_CPP}" == "ON" ]; then pushd ${source_dir}/apidoc - doxygen + OUTPUT_DIRECTORY=${build_dir}/apidoc doxygen popd fi diff --git a/ci/scripts/integration_arrow.sh b/ci/scripts/integration_arrow.sh index a5a012ad2c5c4..2eb58e8dc75ec 100755 --- a/ci/scripts/integration_arrow.sh +++ b/ci/scripts/integration_arrow.sh @@ -40,6 +40,8 @@ if [ "${ARROW_INTEGRATION_JAVA}" == "ON" ]; then pip install jpype1 fi +export ARROW_BUILD_ROOT=${build_dir} + # Get more detailed context on crashes export PYTHONFAULTHANDLER=1 diff --git a/ci/scripts/java_build.sh b/ci/scripts/java_build.sh index 2103f0329baec..0fa1edab429c0 100755 --- a/ci/scripts/java_build.sh +++ b/ci/scripts/java_build.sh @@ -75,7 +75,16 @@ fi # Use `2 * ncores` threads mvn="${mvn} -T 2C" -pushd ${source_dir} +# https://github.com/apache/arrow/issues/41429 +# TODO: We want to out-of-source build. This is a workaround. We copy +# all needed files to the build directory from the source directory +# and build in the build directory. 
+mkdir -p ${build_dir} +rm -rf ${build_dir}/format +cp -aL ${arrow_dir}/format ${build_dir}/ +rm -rf ${build_dir}/java +cp -aL ${source_dir} ${build_dir}/ +pushd ${build_dir}/java if [ "${ARROW_JAVA_SHADE_FLATBUFFERS}" == "ON" ]; then mvn="${mvn} -Pshade-flatbuffers" @@ -95,7 +104,7 @@ if [ "${BUILD_DOCS_JAVA}" == "ON" ]; then # HTTP pooling is turned of to avoid download issues https://issues.apache.org/jira/browse/ARROW-11633 mkdir -p ${build_dir}/docs/java/reference ${mvn} -Dcheckstyle.skip=true -Dhttp.keepAlive=false -Dmaven.wagon.http.pool=false clean install site - rsync -a ${arrow_dir}/java/target/site/apidocs/ ${build_dir}/docs/java/reference + rsync -a target/site/apidocs/ ${build_dir}/docs/java/reference fi popd diff --git a/ci/scripts/java_cdata_integration.sh b/ci/scripts/java_cdata_integration.sh index 86ea7cf155350..0ee5d3026aa09 100755 --- a/ci/scripts/java_cdata_integration.sh +++ b/ci/scripts/java_cdata_integration.sh @@ -20,9 +20,9 @@ set -ex arrow_dir=${1} -export ARROW_SOURCE_DIR=${arrow_dir} +build_dir=${2} -pushd ${arrow_dir}/java/c/src/test/python +pushd ${build_dir}/java/c/src/test/python python integration_tests.py diff --git a/ci/scripts/js_build.sh b/ci/scripts/js_build.sh index d61f74f0b7ca1..196539ee0f101 100755 --- a/ci/scripts/js_build.sh +++ b/ci/scripts/js_build.sh @@ -25,7 +25,16 @@ build_dir=${2} : ${BUILD_DOCS_JS:=OFF} -pushd ${source_dir} +# https://github.com/apache/arrow/issues/41429 +# TODO: We want to out-of-source build. This is a workaround. We copy +# all needed files to the build directory from the source directory +# and build in the build directory. +rm -rf ${build_dir}/js +mkdir -p ${build_dir} +cp -aL ${arrow_dir}/LICENSE.txt ${build_dir}/ +cp -aL ${arrow_dir}/NOTICE.txt ${build_dir}/ +cp -aL ${source_dir} ${build_dir}/js +pushd ${build_dir}/js yarn --immutable yarn lint:ci @@ -34,18 +43,18 @@ yarn build if [ "${BUILD_DOCS_JS}" == "ON" ]; then # If apache or upstream are defined use those as remote. # Otherwise use origin which could be a fork on PRs. - if [ "$(git config --get remote.apache.url)" == "git@github.com:apache/arrow.git" ]; then + if [ "$(git -C ${arrow_dir} config --get remote.apache.url)" == "git@github.com:apache/arrow.git" ]; then yarn doc --gitRemote apache - elif [[ "$(git config --get remote.upstream.url)" =~ "https://github.com/apache/arrow" ]]; then + elif [[ "$(git -C ${arrow_dir}config --get remote.upstream.url)" =~ "https://github.com/apache/arrow" ]]; then yarn doc --gitRemote upstream - elif [[ "$(basename -s .git $(git config --get remote.origin.url))" == "arrow" ]]; then + elif [[ "$(basename -s .git $(git -C ${arrow_dir} config --get remote.origin.url))" == "arrow" ]]; then yarn doc else echo "Failed to build docs because the remote is not set correctly. Please set the origin or upstream remote to https://github.com/apache/arrow.git or the apache remote to git@github.com:apache/arrow.git." 
exit 0 fi mkdir -p ${build_dir}/docs/js - rsync -a ${arrow_dir}/js/doc/ ${build_dir}/docs/js + rsync -a doc/ ${build_dir}/docs/js fi popd diff --git a/ci/scripts/js_test.sh b/ci/scripts/js_test.sh index 40de974ede161..863b1c3d34613 100755 --- a/ci/scripts/js_test.sh +++ b/ci/scripts/js_test.sh @@ -20,8 +20,9 @@ set -ex source_dir=${1}/js +build_dir=${2}/js -pushd ${source_dir} +pushd ${build_dir} yarn lint yarn test diff --git a/ci/scripts/python_build.sh b/ci/scripts/python_build.sh index 99153cdf75539..9455baf353633 100755 --- a/ci/scripts/python_build.sh +++ b/ci/scripts/python_build.sh @@ -78,17 +78,42 @@ export PYARROW_PARALLEL=${n_jobs} export CMAKE_PREFIX_PATH export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} -pushd ${source_dir} +# https://github.com/apache/arrow/issues/41429 +# TODO: We want to out-of-source build. This is a workaround. We copy +# all needed files to the build directory from the source directory +# and build in the build directory. +rm -rf ${python_build_dir} +cp -aL ${source_dir} ${python_build_dir} +pushd ${python_build_dir} # - Cannot call setup.py as it may install in the wrong directory # on Debian/Ubuntu (ARROW-15243). # - Cannot use build isolation as we want to use specific dependency versions # (e.g. Numpy, Pandas) on some CI jobs. ${PYTHON:-python} -m pip install --no-deps --no-build-isolation -vv . -# Remove build artifacts from source directory -find build/ -user root -delete popd if [ "${BUILD_DOCS_PYTHON}" == "ON" ]; then + # https://github.com/apache/arrow/issues/41429 + # TODO: We want to out-of-source build. This is a workaround. + # + # Copy docs/source because the "autosummary_generate = True" + # configuration generates files to docs/source/python/generated/. + rm -rf ${python_build_dir}/docs/source + mkdir -p ${python_build_dir}/docs + cp -a ${arrow_dir}/docs/source ${python_build_dir}/docs/ + rm -rf ${python_build_dir}/format + cp -a ${arrow_dir}/format ${python_build_dir}/ + rm -rf ${python_build_dir}/cpp/examples + mkdir -p ${python_build_dir}/cpp + cp -a ${arrow_dir}/cpp/examples ${python_build_dir}/cpp/ + rm -rf ${python_build_dir}/ci + cp -a ${arrow_dir}/ci/ ${python_build_dir}/ ncpus=$(python -c "import os; print(os.cpu_count())") - sphinx-build -b html -j ${ncpus} ${arrow_dir}/docs/source ${build_dir}/docs + export ARROW_CPP_DOXYGEN_XML=${build_dir}/cpp/apidoc/xml + pushd ${build_dir} + sphinx-build \ + -b html \ + ${python_build_dir}/docs/source \ + ${build_dir}/docs + popd fi diff --git a/ci/scripts/r_build.sh b/ci/scripts/r_build.sh index 38b54e4434036..f4dc5a5781c6e 100755 --- a/ci/scripts/r_build.sh +++ b/ci/scripts/r_build.sh @@ -24,15 +24,29 @@ build_dir=${2} : ${BUILD_DOCS_R:=OFF} -pushd ${source_dir} +# https://github.com/apache/arrow/issues/41429 +# TODO: We want to out-of-source build. This is a workaround. We copy +# all needed files to the build directory from the source directory +# and build in the build directory. +rm -rf ${build_dir}/r +cp -aL ${source_dir} ${build_dir}/r +pushd ${build_dir}/r # build first so that any stray compiled files in r/src are ignored ${R_BIN} CMD build . 
-${R_BIN} CMD INSTALL ${INSTALL_ARGS} arrow*.tar.gz +if [ -x "$(command -v sudo)" ]; then + SUDO=sudo +else + SUDO= +fi +${SUDO} \ + env \ + PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig:${PKG_CONFIG_PATH} \ + ${R_BIN} CMD INSTALL ${INSTALL_ARGS} arrow*.tar.gz if [ "${BUILD_DOCS_R}" == "ON" ]; then ${R_BIN} -e "pkgdown::build_site(install = FALSE)" - rsync -a ${source_dir}/docs/ ${build_dir}/docs/r + rsync -a docs/ ${build_dir}/docs/r fi popd diff --git a/dev/archery/archery/docker/core.py b/dev/archery/archery/docker/core.py index 7376bb0a3b72d..cb831060022a4 100644 --- a/dev/archery/archery/docker/core.py +++ b/dev/archery/archery/docker/core.py @@ -371,6 +371,10 @@ def run(self, service_name, command=None, *, env=None, volumes=None, v = "{}:{}".format(v['source'], v['target']) args.extend(['-v', v]) + # append capabilities from the compose conf + for c in service.get('cap_add', []): + args.extend([f'--cap-add={c}']) + # infer whether an interactive shell is desired or not if command in ['cmd.exe', 'bash', 'sh', 'powershell']: args.append('-it') diff --git a/dev/archery/archery/integration/tester_java.py b/dev/archery/archery/integration/tester_java.py index 8e7a0bb99f9de..ccc807410a848 100644 --- a/dev/archery/archery/integration/tester_java.py +++ b/dev/archery/archery/integration/tester_java.py @@ -18,17 +18,23 @@ import contextlib import functools import os +from pathlib import Path import subprocess from . import cdata from .tester import Tester, CDataExporter, CDataImporter from .util import run_cmd, log -from ..utils.source import ARROW_ROOT_DEFAULT + + +ARROW_BUILD_ROOT = os.environ.get( + 'ARROW_BUILD_ROOT', + Path(__file__).resolve().parents[5] +) def load_version_from_pom(): import xml.etree.ElementTree as ET - tree = ET.parse(os.path.join(ARROW_ROOT_DEFAULT, 'java', 'pom.xml')) + tree = ET.parse(os.path.join(ARROW_BUILD_ROOT, 'java', 'pom.xml')) tag_pattern = '{http://maven.apache.org/POM/4.0.0}version' version_tag = list(tree.getroot().findall(tag_pattern))[0] return version_tag.text @@ -48,7 +54,7 @@ def load_version_from_pom(): _ARROW_TOOLS_JAR = os.environ.get( "ARROW_JAVA_INTEGRATION_JAR", os.path.join( - ARROW_ROOT_DEFAULT, + ARROW_BUILD_ROOT, "java/tools/target", f"arrow-tools-{_arrow_version}-jar-with-dependencies.jar" ) @@ -56,7 +62,7 @@ def load_version_from_pom(): _ARROW_C_DATA_JAR = os.environ.get( "ARROW_C_DATA_JAVA_INTEGRATION_JAR", os.path.join( - ARROW_ROOT_DEFAULT, + ARROW_BUILD_ROOT, "java/c/target", f"arrow-c-data-{_arrow_version}.jar" ) @@ -64,7 +70,7 @@ def load_version_from_pom(): _ARROW_FLIGHT_JAR = os.environ.get( "ARROW_FLIGHT_JAVA_INTEGRATION_JAR", os.path.join( - ARROW_ROOT_DEFAULT, + ARROW_BUILD_ROOT, "java/flight/flight-integration-tests/target", f"flight-integration-tests-{_arrow_version}-jar-with-dependencies.jar" ) diff --git a/dev/archery/archery/integration/tester_js.py b/dev/archery/archery/integration/tester_js.py index c7f363ba54687..3d1a229931cde 100644 --- a/dev/archery/archery/integration/tester_js.py +++ b/dev/archery/archery/integration/tester_js.py @@ -16,13 +16,17 @@ # under the License. 
import os +from pathlib import Path from .tester import Tester from .util import run_cmd, log -from ..utils.source import ARROW_ROOT_DEFAULT -ARROW_JS_ROOT = os.path.join(ARROW_ROOT_DEFAULT, 'js') +ARROW_BUILD_ROOT = os.environ.get( + 'ARROW_BUILD_ROOT', + Path(__file__).resolve().parents[5] +) +ARROW_JS_ROOT = os.path.join(ARROW_BUILD_ROOT, 'js') _EXE_PATH = os.path.join(ARROW_JS_ROOT, 'bin') _VALIDATE = os.path.join(_EXE_PATH, 'integration.ts') _JSON_TO_ARROW = os.path.join(_EXE_PATH, 'json-to-arrow.ts') diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 126b0fcb6f76a..146fa52fa958b 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -65,7 +65,7 @@ groups: - r-binary-packages - ubuntu-* - wheel-* - - test-ubuntu-*-docs + - test-debian-*-docs {############################# Testing tasks #################################} @@ -1458,15 +1458,15 @@ tasks: {% endfor %} # be sure to update binary-task.rb when upgrading ubuntu - test-ubuntu-22.04-docs: + test-debian-12-docs: ci: github template: docs/github.linux.yml params: env: - UBUNTU: 22.04 + JDK: 17 pr_number: Unset flags: "-v $PWD/build/:/build/" - image: ubuntu-docs + image: debian-docs publish: false artifacts: - docs.tar.gz @@ -1594,8 +1594,8 @@ tasks: template: docs/github.linux.yml params: env: - UBUNTU: 22.04 + JDK: 17 pr_number: Unset flags: "-v $PWD/build/:/build/" - image: ubuntu-docs + image: debian-docs publish: true diff --git a/docker-compose.yml b/docker-compose.yml index d771fc2d22a35..9bedb59a77be8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -131,7 +131,8 @@ x-hierarchy: - debian-cpp: - debian-c-glib: - debian-ruby - - debian-python + - debian-python: + - debian-docs - debian-go: - debian-go-cgo - debian-go-cgo-python @@ -145,8 +146,7 @@ x-hierarchy: - ubuntu-c-glib: - ubuntu-ruby - ubuntu-lint - - ubuntu-python: - - ubuntu-docs + - ubuntu-python - ubuntu-python-sdist-test - ubuntu-r - ubuntu-r-only-r @@ -1228,6 +1228,8 @@ services: # We should extend the list of enabled rules after adding this build to # the CI pipeline. image: ${REPO}:${ARCH}-conda-python-${PYTHON}-pandas-${PANDAS} + cap_add: + - SYS_ADMIN environment: <<: [*common, *ccache] ARROW_SUBSTRAIT: "ON" @@ -1378,7 +1380,7 @@ services: /arrow/ci/scripts/python_build.sh /arrow /build && /arrow/ci/scripts/java_jni_build.sh /arrow $${ARROW_HOME} /build /tmp/dist/java/ && /arrow/ci/scripts/java_build.sh /arrow /build /tmp/dist/java && - /arrow/ci/scripts/java_cdata_integration.sh /arrow /tmp/dist/java" ] + /arrow/ci/scripts/java_cdata_integration.sh /arrow /build" ] conda-python-cython2: # Usage: @@ -1680,7 +1682,7 @@ services: command: &js-command > /bin/bash -c " /arrow/ci/scripts/js_build.sh /arrow /build && - /arrow/ci/scripts/js_test.sh /arrow" + /arrow/ci/scripts/js_test.sh /arrow /build" #################################### C# ##################################### @@ -1759,29 +1761,34 @@ services: ################################ Docs ####################################### - ubuntu-docs: + debian-docs: # Usage: - # docker-compose build ubuntu-cpp - # docker-compose build ubuntu-python - # docker-compose build ubuntu-docs - # docker-compose run --rm ubuntu-docs - image: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-docs + # docker-compose build debian-cpp + # docker-compose build debian-python + # docker-compose build debian-docs + # docker-compose run --rm debian-docs + image: ${REPO}:${ARCH}-debian-${DEBIAN}-docs build: context: . 
dockerfile: ci/docker/linux-apt-docs.dockerfile cache_from: - - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-docs + - ${REPO}:${ARCH}-debian-${DEBIAN}-docs args: r: ${R} jdk: ${JDK} maven: ${MAVEN} node: ${NODE} - base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-python-3 + base: ${REPO}:${ARCH}-debian-${DEBIAN}-python-3 + # This is for Chromium used by Mermaid. Chromium uses namespace + # isolation for security by default. + cap_add: + - SYS_ADMIN environment: <<: [*common, *ccache] ARROW_CUDA: "ON" ARROW_CXX_FLAGS_DEBUG: "-g1" ARROW_C_FLAGS_DEBUG: "-g1" + ARROW_HOME: "/tmp/local" ARROW_JAVA_SKIP_GIT_PLUGIN: ARROW_SUBSTRAIT: "ON" BUILD_DOCS_C_GLIB: "ON" @@ -1790,9 +1797,11 @@ services: BUILD_DOCS_JS: "ON" BUILD_DOCS_PYTHON: "ON" BUILD_DOCS_R: "ON" - volumes: *ubuntu-volumes - command: &docs-command > + volumes: *debian-volumes + command: > /bin/bash -c " + sudo mkdir -p /build /ccache && + sudo chown -R `id --user --name`: /build /ccache && /arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && /arrow/ci/scripts/c_glib_build.sh /arrow /build && diff --git a/docs/requirements.txt b/docs/requirements.txt index 8891680814dff..afb252e17457b 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -8,8 +8,9 @@ myst-parser[linkify] numpydoc pydata-sphinx-theme~=0.14 sphinx-autobuild -sphinx-design sphinx-copybutton +sphinx-design sphinx-lint +sphinxcontrib-mermaid sphinx==6.2 pandas diff --git a/docs/source/conf.py b/docs/source/conf.py index 05340dc923c89..b487200555a09 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -125,6 +125,7 @@ 'sphinx.ext.intersphinx', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', + 'sphinxcontrib.mermaid', ] # Show members for classes in .. autosummary @@ -137,7 +138,9 @@ } # Breathe configuration -breathe_projects = {"arrow_cpp": "../../cpp/apidoc/xml"} +breathe_projects = { + "arrow_cpp": os.environ.get("ARROW_CPP_DOXYGEN_XML", "../../cpp/apidoc/xml"), +} breathe_default_project = "arrow_cpp" # Overridden conditionally below @@ -584,6 +587,9 @@ # # texinfo_no_detailmenu = False +# -- Options for mermaid output ------------------------------------------- + +mermaid_output_format = 'svg' def setup(app): # Use a config value to indicate whether CUDA API docs can be generated. diff --git a/docs/source/format/Flight.rst b/docs/source/format/Flight.rst index 7ee84952b4350..c65a1f70bde7f 100644 --- a/docs/source/format/Flight.rst +++ b/docs/source/format/Flight.rst @@ -68,9 +68,8 @@ Downloading Data A client that wishes to download the data would: -.. figure:: ./Flight/DoGet.mmd.svg - - Retrieving data via ``DoGet``. +.. mermaid:: ./Flight/DoGet.mmd + :caption: Retrieving data via ``DoGet``. #. Construct or acquire a ``FlightDescriptor`` for the data set they are interested in. @@ -168,9 +167,8 @@ data. However, ``GetFlightInfo`` doesn't return until the query completes, so the client is blocked. In this situation, the client can use ``PollFlightInfo`` instead of ``GetFlightInfo``: -.. figure:: ./Flight/PollFlightInfo.mmd.svg - - Polling a long-running query by ``PollFlightInfo``. +.. mermaid:: ./Flight/PollFlightInfo.mmd + :caption: Polling a long-running query by ``PollFlightInfo``. #. Construct or acquire a ``FlightDescriptor``, as before. #. Call ``PollFlightInfo(FlightDescriptor)`` to get a ``PollInfo`` @@ -229,9 +227,8 @@ Uploading Data To upload data, a client would: -.. figure:: ./Flight/DoPut.mmd.svg - - Uploading data via ``DoPut``. +.. mermaid:: ./Flight/DoPut.mmd + :caption: Uploading data via ``DoPut``. #. 
Construct or acquire a ``FlightDescriptor``, as before. #. Call ``DoPut(FlightData)`` and upload a stream of Arrow record @@ -257,9 +254,8 @@ require being stateful if implemented using ``DoGet`` and ``DoPut``. Instead, ``DoExchange`` allows this to be implemented as a single call. A client would: -.. figure:: ./Flight/DoExchange.mmd.svg - - Complex data flow with ``DoExchange``. +.. mermaid:: ./Flight/DoExchange.mmd + :caption: Complex data flow with ``DoExchange``. #. Construct or acquire a ``FlightDescriptor``, as before. #. Call ``DoExchange(FlightData)``. diff --git a/docs/source/format/Flight/DoExchange.mmd b/docs/source/format/Flight/DoExchange.mmd index 14f1789aeaaa7..f7586bf35eb4f 100644 --- a/docs/source/format/Flight/DoExchange.mmd +++ b/docs/source/format/Flight/DoExchange.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandGetTables.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/Flight/DoExchange.mmd.svg b/docs/source/format/Flight/DoExchange.mmd.svg deleted file mode 100644 index 204d63d77218d..0000000000000 --- a/docs/source/format/Flight/DoExchange.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientServerThe first FlightData includes a FlightDescriptorDoExchange(FlightData)1stream of FlightData2stream of FlightData3par[[Client sends data]][[Server sends data]]ClientServer \ No newline at end of file diff --git a/docs/source/format/Flight/DoGet.mmd b/docs/source/format/Flight/DoGet.mmd index c2e3cd034448c..cac59afb8219f 100644 --- a/docs/source/format/Flight/DoGet.mmd +++ b/docs/source/format/Flight/DoGet.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandGetTables.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/Flight/DoGet.mmd.svg b/docs/source/format/Flight/DoGet.mmd.svg deleted file mode 100644 index 48a50d77ed33f..0000000000000 --- a/docs/source/format/Flight/DoGet.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientMetadata ServerData ServerGetFlightInfo(FlightDescriptor)1FlightInfo{endpoints: [FlightEndpoint{ticket: Ticket}, …]}2This may be parallelizedDoGet(Ticket)3stream of FlightData4loop[for each endpoint in FlightInfo.endpoints]ClientMetadata ServerData Server \ No newline at end of file diff --git a/docs/source/format/Flight/DoPut.mmd b/docs/source/format/Flight/DoPut.mmd index 5845edef1f466..876505da2d300 100644 --- a/docs/source/format/Flight/DoPut.mmd +++ b/docs/source/format/Flight/DoPut.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. 
-%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandGetTables.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/Flight/DoPut.mmd.svg b/docs/source/format/Flight/DoPut.mmd.svg deleted file mode 100644 index 9e490e152bdb3..0000000000000 --- a/docs/source/format/Flight/DoPut.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientServerThe first FlightData includes a FlightDescriptorDoPut(FlightData)1stream of FlightData2PutResult{app_metadata}3ClientServer \ No newline at end of file diff --git a/docs/source/format/Flight/PollFlightInfo.mmd b/docs/source/format/Flight/PollFlightInfo.mmd index d062a3a216958..f91c077b655c0 100644 --- a/docs/source/format/Flight/PollFlightInfo.mmd +++ b/docs/source/format/Flight/PollFlightInfo.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd):/data minlag/mermaid-cli -i /data/PollFlightInfo.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/Flight/PollFlightInfo.mmd.svg b/docs/source/format/Flight/PollFlightInfo.mmd.svg deleted file mode 100644 index 1890361f88ce4..0000000000000 --- a/docs/source/format/Flight/PollFlightInfo.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientMetadata ServerData ServerThis may be parallelizedSome endpoints may be processed while pollingloop[for each endpoint in FlightInfo.endpoints]PollFlightInfo(FlightDescriptor)1PollInfo{descriptor: FlightDescriptor', ...}2PollFlightInfo(FlightDescriptor')3PollInfo{descriptor: FlightDescriptor'', ...}4PollFlightInfo(FlightDescriptor'')5PollInfo{descriptor: null, info: FlightInfo{endpoints: [FlightEndpoint{ticket: Ticket}, …]}6DoGet(Ticket)7stream of FlightData8ClientMetadata ServerData Server \ No newline at end of file diff --git a/docs/source/format/FlightSql.rst b/docs/source/format/FlightSql.rst index 1a43e4bdff306..181efce286e70 100644 --- a/docs/source/format/FlightSql.rst +++ b/docs/source/format/FlightSql.rst @@ -242,21 +242,17 @@ Close and invalidate the current session context. Sequence Diagrams ================= -.. figure:: ./FlightSql/CommandGetTables.mmd.svg +.. mermaid:: ./FlightSql/CommandGetTables.mmd + :caption: Listing available tables. - Listing available tables. +.. mermaid:: ./FlightSql/CommandStatementQuery.mmd + :caption: Executing an ad-hoc query. -.. figure:: ./FlightSql/CommandStatementQuery.mmd.svg +.. mermaid:: ./FlightSql/CommandPreparedStatementQuery.mmd + :caption: Creating a prepared statement, then executing it. - Executing an ad-hoc query. - -.. figure:: ./FlightSql/CommandPreparedStatementQuery.mmd.svg - - Creating a prepared statement, then executing it. - -.. figure:: ./FlightSql/CommandStatementIngest.mmd.svg - - Executing a bulk ingestion. +.. mermaid:: ./FlightSql/CommandStatementIngest.mmd + :caption: Executing a bulk ingestion. External Resources ================== diff --git a/docs/source/format/FlightSql/CommandGetTables.mmd b/docs/source/format/FlightSql/CommandGetTables.mmd index f151411647f23..e6b18ed7dc08b 100644 --- a/docs/source/format/FlightSql/CommandGetTables.mmd +++ b/docs/source/format/FlightSql/CommandGetTables.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. 
-%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandGetTables.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/FlightSql/CommandGetTables.mmd.svg b/docs/source/format/FlightSql/CommandGetTables.mmd.svg deleted file mode 100644 index 4e71c01982289..0000000000000 --- a/docs/source/format/FlightSql/CommandGetTables.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientServerGetFlightInfo(CommandGetTables)1FlightInfo{…Ticket…}2DoGet(Ticket)3stream of FlightData4ClientServer \ No newline at end of file diff --git a/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd b/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd index cbd1eb6014bca..ce18b91eaa33e 100644 --- a/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd +++ b/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandPreparedStatementQuery.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd.svg b/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd.svg deleted file mode 100644 index cbf6a78e9a5ce..0000000000000 --- a/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ServerClientServerClientoptional response with updated handleloop[for each endpoint in FlightInfo.endpoints]loop[for each invocation of the prepared statement]DoAction(ActionCreatePreparedStatementRequest)1ActionCreatePreparedStatementResult{handle}2DoPut(CommandPreparedStatementQuery)3stream of FlightData4DoPutPreparedStatementResult{handle}5GetFlightInfo(CommandPreparedStatementQuery)6FlightInfo{endpoints: [FlightEndpoint{…}, …]}7DoGet(endpoint.ticket)8stream of FlightData9DoAction(ActionClosePreparedStatementRequest)10ActionClosePreparedStatementRequest{}11 \ No newline at end of file diff --git a/docs/source/format/FlightSql/CommandStatementIngest.mmd b/docs/source/format/FlightSql/CommandStatementIngest.mmd index 781289d77b41a..0578f465d4dda 100644 --- a/docs/source/format/FlightSql/CommandStatementIngest.mmd +++ b/docs/source/format/FlightSql/CommandStatementIngest.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandGetTables.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/FlightSql/CommandStatementIngest.mmd.svg b/docs/source/format/FlightSql/CommandStatementIngest.mmd.svg deleted file mode 100644 index e2aa72459afa5..0000000000000 --- a/docs/source/format/FlightSql/CommandStatementIngest.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ServerClientServerClientDoPut(CommandStatementIngest)1stream of FlightData2PutResult{DoPutUpdateResult{RecordCount: int64}}3 \ No newline at end of file diff --git a/docs/source/format/FlightSql/CommandStatementQuery.mmd b/docs/source/format/FlightSql/CommandStatementQuery.mmd index 7b67fecfb75c6..f26aa2f951fcf 100644 --- a/docs/source/format/FlightSql/CommandStatementQuery.mmd +++ b/docs/source/format/FlightSql/CommandStatementQuery.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. 
-%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandStatementQuery.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/FlightSql/CommandStatementQuery.mmd.svg b/docs/source/format/FlightSql/CommandStatementQuery.mmd.svg deleted file mode 100644 index f5e8c79f137ff..0000000000000 --- a/docs/source/format/FlightSql/CommandStatementQuery.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientServerGetFlightInfo(CommandStatementQuery)1FlightInfo{endpoints: [FlightEndpoint{…}, …]}2DoGet(endpoint.ticket)3stream of FlightData4loop[for each endpoint in FlightInfo.endpoints]ClientServer \ No newline at end of file From 071ffaf2633eb58540a872514507ab362cc26fb4 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 9 May 2024 00:22:02 +0200 Subject: [PATCH 062/105] GH-41256: [Format][Docs] Add a canonical extension type specification for JSON (#41257) ### Rationale for this change As per #41256 this proposes a specification of a canonical extension type for JSON serialized data. ### What changes are included in this PR? This adds to documentation of canonical extension types. ### Are these changes tested? No as only docs are changed. ### Are there any user-facing changes? No. * GitHub Issue: #41256 Lead-authored-by: Rok Mihevc Co-authored-by: Will Jones Co-authored-by: Antoine Pitrou Signed-off-by: Rok Mihevc --- docs/source/format/CanonicalExtensions.rst | 25 +++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 1f055b7f8edb5..47c161c14cafc 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -51,7 +51,7 @@ types: 3) Its serialization *must* be described in the proposal and should not require unduly implementation work or unusual software dependencies - (for example, a trivial custom text format or JSON would be acceptable). + (for example, a trivial custom text format or a JSON-based format would be acceptable). 4) Its expected semantics *should* be described as well and any potential ambiguities or pain points addressed or at least mentioned. @@ -251,6 +251,27 @@ Variable shape tensor Values inside each **data** tensor element are stored in row-major/C-contiguous order according to the corresponding **shape**. +.. _json_extension: + +JSON +==== + +* Extension name: ``arrow.json``. + +* The storage type of this extension is ``String`` or + or ``LargeString`` or ``StringView``. + Only UTF-8 encoded JSON as specified in `rfc8259`_ is supported. + +* Extension type parameters: + + This type does not have any parameters. + +* Description of the serialization: + + Metadata is either an empty string or a JSON string with an empty object. + In the future, additional fields may be added, but they are not required + to interpret the array. + ========================= Community Extension Types ========================= @@ -268,3 +289,5 @@ GeoArrow Arrow extension types for representing vector geometries. It is well known within the Arrow geospatial subcommunity. The GeoArrow specification is not yet finalized. + +.. 
_rfc8259: https://datatracker.ietf.org/doc/html/rfc8259 From c5be02703312f01186ceea2d910a93e5421e3c83 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 9 May 2024 00:46:29 +0200 Subject: [PATCH 063/105] GH-41298: [Format][Docs] Add a canonical extension type specification for UUID (#41299) ### Rationale for this change Several users have expressed a need for a UUID type. This is to provide a canonical UUID extension type specification. ### What changes are included in this PR? This adds to documentation of canonical extension types. ### Are these changes tested? No as only docs are changed. ### Are there any user-facing changes? No. * GitHub Issue: #41298 Lead-authored-by: Rok Mihevc Co-authored-by: Antoine Pitrou Signed-off-by: Rok Mihevc --- docs/source/format/CanonicalExtensions.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 47c161c14cafc..c60f095dd354d 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -272,6 +272,17 @@ JSON In the future, additional fields may be added, but they are not required to interpret the array. +UUID +==== + +* Extension name: ``arrow.uuid``. + +* The storage type of the extension is ``FixedSizeBinary`` with a length of 16 bytes. + +.. note:: + A specific UUID version is not required or guaranteed. This extension represents + UUIDs as FixedSizeBinary(16) with big-endian notation and does not interpret the bytes in any way. + ========================= Community Extension Types ========================= From 7bfe02db04e34fc1ab6df6f647a76899e0c654db Mon Sep 17 00:00:00 2001 From: David Schlosnagle Date: Wed, 8 May 2024 19:46:15 -0400 Subject: [PATCH 064/105] GH-41573: [Java] VectorSchemaRoot uses inefficient stream to copy fieldVectors (#41574) ### Rationale for this change While reviewing allocation profiling of an Arrow intensive application, I noticed significant allocations due to `ArrayList#grow()` originating from `org.apache.arrow.vector.VectorSchemaRoot#getFieldVectors()`. The `org.apache.arrow.vector.VectorSchemaRoot#getFieldVectors()` method uses an inefficient `fieldVectors.stream().collect(Collectors.toList())` to create a list copy, leading to reallocations as the target list is collected. This could be replaced with a more efficent `new ArrayList<>(fieldVectors)` to make a pre-sized list copy, or even better an unmodifiable view via `Collections.unmodifiableList(fieldVectors)`. ### What changes are included in this PR? * Use `Collections.unmodifiableList(List)` to return unmodifiable list view of `fieldVectors` from `getFieldVectors()` * Pre-size the `fieldVectors` `ArrayList` in static factory `VectorSchemaRoot#create(Schema, BufferAllocator)` * `VectorSchemaRoot#setRowCount(int)` iterates over instance `fieldVectors` instead of copied list (similar to existing `allocateNew()`, `clear()`, `contentToTSVString()`). ### Are these changes tested? These changes are covered by existing unit and integration tests. ### Are there any user-facing changes? 
No * GitHub Issue: #41573 Authored-by: David Schlosnagle Signed-off-by: David Li --- .../java/org/apache/arrow/vector/VectorSchemaRoot.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java index 8768a90c80b83..9a92ce5060b1b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java @@ -19,6 +19,7 @@ import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -121,7 +122,7 @@ public VectorSchemaRoot(Schema schema, List fieldVectors, int rowCo * Creates a new set of empty vectors corresponding to the given schema. */ public static VectorSchemaRoot create(Schema schema, BufferAllocator allocator) { - List fieldVectors = new ArrayList<>(); + List fieldVectors = new ArrayList<>(schema.getFields().size()); for (Field field : schema.getFields()) { FieldVector vector = field.createVector(allocator); fieldVectors.add(vector); @@ -160,7 +161,7 @@ public void clear() { } public List getFieldVectors() { - return fieldVectors.stream().collect(Collectors.toList()); + return Collections.unmodifiableList(fieldVectors); } /** @@ -236,7 +237,7 @@ public int getRowCount() { */ public void setRowCount(int rowCount) { this.rowCount = rowCount; - for (FieldVector v : getFieldVectors()) { + for (FieldVector v : fieldVectors) { v.setValueCount(rowCount); } } From f6127a6d18af12ce18a0b8b1eac02346721cc399 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 9 May 2024 04:58:59 +0200 Subject: [PATCH 065/105] GH-41356: [Release][Docs] Update post release documentation task to remove the warnings banner for stable version (#41377) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change With every release dev documentation is moved to `docs/` and becomes stable version of the documentation but the version warnings banner is still present. ### What changes are included in this PR? This PR removes the banner before the dev docs are copied to the `docs/` folder. ### Are these changes tested? Not yet. ### Are there any user-facing changes? No. 
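Circling back to the canonical extension types specified a few patches above (`arrow.json` and `arrow.uuid`): as a concrete illustration of how the `arrow.uuid` spec (FixedSizeBinary(16) storage, no parameters, bytes not interpreted) maps onto an implementation, here is a minimal, hypothetical sketch using pyarrow's extension-type API. This is editorial illustration only, not part of these patches and not an official implementation:

```python
import pyarrow as pa


class UuidType(pa.ExtensionType):
    """Illustrative only: an extension type matching the arrow.uuid spec."""

    def __init__(self):
        # Storage is FixedSizeBinary(16); the bytes are stored as-is.
        super().__init__(pa.binary(16), "arrow.uuid")

    def __arrow_ext_serialize__(self):
        # The spec defines no parameters, so there is nothing to serialize.
        return b""

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        return cls()


# Registering the type lets it round-trip through IPC by extension name.
pa.register_extension_type(UuidType())
```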
* GitHub Issue: #41356 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Raúl Cumplido Signed-off-by: Sutou Kouhei --- dev/release/post-08-docs.sh | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/dev/release/post-08-docs.sh b/dev/release/post-08-docs.sh index c59f9b96857a6..58a462551f199 100755 --- a/dev/release/post-08-docs.sh +++ b/dev/release/post-08-docs.sh @@ -72,13 +72,28 @@ fi # delete current stable docs and restore all previous versioned docs rm -rf docs/* git checkout "${versioned_paths[@]}" +# Download and untar released docs in a temp folder +rm -rf docs_new +mkdir docs_new +pushd docs_new curl \ --fail \ --location \ --remote-name \ https://apache.jfrog.io/artifactory/arrow/docs/${version}/docs.tar.gz tar xvf docs.tar.gz -rm -f docs.tar.gz +# Update DOCUMENTATION_OPTIONS.show_version_warning_banner +find docs \ + -type f \ + -exec \ + sed -i.bak \ + -e "s/DOCUMENTATION_OPTIONS.show_version_warning_banner = true/DOCUMENTATION_OPTIONS.show_version_warning_banner = false/g" \ + {} \; +find ./ -name '*.bak' -delete +popd +mv docs_new/docs/* docs/ +rm -rf docs_new + if [ "$is_major_release" = "yes" ] ; then previous_series=${previous_version%.*} mv docs_temp docs/${previous_series} From bd444106af494b3d4c6cce0af88f6ce2a6a327eb Mon Sep 17 00:00:00 2001 From: Tom McTiernan Date: Thu, 9 May 2024 20:15:43 +0100 Subject: [PATCH 066/105] GH-39645: [Python] Fix read_table for encrypted parquet (#39438) ### Rationale for this change Currently, if you try to read a decrypted parquet with read_table, passing decryption_properties - in the happy path (pyarrow.data available for import) the reading/decryption of the file fails, as the decryption properties are missing. ### What changes are included in this PR? Pass through the argument that was intended to have been passed. ### Are these changes tested? We have tested this locally on an encrypted parquet dataset - please advise on any further testing you would like beyond that and the standard CI. ### Are there any user-facing changes? Not in any cases where their code was previously working? The intended behaviour for encrypted dataset decryption should start working. 
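To make the fix concrete, here is a hedged sketch of the call pattern this patch is meant to support. The key names, 16-byte keys, and `InMemoryKmsClient` are placeholders modeled on the test fixtures further down (any `pyarrow.parquet.encryption.KmsClient` implementation would do); the file is assumed to have been written with matching encryption properties:

```python
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import pyarrow.parquet.encryption as pe

# Placeholder master keys and KMS client factory, mirroring the test helpers.
FOOTER_KEY = b"0123456789012345"
COL_KEY = b"1234567890123456"
kms_connection_config = pe.KmsConnectionConfig(
    custom_kms_conf={
        "footer_key": FOOTER_KEY.decode("UTF-8"),
        "col_key": COL_KEY.decode("UTF-8"),
    }
)
crypto_factory = pe.CryptoFactory(lambda conf: InMemoryKmsClient(conf))
decryption_properties = crypto_factory.file_decryption_properties(
    kms_connection_config)

# 1) The code path this patch fixes: read_table() now forwards the properties.
table = pq.read_table("encrypted.parquet",
                      decryption_properties=decryption_properties)

# 2) The dataset route, via the new ParquetFragmentScanOptions argument.
parquet_format = ds.ParquetFileFormat(
    default_fragment_scan_options=ds.ParquetFragmentScanOptions(
        decryption_properties=decryption_properties))
table = ds.dataset("encrypted.parquet", format=parquet_format).to_table()
```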
* Closes: #39645 Lead-authored-by: Tom McTiernan Co-authored-by: Don Co-authored-by: Rok Mihevc Signed-off-by: Rok Mihevc --- python/pyarrow/_dataset_parquet.pxd | 1 + python/pyarrow/_dataset_parquet.pyx | 30 ++- .../pyarrow/_dataset_parquet_encryption.pyx | 8 + python/pyarrow/parquet/core.py | 5 +- .../pyarrow/tests/parquet/test_encryption.py | 180 +++++++++--------- .../pyarrow/tests/test_dataset_encryption.py | 12 ++ 6 files changed, 142 insertions(+), 94 deletions(-) diff --git a/python/pyarrow/_dataset_parquet.pxd b/python/pyarrow/_dataset_parquet.pxd index d5bc172d324d5..0a3a2ff526ea4 100644 --- a/python/pyarrow/_dataset_parquet.pxd +++ b/python/pyarrow/_dataset_parquet.pxd @@ -29,6 +29,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): cdef: CParquetFragmentScanOptions* parquet_options object _parquet_decryption_config + object _decryption_properties cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp) cdef CReaderProperties* reader_properties(self) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index a55e889ba8246..4942336a12666 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -56,7 +56,7 @@ from pyarrow._parquet cimport ( try: from pyarrow._dataset_parquet_encryption import ( - set_encryption_config, set_decryption_config + set_encryption_config, set_decryption_config, set_decryption_properties ) parquet_encryption_enabled = True except ImportError: @@ -127,8 +127,7 @@ cdef class ParquetFileFormat(FileFormat): 'instance of ParquetReadOptions') if default_fragment_scan_options is None: - default_fragment_scan_options = ParquetFragmentScanOptions( - **scan_args) + default_fragment_scan_options = ParquetFragmentScanOptions(**scan_args) elif isinstance(default_fragment_scan_options, dict): default_fragment_scan_options = ParquetFragmentScanOptions( **default_fragment_scan_options) @@ -715,6 +714,9 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): decryption_config : pyarrow.dataset.ParquetDecryptionConfig, default None If not None, use the provided ParquetDecryptionConfig to decrypt the Parquet file. + decryption_properties : pyarrow.parquet.FileDecryptionProperties, default None + If not None, use the provided FileDecryptionProperties to decrypt encrypted + Parquet file. page_checksum_verification : bool, default False If True, verify the page checksum for each page read from the file. 
""" @@ -729,6 +731,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): thrift_string_size_limit=None, thrift_container_size_limit=None, decryption_config=None, + decryption_properties=None, bint page_checksum_verification=False): self.init(shared_ptr[CFragmentScanOptions]( new CParquetFragmentScanOptions())) @@ -743,6 +746,8 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): self.thrift_container_size_limit = thrift_container_size_limit if decryption_config is not None: self.parquet_decryption_config = decryption_config + if decryption_properties is not None: + self.decryption_properties = decryption_properties self.page_checksum_verification = page_checksum_verification cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp): @@ -812,6 +817,25 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): raise ValueError("size must be larger than zero") self.reader_properties().set_thrift_container_size_limit(size) + @property + def decryption_properties(self): + if not parquet_encryption_enabled: + raise NotImplementedError( + "Unable to access encryption features. " + "Encryption is not enabled in your installation of pyarrow." + ) + return self._decryption_properties + + @decryption_properties.setter + def decryption_properties(self, config): + if not parquet_encryption_enabled: + raise NotImplementedError( + "Encryption is not enabled in your installation of pyarrow, but " + "decryption_properties were provided." + ) + set_decryption_properties(self, config) + self._decryption_properties = config + @property def parquet_decryption_config(self): if not parquet_encryption_enabled: diff --git a/python/pyarrow/_dataset_parquet_encryption.pyx b/python/pyarrow/_dataset_parquet_encryption.pyx index 11a7174eb3c9d..c8f5e5b01bf81 100644 --- a/python/pyarrow/_dataset_parquet_encryption.pyx +++ b/python/pyarrow/_dataset_parquet_encryption.pyx @@ -162,6 +162,14 @@ def set_encryption_config( opts.parquet_options.parquet_encryption_config = c_config +def set_decryption_properties( + ParquetFragmentScanOptions opts not None, + FileDecryptionProperties config not None +): + cdef CReaderProperties* reader_props = opts.reader_properties() + reader_props.file_decryption_properties(config.unwrap()) + + def set_decryption_config( ParquetFragmentScanOptions opts not None, ParquetDecryptionConfig config not None diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 69a1c9d19aae2..f54a203c8794c 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -1299,7 +1299,7 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None, f"local file systems, not {type(filesystem)}" ) - # check for single fragment dataset + # check for single fragment dataset or dataset directory single_file = None self._base_dir = None if not isinstance(path_or_paths, list): @@ -1313,8 +1313,6 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None, except ValueError: filesystem = LocalFileSystem(use_mmap=memory_map) finfo = filesystem.get_file_info(path_or_paths) - if finfo.is_file: - single_file = path_or_paths if finfo.type == FileType.Directory: self._base_dir = path_or_paths else: @@ -1771,6 +1769,7 @@ def read_table(source, *, columns=None, use_threads=True, ignore_prefixes=ignore_prefixes, pre_buffer=pre_buffer, coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, + decryption_properties=decryption_properties, thrift_string_size_limit=thrift_string_size_limit, 
thrift_container_size_limit=thrift_container_size_limit, page_checksum_verification=page_checksum_verification, diff --git a/python/pyarrow/tests/parquet/test_encryption.py b/python/pyarrow/tests/parquet/test_encryption.py index edb6410d2fa0d..ff388ef506997 100644 --- a/python/pyarrow/tests/parquet/test_encryption.py +++ b/python/pyarrow/tests/parquet/test_encryption.py @@ -65,6 +65,44 @@ def basic_encryption_config(): return basic_encryption_config +def setup_encryption_environment(custom_kms_conf): + """ + Sets up and returns the KMS connection configuration and crypto factory + based on provided KMS configuration parameters. + """ + kms_connection_config = pe.KmsConnectionConfig(custom_kms_conf=custom_kms_conf) + + def kms_factory(kms_connection_configuration): + return InMemoryKmsClient(kms_connection_configuration) + + # Create our CryptoFactory + crypto_factory = pe.CryptoFactory(kms_factory) + + return kms_connection_config, crypto_factory + + +def write_encrypted_file(path, data_table, footer_key_name, col_key_name, + footer_key, col_key, encryption_config): + """ + Writes an encrypted parquet file based on the provided parameters. + """ + # Setup the custom KMS configuration with provided keys + custom_kms_conf = { + footer_key_name: footer_key.decode("UTF-8"), + col_key_name: col_key.decode("UTF-8"), + } + + # Setup encryption environment + kms_connection_config, crypto_factory = setup_encryption_environment( + custom_kms_conf) + + # Write the encrypted parquet file + write_encrypted_parquet(path, data_table, encryption_config, + kms_connection_config, crypto_factory) + + return kms_connection_config, crypto_factory + + def test_encrypted_parquet_write_read(tempdir, data_table): """Write an encrypted parquet, verify it's encrypted, and then read it.""" path = tempdir / PARQUET_NAME @@ -81,20 +119,10 @@ def test_encrypted_parquet_write_read(tempdir, data_table): cache_lifetime=timedelta(minutes=5.0), data_key_length_bits=256) - kms_connection_config = pe.KmsConnectionConfig( - custom_kms_conf={ - FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"), - COL_KEY_NAME: COL_KEY.decode("UTF-8"), - } - ) - - def kms_factory(kms_connection_configuration): - return InMemoryKmsClient(kms_connection_configuration) + kms_connection_config, crypto_factory = write_encrypted_file( + path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY, + encryption_config) - crypto_factory = pe.CryptoFactory(kms_factory) - # Write with encryption properties - write_encrypted_parquet(path, data_table, encryption_config, - kms_connection_config, crypto_factory) verify_file_encrypted(path) # Read with decryption properties @@ -150,36 +178,22 @@ def test_encrypted_parquet_write_read_wrong_key(tempdir, data_table): cache_lifetime=timedelta(minutes=5.0), data_key_length_bits=256) - kms_connection_config = pe.KmsConnectionConfig( - custom_kms_conf={ - FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"), - COL_KEY_NAME: COL_KEY.decode("UTF-8"), - } - ) - - def kms_factory(kms_connection_configuration): - return InMemoryKmsClient(kms_connection_configuration) + write_encrypted_file(path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, + FOOTER_KEY, COL_KEY, encryption_config) - crypto_factory = pe.CryptoFactory(kms_factory) - # Write with encryption properties - write_encrypted_parquet(path, data_table, encryption_config, - kms_connection_config, crypto_factory) verify_file_encrypted(path) - # Read with decryption properties - wrong_kms_connection_config = pe.KmsConnectionConfig( - custom_kms_conf={ - # Wrong keys - mixup in 
names - FOOTER_KEY_NAME: COL_KEY.decode("UTF-8"), - COL_KEY_NAME: FOOTER_KEY.decode("UTF-8"), - } - ) + wrong_kms_connection_config, wrong_crypto_factory = setup_encryption_environment({ + FOOTER_KEY_NAME: COL_KEY.decode("UTF-8"), # Intentionally wrong + COL_KEY_NAME: FOOTER_KEY.decode("UTF-8"), # Intentionally wrong + }) + decryption_config = pe.DecryptionConfiguration( cache_lifetime=timedelta(minutes=5.0)) with pytest.raises(ValueError, match=r"Incorrect master key used"): read_encrypted_parquet( path, decryption_config, wrong_kms_connection_config, - crypto_factory) + wrong_crypto_factory) def test_encrypted_parquet_read_no_decryption_config(tempdir, data_table): @@ -219,23 +233,12 @@ def test_encrypted_parquet_write_no_col_key(tempdir, data_table): encryption_config = pe.EncryptionConfiguration( footer_key=FOOTER_KEY_NAME) - kms_connection_config = pe.KmsConnectionConfig( - custom_kms_conf={ - FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"), - COL_KEY_NAME: COL_KEY.decode("UTF-8"), - } - ) - - def kms_factory(kms_connection_configuration): - return InMemoryKmsClient(kms_connection_configuration) - - crypto_factory = pe.CryptoFactory(kms_factory) with pytest.raises(OSError, match="Either column_keys or uniform_encryption " "must be set"): # Write with encryption properties - write_encrypted_parquet(path, data_table, encryption_config, - kms_connection_config, crypto_factory) + write_encrypted_file(path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, + FOOTER_KEY, b"", encryption_config) def test_encrypted_parquet_write_kms_error(tempdir, data_table, @@ -497,24 +500,11 @@ def test_encrypted_parquet_loop(tempdir, data_table, basic_encryption_config): # Encrypt the footer with the footer key, # encrypt column `a` and column `b` with another key, - # keep `c` plaintext - encryption_config = basic_encryption_config + # keep `c` plaintext, defined in basic_encryption_config + kms_connection_config, crypto_factory = write_encrypted_file( + path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY, + basic_encryption_config) - kms_connection_config = pe.KmsConnectionConfig( - custom_kms_conf={ - FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"), - COL_KEY_NAME: COL_KEY.decode("UTF-8"), - } - ) - - def kms_factory(kms_connection_configuration): - return InMemoryKmsClient(kms_connection_configuration) - - crypto_factory = pe.CryptoFactory(kms_factory) - - # Write with encryption properties - write_encrypted_parquet(path, data_table, encryption_config, - kms_connection_config, crypto_factory) verify_file_encrypted(path) decryption_config = pe.DecryptionConfiguration( @@ -537,32 +527,46 @@ def test_read_with_deleted_crypto_factory(tempdir, data_table, basic_encryption_ Test that decryption properties can be used if the crypto factory is no longer alive """ path = tempdir / PARQUET_NAME - encryption_config = basic_encryption_config - kms_connection_config = pe.KmsConnectionConfig( - custom_kms_conf={ - FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"), - COL_KEY_NAME: COL_KEY.decode("UTF-8"), - } - ) - - def kms_factory(kms_connection_configuration): - return InMemoryKmsClient(kms_connection_configuration) - - encryption_crypto_factory = pe.CryptoFactory(kms_factory) - write_encrypted_parquet(path, data_table, encryption_config, - kms_connection_config, encryption_crypto_factory) + kms_connection_config, crypto_factory = write_encrypted_file( + path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY, + basic_encryption_config) verify_file_encrypted(path) - # Use a local function to get 
decryption properties, so the crypto factory that - # creates the properties will be deleted after it returns. - def get_decryption_properties(): - decryption_crypto_factory = pe.CryptoFactory(kms_factory) - decryption_config = pe.DecryptionConfiguration( - cache_lifetime=timedelta(minutes=5.0)) - return decryption_crypto_factory.file_decryption_properties( - kms_connection_config, decryption_config) + # Create decryption properties and delete the crypto factory that created + # the properties afterwards. + decryption_config = pe.DecryptionConfiguration( + cache_lifetime=timedelta(minutes=5.0)) + file_decryption_properties = crypto_factory.file_decryption_properties( + kms_connection_config, decryption_config) + del crypto_factory result = pq.ParquetFile( - path, decryption_properties=get_decryption_properties()) + path, decryption_properties=file_decryption_properties) result_table = result.read(use_threads=True) assert data_table.equals(result_table) + + +def test_encrypted_parquet_read_table(tempdir, data_table, basic_encryption_config): + """Write an encrypted parquet then read it back using read_table.""" + path = tempdir / PARQUET_NAME + + # Write the encrypted parquet file using the utility function + kms_connection_config, crypto_factory = write_encrypted_file( + path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY, + basic_encryption_config) + + decryption_config = pe.DecryptionConfiguration( + cache_lifetime=timedelta(minutes=5.0)) + file_decryption_properties = crypto_factory.file_decryption_properties( + kms_connection_config, decryption_config) + + # Read the encrypted parquet file using read_table + result_table = pq.read_table(path, decryption_properties=file_decryption_properties) + + # Assert that the read table matches the original data + assert data_table.equals(result_table) + + # Read the encrypted parquet folder using read_table + result_table = pq.read_table( + tempdir, decryption_properties=file_decryption_properties) + assert data_table.equals(result_table) diff --git a/python/pyarrow/tests/test_dataset_encryption.py b/python/pyarrow/tests/test_dataset_encryption.py index 2a631db9fc0fa..0d8b4a152ab9f 100644 --- a/python/pyarrow/tests/test_dataset_encryption.py +++ b/python/pyarrow/tests/test_dataset_encryption.py @@ -142,6 +142,18 @@ def test_dataset_encryption_decryption(): assert table.equals(dataset.to_table()) + # set decryption properties for parquet fragment scan options + decryption_properties = crypto_factory.file_decryption_properties( + kms_connection_config, decryption_config) + pq_scan_opts = ds.ParquetFragmentScanOptions( + decryption_properties=decryption_properties + ) + + pformat = pa.dataset.ParquetFileFormat(default_fragment_scan_options=pq_scan_opts) + dataset = ds.dataset("sample_dataset", format=pformat, filesystem=mockfs) + + assert table.equals(dataset.to_table()) + @pytest.mark.skipif( not encryption_unavailable, reason="Parquet Encryption is currently enabled" From 1c62df5255ced89171d5b846dc82f5a10d519f4a Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 9 May 2024 15:17:20 -0400 Subject: [PATCH 067/105] GH-41179: [Docs] Documentation for Dissociated IPC Protocol (#41180) ### Rationale for this change Adding documentation for the Experimental Dissociated IPC Protocol for splitting arrow IPC metadata and body data into separate streams for use with newer transports such as UCX and Libfabric. 
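For readers skimming the spec that follows: an encapsulated Arrow IPC message already consists of a Flatbuffers metadata header plus an optional body of packed buffers, and it is exactly these two pieces that the protocol carries on separate streams. The sketch below is not part of this patch; it only uses existing pyarrow APIs to make the metadata/body split concrete.

```python
# Illustrative only: show the metadata header and body of each IPC message
# in a stream, the two pieces the dissociated protocol sends separately.
import pyarrow as pa

batch = pa.RecordBatch.from_pydict({"x": [1, 2, 3]})

sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, batch.schema) as writer:
    writer.write_batch(batch)

reader = pa.ipc.MessageReader.open_stream(pa.BufferReader(sink.getvalue()))
schema_msg = reader.read_next_message()   # Flatbuffers header only; body is None
batch_msg = reader.read_next_message()    # Flatbuffers header plus packed body buffers
print(schema_msg.type, batch_msg.type, len(batch_msg.metadata), len(batch_msg.body))
```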
The relevant mailing list discussion and vote: https://lists.apache.org/thread/k26n1h90b1wy1w5k53whh0t8o4nd0yx7 ### What changes are included in this PR? Only documentation changes and images for the Arrow docs site. I tagged people I thought might be relevant for reviewing, but feel free to tag and add anyone else that might seem relevant to reviewing this. Thanks! * GitHub Issue: #41179 Lead-authored-by: Matt Topol Co-authored-by: Benjamin Kietzman Co-authored-by: Sutou Kouhei Co-authored-by: Antoine Pitrou Signed-off-by: Matt Topol --- docs/source/format/Columnar.rst | 2 + docs/source/format/DissociatedIPC.rst | 403 ++++++++++++++++++ .../format/DissociatedIPC/ClientFlowchart.mmd | 37 ++ .../DissociatedIPC/SequenceDiagramSame.mmd | 43 ++ .../SequenceDiagramSeparate.mmd | 44 ++ docs/source/format/Flight.rst | 2 + docs/source/format/index.rst | 1 + 7 files changed, 532 insertions(+) create mode 100644 docs/source/format/DissociatedIPC.rst create mode 100644 docs/source/format/DissociatedIPC/ClientFlowchart.mmd create mode 100644 docs/source/format/DissociatedIPC/SequenceDiagramSame.mmd create mode 100644 docs/source/format/DissociatedIPC/SequenceDiagramSeparate.mmd diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 0cfece2586294..ec6a7fa5e334a 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -1108,6 +1108,8 @@ includes a serialized Flatbuffer type along with an optional message body. We define this message format before describing how to serialize each constituent IPC message type. +.. _ipc-message-format: + Encapsulated message format --------------------------- diff --git a/docs/source/format/DissociatedIPC.rst b/docs/source/format/DissociatedIPC.rst new file mode 100644 index 0000000000000..0b0861399cb2f --- /dev/null +++ b/docs/source/format/DissociatedIPC.rst @@ -0,0 +1,403 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _dissociated-ipc: + +======================== +Dissociated IPC Protocol +======================== + +.. warning:: + + Experimental: The Dissociated IPC Protocol is experimental in its current + form. Based on feedback and usage the protocol definition may change until + it is fully standardized. + +Rationale +========= + +The :ref:`Arrow IPC format ` describes a protocol for transferring +Arrow data as a stream of record batches. This protocol expects a continuous +stream of bytes divided into discrete messages (using a length prefix and +continuation indicator). 
Each discrete message consists of two portions: + +* A `Flatbuffers`_ header message +* A series of bytes consisting of the flattened and packed body buffers (some + message types, like Schema messages, do not have this section) + - This is referred to as the *message body* in the IPC format spec. + +For most cases, the existing IPC format as it currently exists is sufficiently efficient: + +* Receiving data in the IPC format allows zero-copy utilization of the body + buffer bytes, no deserialization is required to form Arrow Arrays +* An IPC file format can be memory-mapped because it is location agnostic + and the bytes of the file are exactly what is expected in memory. + +However, there are use cases that aren't handled by this: + +* Constructing the IPC record batch message requires allocating a contiguous + chunk of bytes and copying all of the data buffers into it, packed together + back-to-back. This pessimizes the common case of wrapping existing, directly + consumable data into an IPC message. +* Even if Arrow data is located in a memory accessible across process boundaries + or transports (such as UCX), there is no standard way to specify that shared + location to consumers which could take advantage of it. +* Arrow data located on a non-CPU device (such as a GPU) cannot be sent using + Arrow IPC without having to copy the data back to the host device or copying + the Flatbuffers metadata bytes into device memory. + + * By the same token, receiving IPC messages into device memory would require + performing a copy of the Flatbuffers metadata back to the host CPU device. This + is due to the fact that the IPC stream interleaves data and metadata across a + single stream. + +This protocol attempts to solve these use cases in an efficient manner. + +Goals +----- + +* Define a generic protocol for passing Arrow IPC data, not tied to any particular + transport, that also allows for utilizing non-CPU device memory, shared memory, and + newer "high performance" transports such as `UCX`_ or `libfabric`_. + + * This allows for the data in the body to be kept on non-CPU devices (like GPUs) + without expensive device-to-host copies. + +* Allow for using :ref:`Flight RPC ` purely for control flow by separating + the stream of IPC metadata from IPC body bytes + +Definitions +----------- + +IPC Metadata + The Flatbuffers message bytes that encompass the header of an Arrow IPC message + +Tag + A little-endian ``uint64`` value used for flow control and used in determining + how to interpret the body of a message. Specific bits can be masked to allow + identifying messages by only a portion of the tag, leaving the rest of the bits + to be used for control flow or other message metadata. Some transports, such as + UCX, have built-in support for such tag values and will provide them in CPU + memory regardless of whether or not the body of the message may reside on a + non-CPU device. + +Sequence Number + A little-endian, 4-byte unsigned integer starting at 0 for a stream, indicating + the sequence order of messages. It is also used to identify specific messages to + tie the IPC metadata header to its corresponding body since the metadata and body + can be sent across separate pipes/streams/transports. + + If a sequence number reaches ``UINT32_MAX``, it should be allowed to roll over as + it is unlikely there would be enough unprocessed messages waiting to be processed + that would cause an overlap of sequence numbers. 
+ + The sequence number serves two purposes: To identify corresponding metadata and + tagged body data messages and to ensure we do not rely on messages having to arrive + in order. A client should use the sequence number to correctly order messages as + they arrive for processing. + +The Protocol +============ + +A reference example implementation utilizing `libcudf`_ and `UCX`_ can be found in the +`arrow-experiments repo `_. + +Requirements +------------ + +A transport implementing this protocol **MUST** provide two pieces of functionality: + +* Message sending + + * Delimited messages (like gRPC) as opposed to non-delimited streams (like plain TCP + without further framing). + + * Alternatively, a framing mechanism like the :ref:`encapsulated message format ` + for the IPC protocol can be used while leaving out the body bytes. + +* Tagged message sending + + * Sending a message that has an attached little-endian, unsigned 64-bit integral tag + for control flow. A tag like this allows control flow to operate on a message whose body + is on a non-CPU device without requiring the message itself to get copied off of the device. + +URI Specification +----------------- + +When providing a URI to a consumer to contact for use with this protocol (such as via +the :ref:`Location URI for Flight `), the URI should specify a scheme +like *ucx:* or *fabric:*, that is easily identifiable. In addition, the URI should +encode the following URI query parameters: + +.. note:: + As this protocol matures, this document will get updated with commonly recognized + transport schemes that get used with it. + +* ``want_data`` - **REQUIRED** - uint64 integer value + + * This value should be used to tag an initial message to the server to initiate a + data transfer. The body of the initiating message should be an opaque binary identifier + of the data stream being requested (like the ``Ticket`` in the Flight RPC protocol) + +* ``free_data`` - **OPTIONAL** - uint64 integer value + + * If the server might send messages using offsets / addresses for remote memory accessing + or shared memory locations, the URI should include this parameter. This value is used to + tag messages sent from the client to the data server, containing specific offsets / addresses + which were provided that are no longer required by the client (i.e. any operations that + directly reference those memory locations, such as copying the remote data into local memory, + have been completed). + +* ``remote_handle`` - **OPTIONAL** - base64-encoded string + + * When working with shared memory or remote memory, this value indicates any required + handle or identifier that is necessary for accessing the memory. + + * Using UCX, this would be an *rkey* value + + * With CUDA IPC, this would be the value of the base GPU pointer or memory handle, + and subsequent addresses would be offsets from this base pointer. + +Handling of Backpressure +------------------------ + +*Currently* this proposal does not specify any way to manage the backpressure of +messages to throttle for memory and bandwidth reasons. For now, this will be +**transport-defined** rather than lock into something sub-optimal. + +As usage among different transports and libraries grows, common patterns will emerge +that will allow for a generic, but efficient, way to handle backpressure across +different use cases. + +.. note:: + While the protocol itself is transport agnostic, the current usage and examples + only have been tested using UCX and libfabric transports so far, but that's all. 
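As a concrete illustration of the query parameters above, the following Python sketch (not part of the specification) extracts them from a hypothetical location URI; the scheme, host, and parameter values shown are invented for the example.

```python
# Illustrative only: pull the <want_data>/<free_data> tag values and the
# optional remote_handle out of a made-up URI of the shape described above.
import base64
from urllib.parse import urlparse, parse_qs

uri = "ucx://data-server.example.com:7522?want_data=11&free_data=12&remote_handle=a2V5LWJ5dGVz"
params = parse_qs(urlparse(uri).query)

want_data_tag = int(params["want_data"][0])       # REQUIRED: tags the initiating message
free_data_tag = int(params["free_data"][0])       # OPTIONAL: tags "memory can be freed" messages
remote_handle = (
    base64.b64decode(params["remote_handle"][0])  # OPTIONAL: e.g. a UCX rkey or IPC handle
    if "remote_handle" in params
    else None
)
print(want_data_tag, free_data_tag, remote_handle)
```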
+ + +Protocol Description +====================
+
+There are two possibilities that can occur:
+
+1. The streams of metadata and body data are sent across separate connections
+
+.. mermaid:: ./DissociatedIPC/SequenceDiagramSeparate.mmd
+
+
+2. The streams of metadata and body data are sent simultaneously across the same connection
+
+.. mermaid:: ./DissociatedIPC/SequenceDiagramSame.mmd
+
+Server Sequence +---------------
+
+There can be either a single server handling both the IPC Metadata stream and the Body data streams, or separate servers for handling the IPC Metadata and the body data. This allows for streaming of data across either a single transport pipe or two pipes if desired.
+
+Metadata Stream Sequence +''''''''''''''''''''''''
+
+The standing state of the server is waiting for a **tagged** message with a specific ``<want_data>`` tag value to initiate a transfer. This ``<want_data>`` value is defined by the server and propagated to any clients via the URI they are provided. This protocol does not prescribe any particular value so that it will not interfere with any other existing protocols that rely on tag values. The body of that message will contain an opaque, binary identifier to indicate a particular dataset / data stream to send.
+
+.. note::
+
+ For instance, the **ticket** that was passed with a *FlightInfo* message would be the body of this message. Because it is opaque, it can be anything the server wants to use. The URI and identifier do not need to be given to the client via Flight RPC, but could come across from any transport or protocol desired.
+
+Upon receiving a ``<want_data>`` request, the server *should* respond by sending a stream of messages consisting of the following:
+
+.. mermaid::
+
+ block-beta + columns 8
+
+ block:P["\n\n\n\nPrefix"]:5 + T["Message type\nByte 0"] + S["Sequence number\nBytes 1-4"] + end + H["Flatbuffer bytes\nRest of the message"]:3
+
+* A 5-byte prefix
+
+ - The first byte of the message indicates the type of message; currently there are only two allowed message types (more types may get added in the future):
+
+ 0) End of Stream + 1) Flatbuffers IPC Metadata Message
+
+ - the next 4 bytes are a little-endian, unsigned 32-bit integer indicating the sequence number of the message. The first message in the stream (**MUST** always be a schema message) **MUST** have a sequence number of ``0``. Each subsequent message **MUST** increment the number by ``1``.
+
+* The full Flatbuffers bytes of an Arrow IPC header
+
+As defined in the Arrow IPC format, each metadata message can represent a chunk of data or dictionaries for use by the stream of data.
+
+After sending the last metadata message, the server **MUST** indicate the end of the stream by sending a message consisting of **exactly** 5 bytes:
+
+* The first byte is ``0``, indicating an **End of Stream** message
+* The last 4 bytes are the sequence number (4-byte, unsigned integer in little-endian byte order)
+
+Data Stream Sequence +''''''''''''''''''''
+
+If a single server is handling both the data and metadata streams, then the data messages **should** begin being sent to the client in parallel with the metadata messages. Otherwise, as with the metadata sequence, the standing state of the server is to wait for a **tagged** message with the ``<want_data>`` tag value, whose body indicates the dataset / data stream to send to the client.
+
+For each IPC message in the stream of data, a **tagged** message **MUST** be sent on the data stream if that message has a body (i.e.
a Record Batch or Dictionary message). The +:term:`tag <Tag>` for each message should be structured as follows:
+
+.. mermaid::
+
+ block-beta + columns 8
+
+ S["Sequence number\nBytes 0-3"]:4 + U["Unused (Reserved)\nBytes 4-6"]:3 + T["Message type\nByte 7"]:1
+
+* The *least significant* 4 bytes (bits 0 - 31) of the tag should be the unsigned 32-bit, little-endian sequence number of the message.
+* The *most significant* byte (bits 56 - 63) of the tag indicates the message body **type** as an 8-bit unsigned integer. Currently only two message types are specified, but more can be added as needed to expand the protocol:
+
+ 0) The body contains the raw body buffer bytes as a packed buffer (i.e. the standard IPC format body bytes)
+ 1) The body contains a series of unsigned, little-endian 64-bit integer pairs to represent either shared or remote memory, schematically structured as
+
+ * The first two integers (i.e. the first 16 bytes) represent the *total* size (in bytes) of all buffers and the number of buffers in this message (and thus the number of following pairs of ``uint64``)
+
+ * Each subsequent pair of ``uint64`` values is an address / offset followed by the length of that particular buffer.
+
+* All unspecified bits (bits 32 - 55) of the tag are *reserved* for future use by potential updates to this protocol. For now they **MUST** be 0.
+
+.. note::
+
+ Any shared/remote memory addresses that are sent across **MUST** be kept alive by the server until a corresponding tagged ``<free_data>`` message is received. If the client disconnects before sending any ``<free_data>`` messages, it can be assumed to be safe to clean up the memory if desired by the server.
+
+After sending the last tagged IPC body message, the server should maintain the connection and wait for tagged ``<free_data>`` messages. The structure of these ``<free_data>`` messages is simple: one or more unsigned, little-endian 64-bit integers which indicate the addresses/offsets that can be freed.
+
+Once there are no more outstanding addresses to be freed, the work for this stream is complete.
+
+Client Sequence +---------------
+
+A client for this protocol needs to concurrently handle both the data and metadata streams of messages which may either both come from the same server or different servers. Below is a flowchart showing how a client might handle the metadata and data streams:
+
+.. mermaid:: ./DissociatedIPC/ClientFlowchart.mmd
+
+#. First the client sends a tagged message using the ``<want_data>`` value it was provided in the URI as the tag, and the opaque ID as the body.
+
+ * If the metadata and data servers are separate, then a ``<want_data>`` message needs to be sent separately to each.
+ * In either scenario, the metadata and data streams can be processed concurrently and/or asynchronously depending on the nature of the transports.
+
+#. For each **untagged** message the client receives in the metadata stream:
+
+ * The first byte of the message indicates whether it is an *End of Stream* message (value ``0``) or a metadata message (value ``1``).
+ * The next 4 bytes are the sequence number of the message, an unsigned 32-bit integer in little-endian byte order.
+ * If it is **not** an *End of Stream* message, the remaining bytes are the IPC Flatbuffer bytes which can be interpreted as normal.
+
+ * If the message has a body (i.e. Record Batch or Dictionary message) then the client should retrieve a tagged message from the Data Stream using the same sequence number.
+ + * If it **is** an *End of Stream* message, then it is safe to close the metadata connection if there are no gaps in the sequence numbers received.
+
+#. When a metadata message that requires a body is received, the tag mask of ``0x00000000FFFFFFFF`` **should** be used alongside the sequence number to match the message regardless of the higher bytes (e.g. we only care about matching the lower 4 bytes to the sequence number)
+
+ * Once received, the Most Significant Byte's value determines how the client processes the body data:
+
+ * If the most significant byte is 0: Then the body of the message is the raw IPC packed body buffers allowing it to easily be processed with the corresponding metadata header bytes.
+
+ * If the most significant byte is 1: The body of the message will consist of a series of pairs of unsigned, 64-bit integers in little-endian byte order.
+
+ * The first two integers represent *1)* the total size of all the body buffers together to allow for easy allocation if an intermediate buffer is needed and *2)* the number of buffers being sent (``nbuf``).
+
+ * The rest of the message will be ``nbuf`` pairs of integers, one for each buffer. Each pair is *1)* the address / offset of the buffer and *2)* the length of that buffer. Memory can then be retrieved via shared or remote memory routines based on the underlying transport. These addresses / offsets **MUST** be retained so they can be sent back in ``<free_data>`` messages later, indicating to the server that the client no longer needs the shared memory.
+
+#. Once an *End of Stream* message is received, the client should process any remaining un-processed IPC metadata messages.
+
+#. After individual memory addresses / offsets are able to be freed by the remote server (in the case where it has sent these rather than the full body bytes), the client should send corresponding ``<free_data>`` messages to the server.
+
+ * A single ``<free_data>`` message consists of an arbitrary number of unsigned 64-bit integer values, representing the addresses / offsets which can be freed. The reason for it being an *arbitrary number* is to allow a client to choose whether to send multiple messages to free multiple addresses or to coalesce multiple addresses into fewer messages to be freed (thus making the protocol less "chatty" if desired)
+
+Continuing Development +======================
+
+If you decide to try this protocol in your own environments and systems, we'd love feedback and to learn about your use case. As this is an **experimental** protocol currently, we need real-world usage in order to facilitate improving it and finding the right generalizations to standardize on across transports.
+
+Please chime in using the Arrow Developers Mailing list: https://arrow.apache.org/community/#mailing-lists
+
+.. _Flatbuffers: http://github.com/google/flatbuffers
+.. _UCX: https://openucx.org/
+.. _libfabric: https://ofiwg.github.io/libfabric/
+.. _libcudf: https://docs.rapids.ai/api
diff --git a/docs/source/format/DissociatedIPC/ClientFlowchart.mmd b/docs/source/format/DissociatedIPC/ClientFlowchart.mmd new file mode 100644 index 0000000000000..652cabc1c7425 --- /dev/null +++ b/docs/source/format/DissociatedIPC/ClientFlowchart.mmd @@ -0,0 +1,37 @@ +%% Licensed to the Apache Software Foundation (ASF) under one +%% or more contributor license agreements. See the NOTICE file +%% distributed with this work for additional information +%% regarding copyright ownership.
The ASF licenses this file +%% to you under the Apache License, Version 2.0 (the +%% "License"); you may not use this file except in compliance +%% with the License. You may obtain a copy of the License at + +%% http://www.apache.org/licenses/LICENSE-2.0 + +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. + +graph LR +client((Client))-->c1{{Send #60;want_data#gt; Msg}} +subgraph meta [Meta Message] + direction LR + m1[/Msg Type #40;byte 0#41;
Seq Num #40;bytes 1-5#41;/]-- type 1 -->m2[[Process IPC Header]] + m2-- IPC has body -->m3[Get Corresponding
Tagged Msg] + m2-- Schema Msg -->m4[/Store Schema/] + m1-- type 0 -->e[Indicate End of Stream] +end +subgraph data [Data Stream] + direction LR + d1[Request Msg
for Seq Num]-->d2{Most Significant
Byte} + d2-- 0 -->d3[Construct from
Metadata and Body] + d2-- 1 -->d4[Get shared/remote
buffers] + d4 -->d5[Construct from
Metadata and buffers] + d3 & d5 -->e2[Output Batch] +end + +client -- recv untagged msg --> meta +client -- get tagged msg --> data diff --git a/docs/source/format/DissociatedIPC/SequenceDiagramSame.mmd b/docs/source/format/DissociatedIPC/SequenceDiagramSame.mmd new file mode 100644 index 0000000000000..adf26bdc32767 --- /dev/null +++ b/docs/source/format/DissociatedIPC/SequenceDiagramSame.mmd @@ -0,0 +1,43 @@ +%% Licensed to the Apache Software Foundation (ASF) under one +%% or more contributor license agreements. See the NOTICE file +%% distributed with this work for additional information +%% regarding copyright ownership. The ASF licenses this file +%% to you under the Apache License, Version 2.0 (the +%% "License"); you may not use this file except in compliance +%% with the License. You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. + +sequenceDiagram + participant C as Client + participant S as Server + activate C + C-->>+S: TaggedMessage(server.want_data, bytes=ID_of_desired_data) + S-->>C: Message(bytes([1]) + le_bytes(sequence_number) + schema_metadata) + par + loop each chunk + S-->>C: Message(bytes([1]) + le_bytes(sequence_number) + batch_metadata) + end + S-->>C: Message(bytes([0]) + le_bytes(sequence_number)) + and + loop each chunk + alt + S-->>C: TaggedMessage((bytes[0] << 55) | le_bytes(sequence_number),
bytes=batch_data) + else + S-->>C: TaggedMessage((bytes[1] << 55) | le_bytes(sequence_number),
bytes=uint64_pairs) + end + end + end + + loop + C-->>S: TaggedMessage(server.free_data, bytes=uint64_list) + end + deactivate S + deactivate C diff --git a/docs/source/format/DissociatedIPC/SequenceDiagramSeparate.mmd b/docs/source/format/DissociatedIPC/SequenceDiagramSeparate.mmd new file mode 100644 index 0000000000000..11d2d9d6387eb --- /dev/null +++ b/docs/source/format/DissociatedIPC/SequenceDiagramSeparate.mmd @@ -0,0 +1,44 @@ +%% Licensed to the Apache Software Foundation (ASF) under one +%% or more contributor license agreements. See the NOTICE file +%% distributed with this work for additional information +%% regarding copyright ownership. The ASF licenses this file +%% to you under the Apache License, Version 2.0 (the +%% "License"); you may not use this file except in compliance +%% with the License. You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. + +sequenceDiagram + participant D as Data Stream + participant C as Client + participant M as Metadata Stream + + activate C + C-->>+M: TaggedMessage(server.want_data, bytes=ID_of_desired_data) + C-->>+D: TaggedMessage(server.want_data, bytes=ID_of_desired_data) + M-->>C: Message(bytes([1]) + le_bytes(sequence_number) + schema_metadata) + loop each batch + par + M-->>C: Message(bytes([1]) + le_bytes(sequence_number) + batch_metadata) + and + alt + D-->>C: TaggedMessage((bytes[0] << 55) | le_bytes(sequence_number),
bytes=batch_data) + else + D-->>C: TaggedMessage((bytes[1] << 55) | le_bytes(sequence_number),
bytes=uint64_pairs) + end + end + end + M-->>C: Message(bytes([0]) + le_bytes(sequence_number)) + deactivate M + loop + C-->>D: TaggedMessage(server.free_data, bytes=uint64_list) + end + deactivate D + deactivate C diff --git a/docs/source/format/Flight.rst b/docs/source/format/Flight.rst index c65a1f70bde7f..2c5487d857ea4 100644 --- a/docs/source/format/Flight.rst +++ b/docs/source/format/Flight.rst @@ -310,6 +310,8 @@ well, in which case any `authentication method supported by gRPC .. _Mutual TLS (mTLS): https://grpc.io/docs/guides/auth/#supported-auth-mechanisms +.. _flight-location-uris: + Location URIs ============= diff --git a/docs/source/format/index.rst b/docs/source/format/index.rst index 856830d863243..44ea3e8e7e608 100644 --- a/docs/source/format/index.rst +++ b/docs/source/format/index.rst @@ -30,6 +30,7 @@ Specifications CDataInterface CStreamInterface CDeviceDataInterface + DissociatedIPC Flight FlightSql ADBC From 5255adc5139d6094a7b3b04f273f3ef11d49ec38 Mon Sep 17 00:00:00 2001 From: ZhangHuiGui <106943008+ZhangHuiGui@users.noreply.github.com> Date: Thu, 9 May 2024 21:26:16 -0400 Subject: [PATCH 068/105] GH-41529: [C++][Compute] Remove redundant logic for ArrayData as ExecResults in ExecScalarCaseWhen (#41380) ### Rationale for this change Remove useless path in `ExecScalarCaseWhen` ### What changes are included in this PR? Refactor : remove processing logic for ArrayData as ExecResults in ExecScalarCaseWhen. ### Are these changes tested? Yes, by exists. ### Are there any user-facing changes? No * GitHub Issue: #41529 Authored-by: ZhangHuiGui <2689496754@qq.com> Signed-off-by: Felipe Oliveira Carvalho --- .../arrow/compute/kernels/scalar_if_else.cc | 50 +++++++------------ 1 file changed, 19 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index 13874d9d65e70..6368ef525ff9c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1483,39 +1483,27 @@ Status ExecScalarCaseWhen(KernelContext* ctx, const ExecSpan& batch, ExecResult* result = temp.get(); } - // TODO(wesm): clean this up to have less duplication - if (out->is_array_data()) { - ArrayData* output = out->array_data().get(); - if (is_dictionary_type::value) { - const ExecValue& dict_from = has_result ? result : batch[1]; - if (dict_from.is_scalar()) { - output->dictionary = checked_cast(*dict_from.scalar) - .value.dictionary->data(); - } else { - output->dictionary = dict_from.array.ToArrayData()->dictionary; - } - } - CopyValues(result, /*in_offset=*/0, batch.length, - output->GetMutableValues(0, 0), - output->GetMutableValues(1, 0), output->offset); - } else { - // ArraySpan - ArraySpan* output = out->array_span_mutable(); - if (is_dictionary_type::value) { - const ExecValue& dict_from = has_result ? result : batch[1]; - output->child_data.resize(1); - if (dict_from.is_scalar()) { - output->child_data[0].SetMembers( - *checked_cast(*dict_from.scalar) - .value.dictionary->data()); - } else { - output->child_data[0] = dict_from.array; - } + // Only input types of non-fixed length (which cannot be pre-allocated) + // will save the output data in ArrayData. And make sure the FixedLength + // types must be output in ArraySpan. + static_assert(is_fixed_width(Type::type_id)); + DCHECK(out->is_array_span()); + + ArraySpan* output = out->array_span_mutable(); + if (is_dictionary_type::value) { + const ExecValue& dict_from = has_result ? 
result : batch[1]; + output->child_data.resize(1); + if (dict_from.is_scalar()) { + output->child_data[0].SetMembers( + *checked_cast(*dict_from.scalar) + .value.dictionary->data()); + } else { + output->child_data[0] = dict_from.array; } - CopyValues(result, /*in_offset=*/0, batch.length, - output->GetValues(0, 0), output->GetValues(1, 0), - output->offset); } + CopyValues(result, /*in_offset=*/0, batch.length, + output->GetValues(0, 0), output->GetValues(1, 0), + output->offset); return Status::OK(); } From f8d3b10b4b89b47f6e7a594b95c82e2ff161f1a5 Mon Sep 17 00:00:00 2001 From: Tai Le Manh <49281946+tlm365@users.noreply.github.com> Date: Fri, 10 May 2024 12:42:25 +0700 Subject: [PATCH 069/105] GH-41590: [Java] Improve BaseRepeatedValueVector function on isEmpty and isNull operations (#41601) ### Rationale for this change Resolves #41590 . ### What changes are included in this PR? Make `abstract` on `isNull` and `isEmpty` of `BaseRepeatedValueVector`. ### Are these changes tested? Existing tests pass. ### Are there any user-facing changes? No. * GitHub Issue: #41590 Authored-by: Tai Le Manh Signed-off-by: David Li --- .../vector/complex/BaseRepeatedValueVector.java | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java index 7906d90c2fff0..7c4015299a6cd 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java @@ -355,16 +355,8 @@ public int getInnerValueCountAt(int index) { offsetBuffer.getInt(index * OFFSET_WIDTH); } - /** Return if value at index is null (this implementation is always false). */ - @Override - public boolean isNull(int index) { - return false; - } - - /** Return if value at index is empty (this implementation is always false). */ - public boolean isEmpty(int index) { - return false; - } + /** Return if value at index is empty. */ + public abstract boolean isEmpty(int index); /** Starts a new repeated value. */ public int startNewValue(int index) { From 7aea8bf7a65d679bd71d973b358f997eb3b6c6af Mon Sep 17 00:00:00 2001 From: Hyunseok Seo Date: Fri, 10 May 2024 23:58:10 +0900 Subject: [PATCH 070/105] GH-41316: [CI][Python] Reduce CI time on macOS (#41378) ### Rationale for this change Reduce CI time for python build on macos-12 and macos-14 using ccache ### What changes are included in this PR? Add ccache for macos-12 and macos-14 ### Are these changes tested? Check the cache hit rate ### Are there any user-facing changes? 
No * GitHub Issue: #41316 Authored-by: Hyunseok Seo Signed-off-by: Sutou Kouhei --- .github/workflows/python.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 15056961f8cf4..a568f8346e7fc 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -182,6 +182,19 @@ jobs: python -m pip install \ -r python/requirements-build.txt \ -r python/requirements-test.txt + - name: Setup ccache + shell: bash + run: ci/scripts/ccache_setup.sh + - name: ccache info + id: ccache-info + shell: bash + run: echo "cache-dir=$(ccache --get-config cache_dir)" >> $GITHUB_OUTPUT + - name: Cache ccache + uses: actions/cache@v4 + with: + path: ${{ steps.ccache-info.outputs.cache-dir }} + key: python-ccache-macos-${{ matrix.macos-version }}-${{ hashFiles('cpp/**', 'python/**') }} + restore-keys: python-ccache-macos-${{ matrix.macos-version }}- - name: Build shell: bash run: | From 899422e16e3f1f71819f52fc627359d79f7d3662 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 May 2024 16:42:21 -0300 Subject: [PATCH 071/105] GH-39301: [Archery][CI][Integration] Add nanoarrow to archery + integration setup (#39302) ### Rationale for this change The ability to add integration testing was added in nanoarrow however, the infrastructure for running these tests currently lives in the arrow monorepo. ### What changes are included in this PR? - Added the relevant code to Archery such that these tests can be run - Added the relevant scripts/environment variables to CI such that these tests run in the integration CI job ### Are these changes tested? Yes, via the "Integration" CI job. ### Are there any user-facing changes? No. This PR still needs https://github.com/apache/arrow/pull/41264 for the integration tests to pass. 
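As a rough sketch of how the new tester might be exercised locally (nothing below is added by this PR): the environment variables come from the archery and docker-compose changes in this patch, while the checkout/build location and the `--run-c-data` flag are assumptions about an existing local setup.

```python
# Hypothetical local invocation of the nanoarrow C Data integration tests.
import os
import subprocess

env = dict(os.environ)
env["ARCHERY_INTEGRATION_WITH_NANOARROW"] = "1"
# Directory containing the built libnanoarrow_c_data_integration library
# (path is an assumption about where arrow-nanoarrow was cloned and built).
env["ARROW_NANOARROW_PATH"] = os.path.expanduser("~/arrow-nanoarrow/build/cdata")

subprocess.run(
    ["archery", "integration", "--with-cpp=true", "--with-nanoarrow=true", "--run-c-data"],
    check=True,
    env=env,
)
```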
* Closes: #39301 * GitHub Issue: #39301 Lead-authored-by: Dewey Dunnington Co-authored-by: Dewey Dunnington Signed-off-by: Dewey Dunnington --- .github/workflows/integration.yml | 6 + ci/scripts/integration_arrow_build.sh | 2 + ci/scripts/nanoarrow_build.sh | 52 ++++++ dev/archery/archery/cli.py | 5 +- dev/archery/archery/integration/datagen.py | 3 + dev/archery/archery/integration/runner.py | 8 +- .../archery/integration/tester_nanoarrow.py | 148 ++++++++++++++++++ docker-compose.yml | 2 + 8 files changed, 223 insertions(+), 3 deletions(-) create mode 100755 ci/scripts/nanoarrow_build.sh create mode 100644 dev/archery/archery/integration/tester_nanoarrow.py diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 6e09ad61480a6..f53f4aeb505d2 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -75,6 +75,11 @@ jobs: with: repository: apache/arrow-rs path: rust + - name: Checkout Arrow nanoarrow + uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + with: + repository: apache/arrow-nanoarrow + path: nanoarrow - name: Free up disk space run: | ci/scripts/util_free_space.sh @@ -97,6 +102,7 @@ jobs: run: > archery docker run \ -e ARCHERY_DEFAULT_BRANCH=${{ github.event.repository.default_branch }} \ + -e ARCHERY_INTEGRATION_WITH_NANOARROW=1 \ -e ARCHERY_INTEGRATION_WITH_RUST=1 \ conda-integration - name: Docker Push diff --git a/ci/scripts/integration_arrow_build.sh b/ci/scripts/integration_arrow_build.sh index e5c31527aedff..9b54049a2b803 100755 --- a/ci/scripts/integration_arrow_build.sh +++ b/ci/scripts/integration_arrow_build.sh @@ -30,6 +30,8 @@ build_dir=${2} ${arrow_dir}/ci/scripts/rust_build.sh ${arrow_dir} ${build_dir} +${arrow_dir}/ci/scripts/nanoarrow_build.sh ${arrow_dir} ${build_dir} + if [ "${ARROW_INTEGRATION_CPP}" == "ON" ]; then ${arrow_dir}/ci/scripts/cpp_build.sh ${arrow_dir} ${build_dir} fi diff --git a/ci/scripts/nanoarrow_build.sh b/ci/scripts/nanoarrow_build.sh new file mode 100755 index 0000000000000..1612b9a2d0102 --- /dev/null +++ b/ci/scripts/nanoarrow_build.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +arrow_dir=${1} +source_dir=${1}/nanoarrow +build_dir=${2}/nanoarrow + +# This file is used to build the nanoarrow binaries needed for the archery +# integration tests. Testing of the nanoarrow implementation in normal CI is handled +# by github workflows in the arrow-nanoarrow repository. + +if [ "${ARCHERY_INTEGRATION_WITH_NANOARROW}" -eq "0" ]; then + echo "=====================================================================" + echo "Not building nanoarrow" + echo "=====================================================================" + exit 0; +elif [ ! 
-d "${source_dir}" ]; then + echo "=====================================================================" + echo "The nanoarrow source is missing. Please clone the arrow-nanoarrow repository" + echo "to arrow/nanoarrow before running the integration tests:" + echo " git clone https://github.com/apache/arrow-nanoarrow.git path/to/arrow/nanoarrow" + echo "=====================================================================" + exit 1; +fi + +set -x + +mkdir -p ${build_dir} +pushd ${build_dir} + +cmake ${source_dir} -DNANOARROW_BUILD_INTEGRATION_TESTS=ON +cmake --build . + +popd diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 8a26d9266f22d..cd746f9c4499a 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -738,6 +738,9 @@ def _set_default(opt, default): help='Include JavaScript in integration tests') @click.option('--with-go', type=bool, default=False, help='Include Go in integration tests') +@click.option('--with-nanoarrow', type=bool, default=False, + help='Include nanoarrow in integration tests', + envvar="ARCHERY_INTEGRATION_WITH_NANOARROW") @click.option('--with-rust', type=bool, default=False, help='Include Rust in integration tests', envvar="ARCHERY_INTEGRATION_WITH_RUST") @@ -776,7 +779,7 @@ def integration(with_all=False, random_seed=12345, **args): gen_path = args['write_generated_json'] - languages = ['cpp', 'csharp', 'java', 'js', 'go', 'rust'] + languages = ['cpp', 'csharp', 'java', 'js', 'go', 'nanoarrow', 'rust'] formats = ['ipc', 'flight', 'c_data'] enabled_languages = 0 diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 5cae907a4aa71..f6302165cd5a0 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1928,17 +1928,20 @@ def _temp_path(): .skip_tester('C#') .skip_tester('Java') .skip_tester('JS') + .skip_tester('nanoarrow') .skip_tester('Rust'), generate_binary_view_case() .skip_tester('Java') .skip_tester('JS') + .skip_tester('nanoarrow') .skip_tester('Rust'), generate_list_view_case() .skip_tester('C#') # Doesn't support large list views .skip_tester('Java') .skip_tester('JS') + .skip_tester('nanoarrow') .skip_tester('Rust'), generate_extension_case() diff --git a/dev/archery/archery/integration/runner.py b/dev/archery/archery/integration/runner.py index 5b66842b25926..0ea244720cc1d 100644 --- a/dev/archery/archery/integration/runner.py +++ b/dev/archery/archery/integration/runner.py @@ -36,6 +36,7 @@ from .tester_java import JavaTester from .tester_js import JSTester from .tester_csharp import CSharpTester +from .tester_nanoarrow import NanoarrowTester from .util import guid, printer from .util import SKIP_C_ARRAY, SKIP_C_SCHEMA, SKIP_FLIGHT, SKIP_IPC from ..utils.source import ARROW_ROOT_DEFAULT @@ -541,8 +542,8 @@ def get_static_json_files(): def run_all_tests(with_cpp=True, with_java=True, with_js=True, with_csharp=True, with_go=True, with_rust=False, - run_ipc=False, run_flight=False, run_c_data=False, - tempdir=None, **kwargs): + with_nanoarrow=False, run_ipc=False, run_flight=False, + run_c_data=False, tempdir=None, **kwargs): tempdir = tempdir or tempfile.mkdtemp(prefix='arrow-integration-') testers: List[Tester] = [] @@ -562,6 +563,9 @@ def run_all_tests(with_cpp=True, with_java=True, with_js=True, if with_go: testers.append(GoTester(**kwargs)) + if with_nanoarrow: + testers.append(NanoarrowTester(**kwargs)) + if with_rust: testers.append(RustTester(**kwargs)) diff --git 
a/dev/archery/archery/integration/tester_nanoarrow.py b/dev/archery/archery/integration/tester_nanoarrow.py new file mode 100644 index 0000000000000..30ff1bb6e50a7 --- /dev/null +++ b/dev/archery/archery/integration/tester_nanoarrow.py @@ -0,0 +1,148 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import functools +import os + +from . import cdata +from .tester import Tester, CDataExporter, CDataImporter +from ..utils.source import ARROW_ROOT_DEFAULT + + +_NANOARROW_PATH = os.environ.get( + "ARROW_NANOARROW_PATH", + os.path.join(ARROW_ROOT_DEFAULT, "nanoarrow/cdata"), +) + +_INTEGRATION_DLL = os.path.join( + _NANOARROW_PATH, "libnanoarrow_c_data_integration" + cdata.dll_suffix +) + + +class NanoarrowTester(Tester): + PRODUCER = False + CONSUMER = False + FLIGHT_SERVER = False + FLIGHT_CLIENT = False + C_DATA_SCHEMA_EXPORTER = True + C_DATA_ARRAY_EXPORTER = True + C_DATA_SCHEMA_IMPORTER = True + C_DATA_ARRAY_IMPORTER = True + + name = "nanoarrow" + + def validate(self, json_path, arrow_path, quirks=None): + raise NotImplementedError() + + def json_to_file(self, json_path, arrow_path): + raise NotImplementedError() + + def stream_to_file(self, stream_path, file_path): + raise NotImplementedError() + + def file_to_stream(self, file_path, stream_path): + raise NotImplementedError() + + def make_c_data_exporter(self): + return NanoarrowCDataExporter(self.debug, self.args) + + def make_c_data_importer(self): + return NanoarrowCDataImporter(self.debug, self.args) + + +_nanoarrow_c_data_entrypoints = """ + const char* nanoarrow_CDataIntegration_ExportSchemaFromJson( + const char* json_path, struct ArrowSchema* out); + + const char* nanoarrow_CDataIntegration_ImportSchemaAndCompareToJson( + const char* json_path, struct ArrowSchema* schema); + + const char* nanoarrow_CDataIntegration_ExportBatchFromJson( + const char* json_path, int num_batch, struct ArrowArray* out); + + const char* nanoarrow_CDataIntegration_ImportBatchAndCompareToJson( + const char* json_path, int num_batch, struct ArrowArray* batch); + + int64_t nanoarrow_BytesAllocated(void); + """ + + +@functools.lru_cache +def _load_ffi(ffi, lib_path=_INTEGRATION_DLL): + ffi.cdef(_nanoarrow_c_data_entrypoints) + dll = ffi.dlopen(lib_path) + return dll + + +class _CDataBase: + def __init__(self, debug, args): + self.debug = debug + self.args = args + self.ffi = cdata.ffi() + self.dll = _load_ffi(self.ffi) + + def _check_nanoarrow_error(self, na_error): + """ + Check a `const char*` error return from an integration entrypoint. + + A null means success, a non-empty string is an error message. + The string is statically allocated on the nanoarrow side and does not + need to be released. 
+ """ + assert self.ffi.typeof(na_error) is self.ffi.typeof("const char*") + if na_error != self.ffi.NULL: + error = self.ffi.string(na_error).decode("utf8", errors="replace") + raise RuntimeError(f"nanoarrow C Data Integration call failed: {error}") + + +class NanoarrowCDataExporter(CDataExporter, _CDataBase): + def export_schema_from_json(self, json_path, c_schema_ptr): + na_error = self.dll.nanoarrow_CDataIntegration_ExportSchemaFromJson( + str(json_path).encode(), c_schema_ptr + ) + self._check_nanoarrow_error(na_error) + + def export_batch_from_json(self, json_path, num_batch, c_array_ptr): + na_error = self.dll.nanoarrow_CDataIntegration_ExportBatchFromJson( + str(json_path).encode(), num_batch, c_array_ptr + ) + self._check_nanoarrow_error(na_error) + + @property + def supports_releasing_memory(self): + return True + + def record_allocation_state(self): + return self.dll.nanoarrow_BytesAllocated() + + +class NanoarrowCDataImporter(CDataImporter, _CDataBase): + def import_schema_and_compare_to_json(self, json_path, c_schema_ptr): + na_error = self.dll.nanoarrow_CDataIntegration_ImportSchemaAndCompareToJson( + str(json_path).encode(), c_schema_ptr + ) + self._check_nanoarrow_error(na_error) + + def import_batch_and_compare_to_json(self, json_path, num_batch, c_array_ptr): + na_error = self.dll.nanoarrow_CDataIntegration_ImportBatchAndCompareToJson( + str(json_path).encode(), num_batch, c_array_ptr + ) + self._check_nanoarrow_error(na_error) + + @property + def supports_releasing_memory(self): + return True diff --git a/docker-compose.yml b/docker-compose.yml index 9bedb59a77be8..7a4d455dfe723 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1751,9 +1751,11 @@ services: volumes: *conda-volumes environment: <<: [*common, *ccache] + ARCHERY_INTEGRATION_WITH_NANOARROW: 0 ARCHERY_INTEGRATION_WITH_RUST: 0 # Tell Archery where Arrow binaries are located ARROW_CPP_EXE_PATH: /build/cpp/debug + ARROW_NANOARROW_PATH: /build/nanoarrow ARROW_RUST_EXE_PATH: /build/rust/debug command: ["/arrow/ci/scripts/integration_arrow_build.sh /arrow /build && From 1e3772cac5f45edb6ada3d20140b77cc86208346 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sat, 11 May 2024 12:41:57 +0800 Subject: [PATCH 072/105] GH-41343: [C++][CMake] Remove unused ARROW_NO_DEPRECATED_API (#41345) ### Rationale for this change ARROW_NO_DEPRECATED_API is not used in the source code. ### What changes are included in this PR? Remove ARROW_NO_DEPRECATED_API cmake variable. ### Are these changes tested? Pass CIs. ### Are there any user-facing changes? Perhaps yes, users used to set ARROW_NO_DEPRECATED_API to build Arrow will see a warning for unknown CMake variable. 
* GitHub Issue: #41343 Authored-by: Gang Wu Signed-off-by: Gang Wu --- ci/docker/debian-12-cpp.dockerfile | 1 - ci/docker/linux-apt-r.dockerfile | 1 - ci/docker/ubuntu-20.04-cpp-minimal.dockerfile | 1 - ci/docker/ubuntu-20.04-cpp.dockerfile | 1 - ci/docker/ubuntu-22.04-cpp-minimal.dockerfile | 1 - ci/docker/ubuntu-22.04-cpp.dockerfile | 1 - ci/docker/ubuntu-24.04-cpp.dockerfile | 1 - ci/scripts/c_glib_build.sh | 3 --- ci/scripts/cpp_build.sh | 1 - cpp/CMakeLists.txt | 4 ---- cpp/cmake_modules/DefineOptions.cmake | 2 -- dev/conbench_envs/benchmarks.env | 1 - docs/source/developers/cpp/building.rst | 7 ++++--- 13 files changed, 4 insertions(+), 21 deletions(-) diff --git a/ci/docker/debian-12-cpp.dockerfile b/ci/docker/debian-12-cpp.dockerfile index 7036ddf27d52a..d7a6f9df2c2ee 100644 --- a/ci/docker/debian-12-cpp.dockerfile +++ b/ci/docker/debian-12-cpp.dockerfile @@ -119,7 +119,6 @@ ENV ARROW_ACERO=ON \ ARROW_GANDIVA=ON \ ARROW_GCS=ON \ ARROW_HOME=/usr/local \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/linux-apt-r.dockerfile b/ci/docker/linux-apt-r.dockerfile index a68354e3abf8d..630b96e1007b9 100644 --- a/ci/docker/linux-apt-r.dockerfile +++ b/ci/docker/linux-apt-r.dockerfile @@ -113,7 +113,6 @@ ENV \ ARROW_GANDIVA=OFF \ ARROW_HDFS=OFF \ ARROW_JSON=ON \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=OFF \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile index ae2ba9421cd55..e17c0306f115d 100644 --- a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile @@ -85,7 +85,6 @@ ENV ARROW_ACERO=ON \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index 124256378b287..d78c7a99cf4d6 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -158,7 +158,6 @@ ENV absl_SOURCE=BUNDLED \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile index dd887a6d00ceb..341d8a87e8661 100644 --- a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile @@ -85,7 +85,6 @@ ENV ARROW_ACERO=ON \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile index eb189841cd344..f12e7456add8e 100644 --- a/ci/docker/ubuntu-22.04-cpp.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp.dockerfile @@ -196,7 +196,6 @@ ENV absl_SOURCE=BUNDLED \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/ubuntu-24.04-cpp.dockerfile b/ci/docker/ubuntu-24.04-cpp.dockerfile index 4a37818f94396..ecfb5e2f5096d 100644 --- a/ci/docker/ubuntu-24.04-cpp.dockerfile +++ b/ci/docker/ubuntu-24.04-cpp.dockerfile @@ -178,7 +178,6 @@ ENV ARROW_ACERO=ON \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git 
a/ci/scripts/c_glib_build.sh b/ci/scripts/c_glib_build.sh index c4d2c4fdb5617..6a6295e4ff0bd 100755 --- a/ci/scripts/c_glib_build.sh +++ b/ci/scripts/c_glib_build.sh @@ -30,9 +30,6 @@ with_doc=$([ "${BUILD_DOCS_C_GLIB}" == "ON" ] && echo "true" || echo "false") export PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig -export CFLAGS="-DARROW_NO_DEPRECATED_API" -export CXXFLAGS="-DARROW_NO_DEPRECATED_API" - mkdir -p ${build_dir} # Build with Meson diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index ceeab2455bef6..a1f40fc360e2f 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -152,7 +152,6 @@ else -DARROW_JSON=${ARROW_JSON:-ON} \ -DARROW_LARGE_MEMORY_TESTS=${ARROW_LARGE_MEMORY_TESTS:-OFF} \ -DARROW_MIMALLOC=${ARROW_MIMALLOC:-OFF} \ - -DARROW_NO_DEPRECATED_API=${ARROW_NO_DEPRECATED_API:-OFF} \ -DARROW_ORC=${ARROW_ORC:-OFF} \ -DARROW_PARQUET=${ARROW_PARQUET:-OFF} \ -DARROW_RUNTIME_SIMD_LEVEL=${ARROW_RUNTIME_SIMD_LEVEL:-MAX} \ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index df83f56dd2f70..679842c31e0b1 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -431,10 +431,6 @@ endif() # Compiler flags # -if(ARROW_NO_DEPRECATED_API) - add_definitions(-DARROW_NO_DEPRECATED_API) -endif() - if(ARROW_EXTRA_ERROR_CONTEXT) add_definitions(-DARROW_EXTRA_ERROR_CONTEXT) endif() diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index dc0e5da63adb7..41466a1c22404 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -158,8 +158,6 @@ if(ARROW_DEFINE_OPTIONS) define_option_string(ARROW_GIT_DESCRIPTION "The Arrow git commit description (if any)" "") - define_option(ARROW_NO_DEPRECATED_API "Exclude deprecated APIs from build" OFF) - define_option(ARROW_POSITION_INDEPENDENT_CODE "Whether to create position-independent target" ON) diff --git a/dev/conbench_envs/benchmarks.env b/dev/conbench_envs/benchmarks.env index 2a5a9c32a86ec..3af29491a8345 100644 --- a/dev/conbench_envs/benchmarks.env +++ b/dev/conbench_envs/benchmarks.env @@ -31,7 +31,6 @@ ARROW_HOME=$CONDA_PREFIX ARROW_INSTALL_NAME_RPATH=ON ARROW_JEMALLOC=OFF ARROW_MIMALLOC=ON -ARROW_NO_DEPRECATED_API=ON ARROW_ORC=ON ARROW_PARQUET=ON ARROW_PYTHON=ON diff --git a/docs/source/developers/cpp/building.rst b/docs/source/developers/cpp/building.rst index 040a046c5153d..7b80d2138c33e 100644 --- a/docs/source/developers/cpp/building.rst +++ b/docs/source/developers/cpp/building.rst @@ -627,9 +627,10 @@ outputs like: Deprecations and API Changes ---------------------------- -We use the compiler definition ``ARROW_NO_DEPRECATED_API`` to disable APIs that -have been deprecated. It is a good practice to compile third party applications -with this flag to proactively catch and account for API changes. +We use the macro ``ARROW_DEPRECATED``, which wraps the C++ deprecated attribute, for +APIs that have been deprecated. It is a good practice to compile third party +applications with ``-Werror=deprecated-declarations`` (for GCC/Clang, or similar +flags for other compilers) to proactively catch and account for API changes. Modular Build Targets --------------------- From a0f9d2eb2fe6743a869f3509f456389cf3de4926 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 12 May 2024 20:09:55 +0900 Subject: [PATCH 073/105] GH-41617: [C++][CMake] Fix ARROW_USE_BOOST detect condition (#41622) ### Rationale for this change We also need Boost when ARROW_FLIGHT and ARROW_TESTING are true. ### What changes are included in this PR? Add the missing condition.
### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. * GitHub Issue: #41617 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index ddea1c399cbba..2102a7fdcdd37 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -1263,7 +1263,7 @@ endif() # - S3FS and Flight benchmarks need Boost at runtime. if(ARROW_BUILD_INTEGRATION OR ARROW_BUILD_TESTS - OR (ARROW_FLIGHT AND ARROW_BUILD_BENCHMARKS) + OR (ARROW_FLIGHT AND (ARROW_TESTING OR ARROW_BUILD_BENCHMARKS)) OR (ARROW_S3 AND ARROW_BUILD_BENCHMARKS)) set(ARROW_USE_BOOST TRUE) set(ARROW_BOOST_REQUIRE_LIBRARY TRUE) From 37bd413c8322e0d9527c14420468b99f8abd1715 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Sun, 12 May 2024 13:19:26 +0200 Subject: [PATCH 074/105] GH-40734: [Packaging][Debian] Drop support for Debian bullseye (#41394) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Debian GNU/Linux bullseye will reach EOL on 2024-07: https://wiki.debian.org/DebianReleases We can drop support for it after we release 16.0.0 because 17.0.0 will be released after 2024-07. ### What changes are included in this PR? Remove Debian bullseye support ### Are these changes tested? Yes on CI. ### Are there any user-facing changes? Yes, no support for Debian bullseye but no breaking changes on code. * GitHub Issue: #40734 Authored-by: Raúl Cumplido Signed-off-by: Sutou Kouhei --- c_glib/meson.build | 2 - dev/release/binary-task.rb | 3 - dev/release/verify-apt.sh | 6 -- dev/release/verify-release-candidate.sh | 4 +- .../apt/debian-bullseye/Dockerfile | 41 --------- .../apt/debian-bullseye-arm64/from | 18 ---- .../apt/debian-bullseye/Dockerfile | 87 ------------------- dev/tasks/linux-packages/package-task.rb | 2 - dev/tasks/tasks.yml | 3 +- r/tools/nixlibs.R | 4 +- 10 files changed, 3 insertions(+), 167 deletions(-) delete mode 100644 dev/tasks/linux-packages/apache-arrow-apt-source/apt/debian-bullseye/Dockerfile delete mode 100644 dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye-arm64/from delete mode 100644 dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye/Dockerfile diff --git a/c_glib/meson.build b/c_glib/meson.build index 16a5ea7ccb432..08a9cd182e02e 100644 --- a/c_glib/meson.build +++ b/c_glib/meson.build @@ -26,8 +26,6 @@ project('arrow-glib', 'c', 'cpp', # Debian: # https://packages.debian.org/search?keywords=meson # - # * bullseye: 0.56.2 - # * bullseye-backports:1.0.0 # * bookworm: 1.0.0 # # Ubuntu: diff --git a/dev/release/binary-task.rb b/dev/release/binary-task.rb index 8fcdcf1f5f442..c2386a1f52f21 100644 --- a/dev/release/binary-task.rb +++ b/dev/release/binary-task.rb @@ -1083,7 +1083,6 @@ def apt_release_repositories_dir def available_apt_targets [ - ["debian", "bullseye", "main"], ["debian", "bookworm", "main"], ["debian", "trixie", "main"], ["ubuntu", "focal", "main"], @@ -2111,8 +2110,6 @@ def apt_test_targets_default # Disable arm64 targets by default for now # because they require some setups on host. 
[ - "debian-bullseye", - # "debian-bullseye-arm64", "debian-bookworm", # "debian-bookworm-arm64", "debian-trixie", diff --git a/dev/release/verify-apt.sh b/dev/release/verify-apt.sh index cbb6d93823b21..8c54fe5c11cf1 100755 --- a/dev/release/verify-apt.sh +++ b/dev/release/verify-apt.sh @@ -80,12 +80,6 @@ esac workaround_missing_packages=() case "${distribution}-${code_name}" in - debian-bullseye) - sed \ - -i"" \ - -e "s/ main$/ main contrib non-free/g" \ - /etc/apt/sources.list - ;; debian-*) sed \ -i"" \ diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 95be4800f7ffd..3ed871bd5305b 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -185,9 +185,7 @@ test_binary() { test_apt() { show_header "Testing APT packages" - for target in "debian:bullseye" \ - "arm64v8/debian:bullseye" \ - "debian:bookworm" \ + for target in "debian:bookworm" \ "arm64v8/debian:bookworm" \ "debian:trixie" \ "arm64v8/debian:trixie" \ diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/debian-bullseye/Dockerfile b/dev/tasks/linux-packages/apache-arrow-apt-source/apt/debian-bullseye/Dockerfile deleted file mode 100644 index b0842a0c0d6ff..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/debian-bullseye/Dockerfile +++ /dev/null @@ -1,41 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -FROM debian:bullseye - -RUN \ - echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -RUN \ - echo 'APT::Install-Recommends "false";' > \ - /etc/apt/apt.conf.d/disable-install-recommends - -ARG DEBUG - -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ - apt update ${quiet} && \ - apt install -y -V ${quiet} \ - build-essential \ - debhelper \ - devscripts \ - fakeroot \ - gnupg \ - lsb-release && \ - apt clean && \ - rm -rf /var/lib/apt/lists/* diff --git a/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye-arm64/from b/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye-arm64/from deleted file mode 100644 index 34187b2af5a74..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye-arm64/from +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -arm64v8/debian:bullseye diff --git a/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye/Dockerfile b/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye/Dockerfile deleted file mode 100644 index 2edcd4d5ed216..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye/Dockerfile +++ /dev/null @@ -1,87 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -ARG FROM=debian:bullseye -FROM ${FROM} - -RUN \ - echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -RUN \ - echo "deb http://deb.debian.org/debian bullseye-backports main" > \ - /etc/apt/sources.list.d/backports.list - -RUN \ - echo 'APT::Install-Recommends "false";' > \ - /etc/apt/apt.conf.d/disable-install-recommends - -RUN sed -i'' -e 's/main$/main contrib non-free/g' /etc/apt/sources.list - -ARG DEBUG -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ - apt update ${quiet} && \ - apt install -y -V ${quiet} \ - build-essential \ - clang \ - cmake \ - debhelper \ - devscripts \ - git \ - libboost-filesystem-dev \ - libboost-system-dev \ - libbrotli-dev \ - libbz2-dev \ - libc-ares-dev \ - libcurl4-openssl-dev \ - libgirepository1.0-dev \ - libglib2.0-doc \ - libgmock-dev \ - libgoogle-glog-dev \ - libgrpc++-dev \ - libgtest-dev \ - liblz4-dev \ - libprotobuf-dev \ - libprotoc-dev \ - libre2-dev \ - libsnappy-dev \ - libssl-dev \ - libthrift-dev \ - libutf8proc-dev \ - libzstd-dev \ - llvm-dev \ - lsb-release \ - ninja-build \ - nlohmann-json3-dev \ - pkg-config \ - protobuf-compiler-grpc \ - python3-dev \ - python3-pip \ - rapidjson-dev \ - tzdata \ - valac \ - zlib1g-dev && \ - if apt list | grep '^nvidia-cuda-toolkit/'; then \ - apt install -y -V ${quiet} nvidia-cuda-toolkit; \ - fi && \ - apt install -y -V -t bullseye-backports ${quiet} \ - meson && \ - pip3 install gi-docgen && \ - ln -fs /usr/local/bin/gi-docgen /usr/bin && \ - apt clean && \ - rm -rf /var/lib/apt/lists/* diff --git a/dev/tasks/linux-packages/package-task.rb b/dev/tasks/linux-packages/package-task.rb index 3a9e5e48b4585..6bcc397277e3a 100644 --- a/dev/tasks/linux-packages/package-task.rb +++ b/dev/tasks/linux-packages/package-task.rb @@ -267,8 +267,6 @@ def apt_targets_default # Disable arm64 targets by default for now # because they require some setups on host. 
[ - "debian-bullseye", - # "debian-bullseye-arm64", "debian-bookworm", # "debian-bookworm-arm64", "debian-trixie", diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 146fa52fa958b..9d68e57c75dc8 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -451,8 +451,7 @@ tasks: {############################## Linux PKGS ####################################} -{% for target in ["debian-bullseye", - "debian-bookworm", +{% for target in ["debian-bookworm", "debian-trixie", "ubuntu-focal", "ubuntu-jammy", diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 0af41888b95b7..def4d35f825be 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -386,9 +386,7 @@ distro <- function() { out$id <- tolower(out$id) # debian unstable & testing lsb_release `version` don't include numbers but we can map from pretty name if (is.null(out$version) || out$version %in% c("testing", "unstable")) { - if (grepl("bullseye", out$codename)) { - out$short_version <- "11" - } else if (grepl("bookworm", out$codename)) { + if (grepl("bookworm", out$codename)) { out$short_version <- "12" } } else if (out$id == "ubuntu") { From 6d0321554374523ae0633d6bfe42cdeeb3b5d145 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Sun, 12 May 2024 11:00:26 -0400 Subject: [PATCH 075/105] GH-41450: [R][CI] rhub/container follow ons (#41451) More CI changes: * GitHub Issue: #41450 (specifically use the rhub containers approach for clang sanitizer, remove some of our work arounds) * Remove CentOS 7 CI support for R Authored-by: Jonathan Keane Signed-off-by: Jonathan Keane --- .env | 3 --- .github/workflows/r.yml | 3 +-- ci/docker/linux-r.dockerfile | 3 --- ci/scripts/java_jni_manylinux_build.sh | 3 --- ci/scripts/r_docker_configure.sh | 20 ---------------- ci/scripts/r_sanitize.sh | 2 ++ ci/scripts/r_test.sh | 3 --- dev/tasks/r/azure.linux.yml | 1 - dev/tasks/r/github.packages.yml | 7 +++--- dev/tasks/tasks.yml | 13 ++-------- docker-compose.yml | 16 +++++-------- r/tools/test-nixlibs.R | 4 ---- r/tools/ubsan.supp | 1 + r/vignettes/install.Rmd | 33 -------------------------- 14 files changed, 15 insertions(+), 97 deletions(-) diff --git a/.env b/.env index ab2e4b4fbe7fb..27474b2c73199 100644 --- a/.env +++ b/.env @@ -86,9 +86,6 @@ ARROW_R_DEV=TRUE R_PRUNE_DEPS=FALSE TZ=UTC -# Any non-empty string will install devtoolset-${DEVTOOLSET_VERSION} -DEVTOOLSET_VERSION= - # Used through docker-compose.yml and serves as the default version for the # ci/scripts/install_vcpkg.sh script. Prefer to use short SHAs to keep the # docker tags more readable. 
diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 8228aaad7ce37..aba77347659cd 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -192,12 +192,11 @@ jobs: fail-fast: false matrix: config: - - { org: "rhub", image: "ubuntu-gcc12", tag: "latest", devtoolset: "" } + - { org: "rhub", image: "ubuntu-gcc12", tag: "latest" } env: R_ORG: ${{ matrix.config.org }} R_IMAGE: ${{ matrix.config.image }} R_TAG: ${{ matrix.config.tag }} - DEVTOOLSET_VERSION: ${{ matrix.config.devtoolset }} steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 diff --git a/ci/docker/linux-r.dockerfile b/ci/docker/linux-r.dockerfile index d368a6629c587..7b7e989adc0d1 100644 --- a/ci/docker/linux-r.dockerfile +++ b/ci/docker/linux-r.dockerfile @@ -27,9 +27,6 @@ ENV R_BIN=${r_bin} ARG r_dev=FALSE ENV ARROW_R_DEV=${r_dev} -ARG devtoolset_version= -ENV DEVTOOLSET_VERSION=${devtoolset_version} - ARG r_prune_deps=FALSE ENV R_PRUNE_DEPS=${r_prune_deps} diff --git a/ci/scripts/java_jni_manylinux_build.sh b/ci/scripts/java_jni_manylinux_build.sh index da4987d307ce4..4921ce170b7a9 100755 --- a/ci/scripts/java_jni_manylinux_build.sh +++ b/ci/scripts/java_jni_manylinux_build.sh @@ -35,9 +35,6 @@ echo "=== Clear output directories and leftovers ===" rm -rf ${build_dir} echo "=== Building Arrow C++ libraries ===" -devtoolset_version=$(rpm -qa "devtoolset-*-gcc" --queryformat %{VERSION} | \ - grep -o "^[0-9]*") -devtoolset_include_cpp="/opt/rh/devtoolset-${devtoolset_version}/root/usr/include/c++/${devtoolset_version}" : ${ARROW_ACERO:=ON} export ARROW_ACERO : ${ARROW_BUILD_TESTS:=ON} diff --git a/ci/scripts/r_docker_configure.sh b/ci/scripts/r_docker_configure.sh index 52db2e6df6611..8a962fe576cbb 100755 --- a/ci/scripts/r_docker_configure.sh +++ b/ci/scripts/r_docker_configure.sh @@ -67,26 +67,6 @@ sloppiness = include_file_ctime hash_dir = false" >> ~/.ccache/ccache.conf fi -# Special hacking to try to reproduce quirks on centos using non-default build -# tooling. -if [[ -n "$DEVTOOLSET_VERSION" ]]; then - $PACKAGE_MANAGER install -y centos-release-scl - $PACKAGE_MANAGER install -y "devtoolset-$DEVTOOLSET_VERSION" - - # Enable devtoolset here so that `which gcc` finds the right compiler below - source /opt/rh/devtoolset-${DEVTOOLSET_VERSION}/enable - - # Build images which require the devtoolset don't have CXX17 variables - # set as the system compiler doesn't support C++17 - if [ ! 
"`{R_BIN} CMD config CXX17`" ]; then - mkdir -p ~/.R - echo "CC = $(which gcc) -fPIC" >> ~/.R/Makevars - echo "CXX17 = $(which g++) -fPIC" >> ~/.R/Makevars - echo "CXX17STD = -std=c++17" >> ~/.R/Makevars - echo "CXX17FLAGS = ${CXX11FLAGS}" >> ~/.R/Makevars - fi -fi - if [ -f "${ARROW_SOURCE_HOME}/ci/scripts/r_install_system_dependencies.sh" ]; then "${ARROW_SOURCE_HOME}/ci/scripts/r_install_system_dependencies.sh" fi diff --git a/ci/scripts/r_sanitize.sh b/ci/scripts/r_sanitize.sh index f7ed07f0c864b..fb3e9a5836387 100755 --- a/ci/scripts/r_sanitize.sh +++ b/ci/scripts/r_sanitize.sh @@ -46,6 +46,8 @@ unset ARROW_R_DEV export ARROW_R_VERBOSE_TEST=TRUE export UBSAN_OPTIONS="print_stacktrace=1,suppressions=/arrow/r/tools/ubsan.supp" +# From the old rhub image https://github.com/r-hub/rhub-linux-builders/blob/master/fedora-clang-devel-san/Dockerfile +export ASAN_OPTIONS="alloc_dealloc_mismatch=0:detect_leaks=0:detect_odr_violation=0" # run tests pushd tests diff --git a/ci/scripts/r_test.sh b/ci/scripts/r_test.sh index 95a49ee83a79b..e13da45e2d296 100755 --- a/ci/scripts/r_test.sh +++ b/ci/scripts/r_test.sh @@ -48,9 +48,6 @@ if [ "$ARROW_USE_PKG_CONFIG" != "false" ]; then fi export _R_CHECK_COMPILATION_FLAGS_KNOWN_="${_R_CHECK_COMPILATION_FLAGS_KNOWN_} ${ARROW_R_CXXFLAGS}" -# These should generally be picked up, but are slightly wrong in rhub's containers it appears -# https://github.com/r-hub/containers/pull/63 -export _R_CHECK_COMPILATION_FLAGS_KNOWN_="${_R_CHECK_COMPILATION_FLAGS_KNOWN_} -Wno-parentheses -Werror=format-security -Wp,-D_FORTIFY_SOURCE=3" if [ "$ARROW_R_DEV" = "TRUE" ]; then # These are sometimes used in the Arrow C++ build and are not a problem diff --git a/dev/tasks/r/azure.linux.yml b/dev/tasks/r/azure.linux.yml index e26a59629fa1a..28893a81728c3 100644 --- a/dev/tasks/r/azure.linux.yml +++ b/dev/tasks/r/azure.linux.yml @@ -38,7 +38,6 @@ jobs: export R_ORG={{ r_org }} export R_IMAGE={{ r_image }} export R_TAG={{ r_tag }} - export DEVTOOLSET_VERSION={{ devtoolset_version|default("") }} export R_CUSTOM_CCACHE={{ r_custom_ccache|default("false") }} docker-compose pull --ignore-pull-failures r docker-compose build r diff --git a/dev/tasks/r/github.packages.yml b/dev/tasks/r/github.packages.yml index 9ca7e59a957de..41d8b230f8bf4 100644 --- a/dev/tasks/r/github.packages.yml +++ b/dev/tasks/r/github.packages.yml @@ -299,14 +299,14 @@ jobs: # choosing a binary on this OS. If libarrow_binary is TRUE, we're on # an OS that is not in the allowlist, so we have to opt-in to use the # binary. Other env vars used in r_docker_configure.sh can be added - # here (like devtoolset) and wired up in the later steps. + # here and wired up in the later steps. 
- {image: "rhub/ubuntu-clang", libarrow_binary: "TRUE"} # fedora-clang-devel cannot use binaries bc of libc++ (uncomment to see the error) # - {image: "rhub/fedora-clang-devel", libarrow_binary: "TRUE"} - {image: "rhub/ubuntu-release"} # currently ubuntu-22.04 - {image: "rocker/r-ver:4.0.0"} # ubuntu-20.04 - - {image: "rstudio/r-base:4.1-focal"} # ubuntu-20.04 - - {image: "rstudio/r-base:4.2-centos7", devtoolset: "8"} + - {image: "rstudio/r-base:4.1-focal"} + - {image: "rstudio/r-base:4.2-jammy"} - {image: "rstudio/r-base:4.3-noble"} steps: # Get the arrow checkout just for the docker config scripts @@ -317,7 +317,6 @@ jobs: - name: Install system requirements env: ARROW_R_DEV: "TRUE" # To install curl/openssl in r_docker_configure.sh - DEVTOOLSET_VERSION: {{ '${{ matrix.config.devtoolset }}' }} shell: bash run: | # Make sure R is on the path for the R-hub devel versions (where RPREFIX is set in its dockerfile) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 9d68e57c75dc8..5bf5037652dd9 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1410,15 +1410,6 @@ tasks: GCC_VERSION: 12 image: ubuntu-r-only-r - test-r-rstudio-r-base-4.2-centos7-devtoolset-8: - ci: azure - template: r/azure.linux.yml - params: - r_org: rstudio - r_image: r-base - r_tag: 4.2-centos7 - devtoolset_version: 8 - test-r-minimal-build: ci: azure template: r/azure.linux.yml @@ -1436,13 +1427,13 @@ tasks: R_PRUNE_DEPS: TRUE image: ubuntu-r-sanitizer - test-fedora-r-clang-sanitizer: + test-r-clang-sanitizer: ci: github template: docker-tests/github.linux.yml params: env: R_PRUNE_DEPS: TRUE - image: fedora-r-clang-sanitizer + image: r-clang-sanitizer {% for go_version, staticcheck in [("1.21", "v0.4.7"), ("1.22", "latest")] %} test-debian-12-go-{{ go_version }}: diff --git a/docker-compose.yml b/docker-compose.yml index 7a4d455dfe723..a1d8f60a268d8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -162,7 +162,7 @@ x-hierarchy: - ubuntu-r-valgrind - ubuntu-swift - ubuntu-verify-rc - - fedora-r-clang-sanitizer + - r-clang-sanitizer - r - r-revdepcheck # helper services @@ -1472,7 +1472,6 @@ services: args: base: ${R_ORG}/${R_IMAGE}:${R_TAG} r_dev: ${ARROW_R_DEV} - devtoolset_version: ${DEVTOOLSET_VERSION} tz: ${TZ} r_prune_deps: ${R_PRUNE_DEPS} r_custom_ccache: ${R_CUSTOM_CCACHE} @@ -1484,7 +1483,6 @@ services: ARROW_R_DEV: ${ARROW_R_DEV} # To test for CRAN release, delete ^^ these two env vars so we download the Apache release ARROW_USE_PKG_CONFIG: "false" - devtoolset_version: ${DEVTOOLSET_VERSION} volumes: - .:/arrow:delegated command: > @@ -1517,19 +1515,17 @@ services: /bin/bash -c " /arrow/ci/scripts/r_sanitize.sh /arrow" - fedora-r-clang-sanitizer: - image: ${REPO}:r-rhub-fedora-clang-devel-latest + r-clang-sanitizer: + image: ${REPO}:r-rhub-clang-devel-latest build: context: . 
dockerfile: ci/docker/linux-r.dockerfile cache_from: - - ${REPO}:r-rhub-fedora-clang-devel-latest + - ${REPO}:r-rhub-clang-devel-latest args: - # TODO: change this to rhub/clang-asan - base: rhub/fedora-clang-devel-san + base: rhub/clang-asan r_dev: ${ARROW_R_DEV} - devtoolset_version: ${DEVTOOLSET_VERSION} - r_bin: RDsan + r_bin: R tz: ${TZ} r_prune_deps: ${R_PRUNE_DEPS} shm_size: *shm-size diff --git a/r/tools/test-nixlibs.R b/r/tools/test-nixlibs.R index 6996f234ced45..02e822c3420c8 100644 --- a/r/tools/test-nixlibs.R +++ b/r/tools/test-nixlibs.R @@ -158,10 +158,6 @@ test_that("check_allowlist", { }) test_that("find_latest_nightly()", { - skip_if( - getRversion() > "4.4.0", - "long last version components (>8) fail to max on r-devel" - ) tf <- tempfile() tf_uri <- paste0("file://", tf) on.exit(unlink(tf)) diff --git a/r/tools/ubsan.supp b/r/tools/ubsan.supp index ff88cf984136b..34854e79bcbf9 100644 --- a/r/tools/ubsan.supp +++ b/r/tools/ubsan.supp @@ -16,3 +16,4 @@ # under the License. vptr:include/c++/8/bits/shared_ptr_base.h +function:cleancall.c \ No newline at end of file diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index cc90c5ff08c60..c7b8251ccc99b 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -28,35 +28,6 @@ For `gcc`, this generally means version 7 or newer. Most contemporary Linux distributions have a new enough compiler; however, CentOS 7 is a notable exception, as it ships with gcc 4.8. -If you are on CentOS 7, to build arrow you will need to install a newer `devtoolset`, and you'll need to update R's Makevars to define the `CXX17` variables. This script installs `devtoolset-8` and configures R to be able to use C++17: - -``` -#!/usr/bin/env bash - -yum install -y centos-release-scl -yum install -y devtoolset-8 -# Optional: also install cloud storage dependencies, as described below -yum install -y libcurl-devel openssl-devel - -source /opt/rh/devtoolset-8/enable - -if [ ! `R CMD config CXX17` ]; then - mkdir -p ~/.R - echo "CC = $(which gcc) -fPIC" >> ~/.R/Makevars - echo "CXX17 = $(which g++) -fPIC" >> ~/.R/Makevars - echo "CXX17STD = -std=c++17" >> ~/.R/Makevars - echo "CXX17FLAGS = ${CXX11FLAGS}" >> ~/.R/Makevars -fi -``` - -Note that the C++17 compiler is only required at *build* time. You don't need -to enable the devtoolset every time you load the package. What's more, if you -install a binary package from RStudio Package Manager (see method 1a below), you -do not need to set up any of this. Likewise, if you `R CMD INSTALL --build` -arrow on a CentOS machine with the newer compilers, you can take the binary -package it produces and install it on any other CentOS machine without those -compilers. - ### Libraries Optional support for reading from cloud storage--AWS S3 and @@ -517,10 +488,6 @@ The install script should work everywhere, so if libarrow fails to compile, please [report an issue](https://issues.apache.org/jira/projects/ARROW/issues) so that we can improve the script. -### Known installation issues - -* On CentOS, building the package requires a more modern `devtoolset` than the default system compilers. See "System dependencies" above. 
- ## Contributing We are constantly working to make the installation process as painless as From 9a3973c9eec639de4750dcba334711a2a3c707a6 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 13 May 2024 06:45:45 +0900 Subject: [PATCH 076/105] GH-41626: [R][CI] Update OpenSUSE to 15.5 from 15.3 (#41627) ### Rationale for this change OpenSUSE 15.3 reached EOL and rstudio/r-builds dropped support for it: https://github.com/rstudio/r-builds/pull/177 ### What changes are included in this PR? Use `4.1-opensuse155` instead of `4.1-opensuse153`. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #41626 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- dev/tasks/tasks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 5bf5037652dd9..d8e09ec2070bb 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1358,7 +1358,7 @@ tasks: {% for r_org, r_image, r_tag in [("rhub", "ubuntu-release", "latest"), ("rocker", "r-ver", "latest"), ("rstudio", "r-base", "4.2-focal"), - ("rstudio", "r-base", "4.1-opensuse153")] %} + ("rstudio", "r-base", "4.1-opensuse155")] %} test-r-{{ r_org }}-{{ r_image }}-{{ r_tag }}: ci: azure template: r/azure.linux.yml From 0e9896dc54ca82876171521d206ad0906535572f Mon Sep 17 00:00:00 2001 From: Tai Le Manh <49281946+tlm365@users.noreply.github.com> Date: Mon, 13 May 2024 06:54:20 +0700 Subject: [PATCH 077/105] GH-40944: [Java] Implement TypeEqualsVisitor for StringView (#41606) ### Rationale for this change Resolves #40944 . ### What changes are included in this PR? ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #40944 Authored-by: Tai Le Manh Signed-off-by: David Li --- .../vector/compare/TypeEqualsVisitor.java | 2 +- .../vector/compare/TestTypeEqualsVisitor.java | 41 +++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java index 9bbe5c1b8997c..aaef161a563be 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java @@ -88,7 +88,7 @@ public Boolean visit(BaseLargeVariableWidthVector left, Void value) { @Override public Boolean visit(BaseVariableWidthViewVector left, Void value) { - throw new UnsupportedOperationException("View vectors are not supported."); + return compareField(left.getField(), right.getField()); } @Override diff --git a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java index 62fa0336ea925..736b0f1b1aeac 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java @@ -20,6 +20,7 @@ import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; @@ -30,6 +31,8 @@ import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.ViewVarBinaryVector; +import org.apache.arrow.vector.ViewVarCharVector; import 
org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.StructVector; @@ -176,4 +179,42 @@ public void testDenseUnionTypeEquals() { assertFalse(typeVisitor.equals(vector1)); } } + + @Test + public void testStringViewTypeEquals() { + try (final ViewVarCharVector varchar1 = new ViewVarCharVector("varchar1", allocator); + final ViewVarCharVector varchar2 = new ViewVarCharVector("varchar2", allocator); + final ViewVarBinaryVector binary = new ViewVarBinaryVector("binary", allocator)) { + final int valueCount = 2; + final byte[] str0 = "apache".getBytes(StandardCharsets.UTF_8); + final byte[] str1 = "arrow".getBytes(StandardCharsets.UTF_8); + + // add elements for varchar1 + varchar1.allocateNew(48, valueCount); + varchar1.set(0, str0); + varchar1.set(1, str1); + varchar1.setValueCount(valueCount); + + // add elements for varchar2 in a difference order + varchar2.allocateNew(48, valueCount); + varchar2.set(0, str1); + varchar2.set(1, str0); + varchar2.setValueCount(valueCount); + + // add elements for binary + binary.allocateNew(48, valueCount); + binary.set(0, str0); + binary.set(1, str1); + binary.setValueCount(valueCount); + + // compare ignore check name + TypeEqualsVisitor visitor = new TypeEqualsVisitor(varchar1, /* check name */ false, /* check meta data */ true); + assertTrue(visitor.equals(varchar2)); + assertFalse(visitor.equals(binary)); + + // if we check names, the types should be different + visitor = new TypeEqualsVisitor(varchar1, /* check name */ true, /* check meta data */ true); + assertFalse(visitor.equals(varchar2)); + } + } } From 2552c26c9625f8f0c538a520d26c11d13ce3b48d Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Mon, 13 May 2024 10:37:36 +0800 Subject: [PATCH 078/105] MINOR: [C++][ORC][CMake] Fix ORC CMake args to use LZ4_STATIC_LIB (#41632) ### Rationale for this change In the ORC_CMAKE_ARGS, we use `LZ4_STATIC_LIBRARY` but the Apache ORC library expects [`LZ4_STATIC_LIB`](https://github.com/apache/orc/blob/08aaebc371927e6bb9a0f19c7cc90478200e3b6f/cmake_modules/ThirdpartyToolchain.cmake#L313) ### What changes are included in this PR? Switch `LZ4_STATIC_LIBRARY` to `LZ4_STATIC_LIB` in the ORC_CMAKE_ARGS. ### Are these changes tested? Pass CIs. ### Are there any user-facing changes? No. Authored-by: Gang Wu Signed-off-by: Sutou Kouhei --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 2102a7fdcdd37..c24442dcb8749 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4522,7 +4522,7 @@ macro(build_orc) "-DSNAPPY_HOME=${ORC_SNAPPY_ROOT}" "-DSNAPPY_LIBRARY=$" "-DLZ4_LIBRARY=$" - "-DLZ4_STATIC_LIBRARY=$" + "-DLZ4_STATIC_LIB=$" "-DLZ4_INCLUDE_DIR=${ORC_LZ4_ROOT}/include" "-DSNAPPY_INCLUDE_DIR=${ORC_SNAPPY_INCLUDE_DIR}" "-DZSTD_HOME=${ORC_ZSTD_ROOT}" From a0c0ffdeac94fc3e1bfcaf0c4b0260f0b4a7717b Mon Sep 17 00:00:00 2001 From: Hyunseok Seo Date: Mon, 13 May 2024 18:00:15 +0900 Subject: [PATCH 079/105] GH-41587: [Docs][Python] Remove duplicate contents (#41588) ### Rationale for this change Remove duplicate contents ### What changes are included in this PR? - Remove duplicate contents - Remove `,` for consistency ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * GitHub Issue: #41587 Authored-by: Hyunseok Seo Signed-off-by: AlenkaF --- docs/source/python/api/arrays.rst | 4 ++-- docs/source/python/api/compute.rst | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index e6f6c3dbbd3d1..aefed00b3d2e0 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -63,8 +63,8 @@ may expose data type-specific methods or properties. FixedSizeBinaryArray LargeBinaryArray LargeStringArray - BinaryViewArray, - StringViewArray, + BinaryViewArray + StringViewArray Time32Array Time64Array Date32Array diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index ae48578a1bd61..f2ac6bd1e1226 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -540,7 +540,6 @@ Compute Options AssumeTimezoneOptions CastOptions CountOptions - CountOptions CumulativeSumOptions DayOfWeekOptions DictionaryEncodeOptions @@ -566,7 +565,6 @@ Compute Options RoundToMultipleOptions RunEndEncodeOptions ScalarAggregateOptions - ScalarAggregateOptions SelectKOptions SetLookupOptions SliceOptions @@ -578,7 +576,6 @@ Compute Options StructFieldOptions TakeOptions TDigestOptions - TDigestOptions TrimOptions VarianceOptions WeekOptions From 875e4df48e29ee76f06a4b90e8a6a0cc8f93743b Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 13 May 2024 21:52:30 +0800 Subject: [PATCH 080/105] GH-41361: [C++][Parquet] Optimize DelimitRecords by batch execution when max_rep_level > 1 (#41362) ### Rationale for this change We use Parquet to store nested types. When doing benchmarks, the nested types spend half of their time in `DelimitRecords`. The flamegraph can be seen in the issue. It can be reproduced by running `parquet-column-reader-benchmark`. This patch optimizes DelimitRecords by batch execution. The previous code is slow because of branching. This patch changes the branching code to batch execution. This could be a bit slower in some scenarios, but generally it makes DelimitRecords faster. ### What changes are included in this PR? Change the logic of DelimitRecords to batch execution. ### Are these changes tested? Covered by existing tests. ### Are there any user-facing changes?
Yes * GitHub Issue: #41361 Lead-authored-by: mwish Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/parquet/column_reader.cc | 71 ++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index eae7ac4252735..a4794c564733a 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1675,44 +1675,55 @@ class TypedRecordReader : public TypedColumnReaderImpl, // // \return Number of records delimited int64_t DelimitRecords(int64_t num_records, int64_t* values_seen) { - int64_t values_to_read = 0; + if (ARROW_PREDICT_FALSE(num_records == 0 || levels_position_ == levels_written_)) { + *values_seen = 0; + return 0; + } int64_t records_read = 0; - - const int16_t* def_levels = this->def_levels() + levels_position_; - const int16_t* rep_levels = this->rep_levels() + levels_position_; - + const int16_t* const rep_levels = this->rep_levels(); + const int16_t* const def_levels = this->def_levels(); ARROW_DCHECK_GT(this->max_rep_level_, 0); - - // Count logical records and number of values to read - while (levels_position_ < levels_written_) { - const int16_t rep_level = *rep_levels++; - if (rep_level == 0) { - // If at_record_start_ is true, we are seeing the start of a record - // for the second time, such as after repeated calls to - // DelimitRecords. In this case we must continue until we find - // another record start or exhausting the ColumnChunk - if (!at_record_start_) { - // We've reached the end of a record; increment the record count. - ++records_read; - if (records_read == num_records) { - // We've found the number of records we were looking for. Set - // at_record_start_ to true and break - at_record_start_ = true; - break; - } - } - } + // If at_record_start_ is true, we are seeing the start of a record + // for the second time, such as after repeated calls to + // DelimitRecords. In this case we must continue until we find + // another record start or exhausting the ColumnChunk + int64_t level = levels_position_; + if (at_record_start_) { + ARROW_DCHECK_EQ(0, rep_levels[levels_position_]); + ++levels_position_; // We have decided to consume the level at this position; therefore we // must advance until we find another record boundary at_record_start_ = false; + } - const int16_t def_level = *def_levels++; - if (def_level == this->max_def_level_) { - ++values_to_read; + // Count logical records and number of non-null values to read + ARROW_DCHECK(!at_record_start_); + // Scan repetition levels to find record end + while (levels_position_ < levels_written_) { + // We use an estimated batch size to simplify branching and + // improve performance in the common case. This might slow + // things down a bit if a single long record remains, though. + int64_t stride = + std::min(levels_written_ - levels_position_, num_records - records_read); + const int64_t position_end = levels_position_ + stride; + for (int64_t i = levels_position_; i < position_end; ++i) { + records_read += rep_levels[i] == 0; + } + levels_position_ = position_end; + if (records_read == num_records) { + // Check last rep_level reaches the boundary and + // pop the last level. + ARROW_CHECK_EQ(rep_levels[levels_position_ - 1], 0); + --levels_position_; + // We've found the number of records we were looking for. 
Set + // at_record_start_ to true and break + at_record_start_ = true; + break; } - ++levels_position_; } - *values_seen = values_to_read; + // Scan definition levels to find number of physical values + *values_seen = std::count(def_levels + level, def_levels + levels_position_, + this->max_def_level_); return records_read; } From a715ea06b71ec206a987d7921264778e9954404b Mon Sep 17 00:00:00 2001 From: Gavin Murrison <2135106+voidstar69@users.noreply.github.com> Date: Mon, 13 May 2024 16:38:14 +0100 Subject: [PATCH 081/105] GH-38692: [C#] Implement ICollection on scalar arrays (#41539) ### What changes are included in this PR? This PR makes the following array types support ICollection : - PrimitiveArray - BooleanArray - Date32Array - Date64Array - Time32Array - Time64Array - BinaryArray - TimestampArray - StringArray ### Are these changes tested? Yes ### Are there any user-facing changes? No Closes #38692 * GitHub Issue: #38692 Authored-by: voidstar69 Signed-off-by: Curt Hagenlocher --- csharp/src/Apache.Arrow/Arrays/BinaryArray.cs | 27 +++- .../src/Apache.Arrow/Arrays/BooleanArray.cs | 29 +++- csharp/src/Apache.Arrow/Arrays/Date32Array.cs | 63 +++++++- csharp/src/Apache.Arrow/Arrays/Date64Array.cs | 63 +++++++- .../src/Apache.Arrow/Arrays/IntervalArray.cs | 2 +- .../src/Apache.Arrow/Arrays/PrimitiveArray.cs | 37 ++++- .../Arrays/PrimitiveArrayBuilder.cs | 2 +- csharp/src/Apache.Arrow/Arrays/StringArray.cs | 27 +++- csharp/src/Apache.Arrow/Arrays/Time32Array.cs | 27 +++- csharp/src/Apache.Arrow/Arrays/Time64Array.cs | 27 +++- .../src/Apache.Arrow/Arrays/TimestampArray.cs | 27 +++- .../src/Apache.Arrow/Ipc/ArrowStreamWriter.cs | 2 +- .../Apache.Arrow.IntegrationTest/JsonFile.cs | 6 +- .../Apache.Arrow.Tests/ArrowArrayTests.cs | 145 +++++++++++++++++- .../Apache.Arrow.Tests/Date32ArrayTests.cs | 2 +- .../Extensions/DateTimeOffsetExtensions.cs | 2 - .../Apache.Arrow.Tests/UnionArrayTests.cs | 2 +- 17 files changed, 450 insertions(+), 40 deletions(-) diff --git a/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs b/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs index 1bd4035d5b9da..0c84fa2be23d9 100644 --- a/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs @@ -22,7 +22,7 @@ namespace Apache.Arrow { - public class BinaryArray : Array, IReadOnlyList + public class BinaryArray : Array, IReadOnlyList, ICollection { public class Builder : BuilderBase { @@ -380,5 +380,30 @@ IEnumerator IEnumerable.GetEnumerator() } IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(byte[]? item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(byte[]? 
item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(byte[] item) + { + for (int index = 0; index < Length; index++) + { + if (GetBytes(index).SequenceEqual(item)) + return true; + } + + return false; + } + + void ICollection.CopyTo(byte[][] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetBytes(srcIndex).ToArray(); + } + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/BooleanArray.cs b/csharp/src/Apache.Arrow/Arrays/BooleanArray.cs index e9c5f8979e48f..19d4d0b7ed564 100644 --- a/csharp/src/Apache.Arrow/Arrays/BooleanArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/BooleanArray.cs @@ -21,7 +21,7 @@ namespace Apache.Arrow { - public class BooleanArray: Array, IReadOnlyList + public class BooleanArray: Array, IReadOnlyList, ICollection { public class Builder : IArrowArrayBuilder { @@ -188,7 +188,7 @@ public bool GetBoolean(int index) public bool? GetValue(int index) { return IsNull(index) - ? (bool?)null + ? null : BitUtility.GetBit(ValueBuffer.Span, index + Offset); } @@ -205,5 +205,30 @@ public bool GetBoolean(int index) } IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(bool? item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(bool? item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(bool? item) + { + for (int index = 0; index < Length; index++) + { + if (GetValue(index).Equals(item)) + return true; + } + + return false; + } + + void ICollection.CopyTo(bool?[] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetValue(srcIndex); + } + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/Date32Array.cs b/csharp/src/Apache.Arrow/Arrays/Date32Array.cs index 6ab4986f573e2..55864e89e2eb3 100644 --- a/csharp/src/Apache.Arrow/Arrays/Date32Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Date32Array.cs @@ -23,9 +23,9 @@ namespace Apache.Arrow /// The class holds an array of dates in the Date32 format, where each date is /// stored as the number of days since the dawn of (UNIX) time. 
/// - public class Date32Array : PrimitiveArray, IReadOnlyList + public class Date32Array : PrimitiveArray, IReadOnlyList, ICollection #if NET6_0_OR_GREATER - , IReadOnlyList + , IReadOnlyList, ICollection #endif { private static readonly DateTime _epochDate = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Unspecified); @@ -40,10 +40,9 @@ public class Builder : DateArrayBuilder { private class DateBuilder : PrimitiveArrayBuilder { - protected override Date32Array Build( - ArrowBuffer valueBuffer, ArrowBuffer nullBitmapBuffer, - int length, int nullCount, int offset) => - new Date32Array(valueBuffer, nullBitmapBuffer, length, nullCount, offset); + protected override Date32Array Build(ArrowBuffer valueBuffer, ArrowBuffer nullBitmapBuffer, int length, + int nullCount, int offset) => + new(valueBuffer, nullBitmapBuffer, length, nullCount, offset); } /// @@ -149,6 +148,31 @@ public Date32Array(ArrayData data) yield return GetDateOnly(index); }; } + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(DateOnly? item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(DateOnly? item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(DateOnly? item) + { + for (int index = 0; index < Length; index++) + { + if (GetDateOnly(index).Equals(item)) + return true; + } + + return false; + } + + void ICollection.CopyTo(DateOnly?[] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetDateOnly(srcIndex); + } + } #endif int IReadOnlyCollection.Count => Length; @@ -160,7 +184,32 @@ public Date32Array(ArrayData data) for (int index = 0; index < Length; index++) { yield return GetDateTime(index); - }; + } + } + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(DateTime? item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(DateTime? item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(DateTime? item) + { + for (int index = 0; index < Length; index++) + { + if (GetDateTime(index).Equals(item)) + return true; + } + + return false; + } + + void ICollection.CopyTo(DateTime?[] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetDateTime(srcIndex); + } } } } diff --git a/csharp/src/Apache.Arrow/Arrays/Date64Array.cs b/csharp/src/Apache.Arrow/Arrays/Date64Array.cs index 43e698e10b25c..77538ce59ffae 100644 --- a/csharp/src/Apache.Arrow/Arrays/Date64Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Date64Array.cs @@ -24,9 +24,9 @@ namespace Apache.Arrow /// stored as the number of milliseconds since the dawn of (UNIX) time, excluding leap seconds, in multiples of /// 86400000. 
/// - public class Date64Array : PrimitiveArray, IReadOnlyList + public class Date64Array : PrimitiveArray, IReadOnlyList, ICollection #if NET6_0_OR_GREATER - , IReadOnlyList + , IReadOnlyList, ICollection #endif { private const long MillisecondsPerDay = 86400000; @@ -45,10 +45,9 @@ public class Builder : DateArrayBuilder { private class DateBuilder : PrimitiveArrayBuilder { - protected override Date64Array Build( - ArrowBuffer valueBuffer, ArrowBuffer nullBitmapBuffer, - int length, int nullCount, int offset) => - new Date64Array(valueBuffer, nullBitmapBuffer, length, nullCount, offset); + protected override Date64Array Build(ArrowBuffer valueBuffer, ArrowBuffer nullBitmapBuffer, int length, + int nullCount, int offset) => + new(valueBuffer, nullBitmapBuffer, length, nullCount, offset); } /// @@ -151,6 +150,31 @@ public Date64Array(ArrayData data) yield return GetDateOnly(index); }; } + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(DateOnly? item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(DateOnly? item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(DateOnly? item) + { + for (int index = 0; index < Length; index++) + { + if (GetDateOnly(index).Equals(item)) + return true; + } + + return false; + } + + void ICollection.CopyTo(DateOnly?[] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetDateOnly(srcIndex); + } + } #endif int IReadOnlyCollection.Count => Length; @@ -162,7 +186,32 @@ public Date64Array(ArrayData data) for (int index = 0; index < Length; index++) { yield return GetDateTime(index); - }; + } + } + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(DateTime? item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(DateTime? item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(DateTime? 
item) + { + for (int index = 0; index < Length; index++) + { + if (GetDateTime(index).Equals(item)) + return true; + } + + return false; + } + + void ICollection.CopyTo(DateTime?[] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetDateTime(srcIndex); + } } } } diff --git a/csharp/src/Apache.Arrow/Arrays/IntervalArray.cs b/csharp/src/Apache.Arrow/Arrays/IntervalArray.cs index de4fc42b4cf92..3949af877b0c5 100644 --- a/csharp/src/Apache.Arrow/Arrays/IntervalArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/IntervalArray.cs @@ -31,7 +31,7 @@ internal static class IntervalArray } public abstract class IntervalArray : PrimitiveArray - where T : struct + where T : struct, IEquatable { protected IntervalArray(ArrayData data) : base(data) diff --git a/csharp/src/Apache.Arrow/Arrays/PrimitiveArray.cs b/csharp/src/Apache.Arrow/Arrays/PrimitiveArray.cs index 0456c5cc65ba4..05d659b5270ad 100644 --- a/csharp/src/Apache.Arrow/Arrays/PrimitiveArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/PrimitiveArray.cs @@ -20,8 +20,8 @@ namespace Apache.Arrow { - public abstract class PrimitiveArray : Array, IReadOnlyList - where T : struct + public abstract class PrimitiveArray : Array, IReadOnlyList, ICollection + where T : struct, IEquatable { protected PrimitiveArray(ArrayData data) : base(data) @@ -40,7 +40,7 @@ protected PrimitiveArray(ArrayData data) { throw new ArgumentOutOfRangeException(nameof(index)); } - return IsValid(index) ? Values[index] : (T?)null; + return IsValid(index) ? Values[index] : null; } public IList ToList(bool includeNulls = false) @@ -86,5 +86,36 @@ IEnumerator IEnumerable.GetEnumerator() yield return IsValid(index) ? Values[index] : null; } } + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(T? item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(T? item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(T? 
item) + { + if (item == null) + { + return NullCount > 0; + } + + ReadOnlySpan values = Values; + while (values.Length > 0) + { + int index = Values.IndexOf(item.Value); + if (index < 0 || IsValid(index)) { return index >= 0; } + values = values.Slice(index + 1); + } + return false; + } + + void ICollection.CopyTo(T?[] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetValue(srcIndex); + } + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/PrimitiveArrayBuilder.cs b/csharp/src/Apache.Arrow/Arrays/PrimitiveArrayBuilder.cs index 67fe46633c18f..ae02173fb0df4 100644 --- a/csharp/src/Apache.Arrow/Arrays/PrimitiveArrayBuilder.cs +++ b/csharp/src/Apache.Arrow/Arrays/PrimitiveArrayBuilder.cs @@ -20,7 +20,7 @@ namespace Apache.Arrow { - public abstract class PrimitiveArrayBuilder : IArrowArrayBuilder + public abstract class PrimitiveArrayBuilder : IArrowArrayBuilder where TTo : struct where TArray : IArrowArray where TBuilder : class, IArrowArrayBuilder diff --git a/csharp/src/Apache.Arrow/Arrays/StringArray.cs b/csharp/src/Apache.Arrow/Arrays/StringArray.cs index a3ec596adc7ba..ab44805d8d1e9 100644 --- a/csharp/src/Apache.Arrow/Arrays/StringArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/StringArray.cs @@ -22,7 +22,7 @@ namespace Apache.Arrow { - public class StringArray: BinaryArray, IReadOnlyList + public class StringArray: BinaryArray, IReadOnlyList, ICollection { public static readonly Encoding DefaultEncoding = Encoding.UTF8; @@ -164,5 +164,30 @@ IEnumerator IEnumerable.GetEnumerator() } IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(string item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(string item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(string item) + { + for (int index = 0; index < Length; index++) + { + if (GetString(index) == item) + return true; + } + + return false; + } + + void ICollection.CopyTo(string[] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetString(srcIndex); + } + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/Time32Array.cs b/csharp/src/Apache.Arrow/Arrays/Time32Array.cs index e9c2d7a4d9b28..63c0898935ba5 100644 --- a/csharp/src/Apache.Arrow/Arrays/Time32Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Time32Array.cs @@ -26,7 +26,7 @@ namespace Apache.Arrow /// public class Time32Array : PrimitiveArray #if NET6_0_OR_GREATER - , IReadOnlyList + , IReadOnlyList, ICollection #endif { /// @@ -171,6 +171,31 @@ public Time32Array(ArrayData data) yield return GetTime(index); }; } + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(TimeOnly? item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(TimeOnly? item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(TimeOnly? 
item) + { + for (int index = 0; index < Length; index++) + { + if (GetTime(index).Equals(item)) + return true; + } + + return false; + } + + void ICollection.CopyTo(TimeOnly?[] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetTime(srcIndex); + } + } #endif } } diff --git a/csharp/src/Apache.Arrow/Arrays/Time64Array.cs b/csharp/src/Apache.Arrow/Arrays/Time64Array.cs index fc18dfb8bf726..5518462952050 100644 --- a/csharp/src/Apache.Arrow/Arrays/Time64Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Time64Array.cs @@ -26,7 +26,7 @@ namespace Apache.Arrow /// public class Time64Array : PrimitiveArray #if NET6_0_OR_GREATER - , IReadOnlyList + , IReadOnlyList, ICollection #endif { /// @@ -162,6 +162,31 @@ public Time64Array(ArrayData data) yield return GetTime(index); }; } + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(TimeOnly? item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(TimeOnly? item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(TimeOnly? item) + { + for (int index = 0; index < Length; index++) + { + if (GetTime(index).Equals(item)) + return true; + } + + return false; + } + + void ICollection.CopyTo(TimeOnly?[] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetTime(srcIndex); + } + } #endif } } diff --git a/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs b/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs index ccb656854a5df..b83860584707e 100644 --- a/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs @@ -21,7 +21,7 @@ namespace Apache.Arrow { - public class TimestampArray : PrimitiveArray, IReadOnlyList + public class TimestampArray : PrimitiveArray, IReadOnlyList, ICollection { private static readonly DateTimeOffset s_epoch = new DateTimeOffset(1970, 1, 1, 0, 0, 0, 0, TimeSpan.Zero); @@ -157,5 +157,30 @@ public DateTimeOffset GetTimestampUnchecked(int index) yield return GetTimestamp(index); }; } + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(DateTimeOffset? item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(DateTimeOffset? item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(DateTimeOffset? 
item) + { + for (int index = 0; index < Length; index++) + { + if (GetTimestamp(index).Equals(item)) + return true; + } + + return false; + } + + void ICollection.CopyTo(DateTimeOffset?[] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetTimestamp(srcIndex); + } + } } } diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs index b11479c0d4460..c66569afeba85 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs @@ -148,7 +148,7 @@ public void VisitArray(IArrowArray array) public void Visit(MonthDayNanosecondIntervalArray array) => VisitPrimitiveArray(array); private void VisitPrimitiveArray(PrimitiveArray array) - where T : struct + where T : struct, IEquatable { _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length)); _buffers.Add(CreateSlicedBuffer(array.ValueBuffer, array.Offset, array.Length)); diff --git a/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs b/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs index 31a5676f01315..7232f74b8bec6 100644 --- a/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs +++ b/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs @@ -908,8 +908,8 @@ private static byte[] ConvertHexStringToByteArray(string hexString) }; private void GenerateArray(Func createArray) + where T : struct, IEquatable where TArray : PrimitiveArray - where T : struct { ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); @@ -929,8 +929,8 @@ private void GenerateArray(Func(Func createArray, Func parse) + where T : struct, IEquatable where TArray : PrimitiveArray - where T : struct { ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); @@ -950,8 +950,8 @@ private void GenerateLongArray(Func(Func createArray, Func construct) + where T : struct, IEquatable where TArray : PrimitiveArray - where T : struct { ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs index 682ebec323dc0..d3032b8d4ac40 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs @@ -101,9 +101,9 @@ public void EnumerateArray() { var array = new Int64Array.Builder().Append(1).Append(2).Build(); - foreach(long? foo in (IEnumerable)array) + foreach(long? 
foo in array) { - Assert.InRange(foo.Value, 1, 2); + Assert.InRange(foo!.Value, 1, 2); } foreach (object foo in (IEnumerable)array) @@ -115,12 +115,145 @@ public void EnumerateArray() [Fact] public void ArrayAsReadOnlyList() { - Int64Array array = new Int64Array.Builder().Append(1).Append(2).Build(); - var readOnlyList = (IReadOnlyList)array; + TestArrayAsReadOnlyList([1, 2]); + TestArrayAsReadOnlyList([1, 2]); + TestArrayAsReadOnlyList([true, false]); + TestArrayAsReadOnlyList([DateTime.MinValue.Date, DateTime.MaxValue.Date]); + TestArrayAsReadOnlyList([DateTime.MinValue.Date, DateTime.MaxValue.Date]); + TestArrayAsReadOnlyList([DateTimeOffset.MinValue, DateTimeOffset.MinValue.AddYears(100)]); + +#if NET5_0_OR_GREATER + TestArrayAsReadOnlyList([DateOnly.MinValue, DateOnly.MaxValue]); + TestArrayAsReadOnlyList([DateOnly.MinValue, DateOnly.MaxValue]); + TestArrayAsReadOnlyList([TimeOnly.MinValue, TimeOnly.MinValue.AddHours(23)]); + TestArrayAsReadOnlyList([TimeOnly.MinValue, TimeOnly.MaxValue]); + TestArrayAsReadOnlyList([(Half)1.1, (Half)2.2f]); +#endif + } + + // Parameter 'values' must contain two distinct values + private static void TestArrayAsReadOnlyList(IReadOnlyList values) + where T : struct + where TArray : IArrowArray + where TArrayBuilder : IArrowArrayBuilder, new() + { + Assert.Equal(2, values.Count); + TArray array = new TArrayBuilder().Append(values[0]).AppendNull().Append(values[1]).Build(default); + Assert.NotNull(array); + var readOnlyList = (IReadOnlyList)array; Assert.Equal(array.Length, readOnlyList.Count); - Assert.Equal(readOnlyList[0], 1); - Assert.Equal(readOnlyList[1], 2); + Assert.Equal(3, readOnlyList.Count); + Assert.Equal(values[0], readOnlyList[0]); + Assert.Null(readOnlyList[1]); + Assert.Equal(values[1], readOnlyList[2]); + } + + [Fact] + public void ArrayAsCollection() + { + TestPrimitiveArrayAsCollection([1, 2, 3, 4]); + TestPrimitiveArrayAsCollection([1, 2, 3, 4]); + TestPrimitiveArrayAsCollection([true, true, true, false]); + TestPrimitiveArrayAsCollection([DateTime.MinValue.Date, DateTime.MaxValue.Date, DateTime.Today, DateTime.Today]); + TestPrimitiveArrayAsCollection([DateTime.MinValue.Date, DateTime.MaxValue.Date, DateTime.Today, DateTime.Today]); + TestPrimitiveArrayAsCollection([DateTimeOffset.MinValue, DateTimeOffset.MinValue.AddYears(100), DateTimeOffset.Now, DateTimeOffset.UtcNow]); + +#if NET5_0_OR_GREATER + TestPrimitiveArrayAsCollection([DateOnly.MinValue, DateOnly.MaxValue, DateOnly.FromDayNumber(1), DateOnly.FromDayNumber(2)]); + TestPrimitiveArrayAsCollection([DateOnly.MinValue, DateOnly.MaxValue, DateOnly.FromDayNumber(1), DateOnly.FromDayNumber(2)]); + TestPrimitiveArrayAsCollection([TimeOnly.MinValue, TimeOnly.MinValue.AddHours(23), TimeOnly.MinValue.AddHours(1), TimeOnly.MinValue.AddHours(2)]); + TestPrimitiveArrayAsCollection([TimeOnly.MinValue, TimeOnly.MaxValue, TimeOnly.MinValue.AddHours(1), TimeOnly.MinValue.AddHours(2)]); + TestPrimitiveArrayAsCollection([(Half)1.1, (Half)2.2f, (Half)3.3f, (Half)4.4f]); +#endif + + byte[][] byteArrs = [new byte[1], [], [255], new byte[2]]; + TestObjectArrayAsCollection(new BinaryArray.Builder().Append(byteArrs[0].AsEnumerable()).AppendNull().Append(byteArrs[1].AsEnumerable()).Append(byteArrs[0].AsEnumerable()).Build(), System.Array.Empty(), byteArrs); + + string[] strings = ["abc", "abd", "acd", "adc"]; + TestObjectArrayAsCollection(new StringArray.Builder().Append(strings[0]).AppendNull().Append(strings[1]).Append(strings[0]).Build(), null, strings); + } + + // Parameter 'values' must contain 
four values. The last value must be distinct from the rest. + private static void TestPrimitiveArrayAsCollection(IReadOnlyList values) + where T : struct + where TArray : IArrowArray, ICollection + where TArrayBuilder : IArrowArrayBuilder, new() + { + Assert.Equal(4, values.Count); + TArray array = new TArrayBuilder().Append(values[0]).AppendNull().Append(values[1]).Append(values[0]).Build(default); + Assert.NotNull(array); + var collection = (ICollection)array; + + Assert.Equal(array.Length, collection.Count); + Assert.Equal(4, collection.Count); + Assert.True(collection.IsReadOnly); + + Assert.Equal("Collection is read-only.", Assert.Throws(() => collection.Add(values[3])).Message); + Assert.Equal("Collection is read-only.", Assert.Throws(() => collection.Remove(values[3])).Message); + Assert.Equal("Collection is read-only.", Assert.Throws(collection.Clear).Message); + + Assert.True(collection.Contains(values[0])); + Assert.True(collection.Contains(values[1])); + Assert.True(collection.Contains(default)); + Assert.False(collection.Contains(values[3])); + + T sentinel = values[2]; + T?[] destArr = { sentinel, sentinel, sentinel, sentinel, sentinel, sentinel }; + collection.CopyTo(destArr, 1); + Assert.Equal(sentinel, destArr[0]); + Assert.Equal(values[0], destArr[1]); + Assert.Null(destArr[2]); + Assert.Equal(values[1], destArr[3]); + Assert.Equal(values[0], destArr[4]); + Assert.Equal(sentinel, destArr[0]); + } + + // Parameter 'values' must contain four values. The last value must be distinct from the rest. + private static void TestObjectArrayAsCollection(TArray array, T nullValue, IReadOnlyList values) + where T : class + where TArray : IArrowArray, ICollection + { + Assert.NotNull(array); + Assert.Equal(4, values.Count); + var collection = (ICollection)array; + + Assert.Equal(array.Length, collection.Count); + Assert.Equal(4, collection.Count); + Assert.True(collection.IsReadOnly); + + Assert.Equal("Collection is read-only.", Assert.Throws(() => collection.Add(values[3])).Message); + Assert.Equal("Collection is read-only.", Assert.Throws(() => collection.Remove(values[3])).Message); + Assert.Equal("Collection is read-only.", Assert.Throws(collection.Clear).Message); + + Assert.True(collection.Contains(values[0])); + Assert.True(collection.Contains(values[1])); + Assert.True(collection.Contains(default)); + Assert.False(collection.Contains(values[3])); + + T sentinel = values[2]; + T?[] destArr = { sentinel, sentinel, sentinel, sentinel, sentinel, sentinel }; + collection.CopyTo(destArr, 1); + Assert.Equal(sentinel, destArr[0]); + Assert.Equal(values[0], destArr[1]); + Assert.Equal(nullValue, destArr[2]); + Assert.Equal(values[1], destArr[3]); + Assert.Equal(values[0], destArr[4]); + Assert.Equal(sentinel, destArr[0]); + } + + [Fact] + public void ContainsDoesNotMatchDefaultValueInArrayWithNullValue() + { + Int64Array array = new Int64Array.Builder().Append(1).Append(2).AppendNull().Build(); + Assert.NotNull(array); + var collection = (ICollection)array; + + Assert.True(collection.Contains(1)); + Assert.True(collection.Contains(2)); + Assert.True(collection.Contains(default)); + // A null value is stored as a null bit in the null bitmap, and a default value in the value buffer. Check that we do not match the default value. 
+ Assert.False(collection.Contains(0)); } [Fact] diff --git a/csharp/test/Apache.Arrow.Tests/Date32ArrayTests.cs b/csharp/test/Apache.Arrow.Tests/Date32ArrayTests.cs index 2a674b942c17b..6e4742cad06f2 100644 --- a/csharp/test/Apache.Arrow.Tests/Date32ArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/Date32ArrayTests.cs @@ -131,7 +131,7 @@ public void AppendGivesUtcDate(DateTimeOffset dateTimeOffset) public class AppendDateOnly { [Theory] - [MemberData(nameof(GetDateOnlyData), MemberType = typeof(Date64ArrayTests))] + [MemberData(nameof(GetDateOnlyData), MemberType = typeof(Date32ArrayTests))] public void AppendDateGivesSameDate(DateOnly date) { // Arrange diff --git a/csharp/test/Apache.Arrow.Tests/Extensions/DateTimeOffsetExtensions.cs b/csharp/test/Apache.Arrow.Tests/Extensions/DateTimeOffsetExtensions.cs index 4375c39cdfaf6..01809735d14c9 100644 --- a/csharp/test/Apache.Arrow.Tests/Extensions/DateTimeOffsetExtensions.cs +++ b/csharp/test/Apache.Arrow.Tests/Extensions/DateTimeOffsetExtensions.cs @@ -14,8 +14,6 @@ // limitations under the License. using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Tests { diff --git a/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs b/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs index 712a87a252b6c..c603ef63a4d3e 100644 --- a/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs @@ -110,7 +110,7 @@ private static void CompareValue(UnionArray originalArray, int originalIndex, Un } private static void CompareFieldValue(byte typeId, UnionArray originalArray, int originalIndex, UnionArray slicedArray, int sliceIndex) - where T: struct + where T : struct, IEquatable where TArray : PrimitiveArray { if (originalArray is DenseUnionArray denseOriginalArray) From c555488c56c8de6d8020c8460b3b87081f7fb49d Mon Sep 17 00:00:00 2001 From: Jacob Wujciak-Jens Date: Tue, 14 May 2024 01:15:20 +0100 Subject: [PATCH 082/105] GH-41630: [Benchmarking] Fix out-of-source build in benchmarks (#41631) ### Rationale for this change Broken benchmarks after #41455 ### What changes are included in this PR? Use /tmp/arrow as build dir. ### Are these changes tested? ### Are there any user-facing changes? * GitHub Issue: #41630 Authored-by: Jacob Wujciak-Jens Signed-off-by: Jacob Wujciak-Jens --- dev/conbench_envs/hooks.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dev/conbench_envs/hooks.sh b/dev/conbench_envs/hooks.sh index a77189764aed3..0745357d2c0d3 100755 --- a/dev/conbench_envs/hooks.sh +++ b/dev/conbench_envs/hooks.sh @@ -59,7 +59,8 @@ build_arrow_cpp() { } build_arrow_python() { - ci/scripts/python_build.sh $(pwd) $(pwd) + mkdir -p /tmp/arrow + ci/scripts/python_build.sh $(pwd) /tmp/arrow } build_arrow_r() { @@ -69,7 +70,8 @@ build_arrow_r() { } build_arrow_java() { - ci/scripts/java_build.sh $(pwd) $(pwd) + mkdir -p /tmp/arrow + ci/scripts/java_build.sh $(pwd) /tmp/arrow } install_archery() { From fc7c723babce0bb6aae3e2b9653296cdb508578d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 14 May 2024 09:25:09 +0900 Subject: [PATCH 083/105] MINOR: [Go] Bump golang.org/x/tools from 0.20.0 to 0.21.0 in /go (#41639) Bumps [golang.org/x/tools](https://github.com/golang/tools) from 0.20.0 to 0.21.0.
Commits
  • cc29c91 go.mod: update golang.org/x dependencies
  • 397fef9 gopls/internal/protocol: add links to LSP spec
  • e2a352c internal/refactor/inline: extensible API
  • c16c816 go/analysis/passes/stdversion: test *.go < go.mod version
  • 629a7be go/analysis/analysistest: stricter errors and GOWORK setting
  • 4db1697 go/packages/packagestest: fold modules_111.go into modules.go
  • ccdef3c gopls/internal/golang: fix nil panic in InlayHint
  • 74c9cfe go/analysis: add Pass.ReadFile
  • 5ef4fc9 gopls/internal/golang/completion: fix the isEmptyInterface predicate
  • 77f691b internal/gcimporter: use Alias.Rhs, not unsafe hack
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=golang.org/x/tools&package-manager=go_modules&previous-version=0.20.0&new-version=0.21.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go/go.mod | 6 +++--- go/go.sum | 16 ++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/go/go.mod b/go/go.mod index 7c14ddcf9e216..8fdfea3dbe5eb 100644 --- a/go/go.mod +++ b/go/go.mod @@ -37,7 +37,7 @@ require ( golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 golang.org/x/sync v0.7.0 golang.org/x/sys v0.20.0 - golang.org/x/tools v0.20.0 + golang.org/x/tools v0.21.0 golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 gonum.org/v1/gonum v0.15.0 google.golang.org/grpc v1.63.2 @@ -75,8 +75,8 @@ require ( github.com/tidwall/match v1.1.1 // indirect github.com/tidwall/pretty v1.2.0 // indirect golang.org/x/mod v0.17.0 // indirect - golang.org/x/net v0.24.0 // indirect - golang.org/x/text v0.14.0 // indirect + golang.org/x/net v0.25.0 // indirect + golang.org/x/text v0.15.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de // indirect gopkg.in/yaml.v3 v3.0.1 // indirect modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6 // indirect diff --git a/go/go.sum b/go/go.sum index 70e3a533d03f3..c2db1a72ccf2d 100644 --- a/go/go.sum +++ b/go/go.sum @@ -111,14 +111,14 @@ github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= -golang.org/x/crypto v0.22.0 h1:g1v0xeRhjcugydODzvb3mEM9SQ0HGp9s/nh3COQ/C30= -golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M= +golang.org/x/crypto v0.23.0 h1:dIJU/v2J8Mdglj/8rJ6UUOM3Zc9zLZxVZwwxMooUSAI= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 h1:LfspQV/FYTatPTr/3HzIcmiUFH7PGP+OQ6mgDYo3yuQ= golang.org/x/exp v0.0.0-20240222234643-814bf88cf225/go.mod h1:CxmFvTBINI24O/j8iY7H1xHzx2i4OsyguNBmN/uPtqc= golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= -golang.org/x/net v0.24.0 h1:1PcaxkF854Fu3+lvBIx5SYn9wRlBzzcnHZSiaFFAb0w= -golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8= +golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= @@ -126,10 +126,10 @@ golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= -golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -golang.org/x/tools v0.20.0 h1:hz/CVckiOxybQvFw6h7b/q80NTr9IUQb4s1IIzW7KNY= -golang.org/x/tools v0.20.0/go.mod h1:WvitBU7JJf6A4jOdg4S1tviW9bhUxkgeCui/0JHctQg= 
+golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/tools v0.21.0 h1:qc0xYgIbsSDt9EyWz05J5wfa7LOVW0YTLOXrqdLAWIw= +golang.org/x/tools v0.21.0/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 h1:+cNy6SZtPcJQH3LJVLOSmiC7MMxXNOb3PU/VUEz+EhU= golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90= gonum.org/v1/gonum v0.15.0 h1:2lYxjRbTYyxkJxlhC+LvJIx3SsANPdRybu1tGj9/OrQ= From fd84ec0b1a6bc5345de089e01cc9e8d235c458b6 Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Tue, 14 May 2024 05:20:45 -0300 Subject: [PATCH 084/105] GH-39129 [Python] pa.array: add check for byte-swapped numpy arrays inside python objects (#41549) ### What changes are included in this PR? This PR introduces a check to verify if the dtype of the input numpy array is byte-swapped. If it is, a not-implemented exception is raised. This precaution prevents the data from being cast incorrectly as if it were in the correct byte order, which would lead to wrong data values. ### Are these changes tested? I added a new test to check if not-implemented exception is raised - for both old (primitive types) and new (composed types) code. ### Are there any user-facing changes? No changes in API, but old code which gave incorrect results now would fail with a not-implemented exception * GitHub Issue: #39129 Authored-by: Konstantin Malanchev Signed-off-by: Joris Van den Bossche --- .../src/arrow/python/python_to_arrow.cc | 4 ++++ python/pyarrow/tests/test_array.py | 24 +++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 79da47567bf24..a2a325fde8dbd 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -874,6 +874,10 @@ class PyListConverter : public ListConverter { if (PyArray_NDIM(ndarray) != 1) { return Status::Invalid("Can only convert 1-dimensional array values"); } + if (PyArray_ISBYTESWAPPED(ndarray)) { + // TODO + return Status::NotImplemented("Byte-swapped arrays not supported"); + } const int64_t size = PyArray_SIZE(ndarray); RETURN_NOT_OK(AppendTo(this->list_type_, size)); RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size)); diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index dbe29c5730758..f1f946ecc7dfb 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3920,3 +3920,27 @@ def test_list_view_slice(list_view_type): j = sliced_array.offsets[1].as_py() assert sliced_array[0].as_py() == sliced_array.values[i:j].to_pylist() == [4] + + +@pytest.mark.parametrize('numpy_native_dtype', ['u2', 'i4', 'f8']) +def test_swapped_byte_order_fails(numpy_native_dtype): + # ARROW-39129 + + numpy_swapped_dtype = np.dtype(numpy_native_dtype).newbyteorder() + np_arr = np.arange(10, dtype=numpy_swapped_dtype) + + # Primitive type array, type is inferred from the numpy array + with pytest.raises(pa.ArrowNotImplementedError): + pa.array(np_arr) + + # Primitive type array, type is explicitly provided + with pytest.raises(pa.ArrowNotImplementedError): + pa.array(np_arr, type=pa.float64()) + + # List type array + with pytest.raises(pa.ArrowNotImplementedError): + pa.array([np_arr]) + + # Struct type array + with 
pytest.raises(pa.ArrowNotImplementedError): + pa.StructArray.from_arrays([np_arr], names=['a']) From d7c22601e7046bdcdc3b59eeb82be6ead2c96460 Mon Sep 17 00:00:00 2001 From: a-reich <73507369+a-reich@users.noreply.github.com> Date: Tue, 14 May 2024 07:47:55 -0400 Subject: [PATCH 085/105] GH-41464: [Python] Fix StructArray.sort() for by=None (#41495) ### Rationale for this change Closes issue https://github.com/apache/arrow/issues/41464. Fix `StructArray.sort` method's `by` param to work in the case of `by=None` which was documented to mean sort by all fields (the default), but would raise an exception. ### What changes are included in this PR? * Add a unit test with by=None in `test_struct_array_sort` that fails on main * Fix the sort method ### Are these changes tested? yes ### Are there any user-facing changes? yes * GitHub Issue: #41464 Authored-by: a-reich Signed-off-by: Joris Van den Bossche --- python/pyarrow/array.pxi | 7 +++---- python/pyarrow/tests/test_array.py | 8 ++++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 946c82b258241..406830ad4dd69 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3920,12 +3920,11 @@ cdef class StructArray(Array): result : StructArray """ if by is not None: - tosort = self._flattened_field(by) + tosort, sort_keys = self._flattened_field(by), [("", order)] else: - tosort = self + tosort, sort_keys = self, [(field.name, order) for field in self.type] indices = _pc().sort_indices( - tosort, - options=_pc().SortOptions(sort_keys=[("", order)], **kwargs) + tosort, options=_pc().SortOptions(sort_keys=sort_keys, **kwargs) ) return self.take(indices) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index f1f946ecc7dfb..b89e0ace157af 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3536,6 +3536,14 @@ def test_struct_array_sort(): {"a": 5, "b": "foo"}, ] + sorted_arr = arr.sort() + assert sorted_arr.to_pylist() == [ + {"a": 5, "b": "foo"}, + {"a": 7, "b": "bar"}, + {"a": 7, "b": "car"}, + {"a": 35, "b": "foobar"}, + ] + arr_with_nulls = pa.StructArray.from_arrays([ pa.array([5, 7, 7, 35], type=pa.int64()), pa.array(["foo", "car", "bar", "foobar"]) From e6ab174e20137d62b33e4373f5fbd3c435948036 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Tue, 14 May 2024 20:56:49 +0800 Subject: [PATCH 086/105] GH-41329: [C++][Gandiva] Fix gandiva cache size env var (#41330) ### Rationale for this change Gandiva cache size validity checks are not robust enough (the negativity test is broken), and they are not currently tested. ### What changes are included in this PR? 1. Fix checking gandiva cache size env var. 2. Make cache size static so it only gets evaluated once. 3. Add test cases. 4. Enrich the description in the document about this env var. ### Are these changes tested? UT included. ### Are there any user-facing changes? None. 
* GitHub Issue: #41329 Lead-authored-by: Ruoxi Sun Co-authored-by: Rossi Sun Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/gandiva/cache.cc | 47 +++++++++++++++--------- cpp/src/gandiva/cache.h | 15 +++++++- cpp/src/gandiva/cache_test.cc | 68 ++++++++++++++++++++++++++++++++++- docs/source/cpp/env_vars.rst | 4 +++ 4 files changed, 116 insertions(+), 18 deletions(-) diff --git a/cpp/src/gandiva/cache.cc b/cpp/src/gandiva/cache.cc index a1333ccdc5d43..2358b08c82424 100644 --- a/cpp/src/gandiva/cache.cc +++ b/cpp/src/gandiva/cache.cc @@ -20,26 +20,41 @@ #include "arrow/result.h" #include "arrow/util/io_util.h" #include "arrow/util/logging.h" +#include "arrow/util/value_parsing.h" namespace gandiva { -static const size_t DEFAULT_CACHE_SIZE = 5000; - -int GetCapacity() { - size_t capacity = DEFAULT_CACHE_SIZE; - auto maybe_env_cache_size = ::arrow::internal::GetEnvVar("GANDIVA_CACHE_SIZE"); - if (maybe_env_cache_size.ok()) { - const auto env_cache_size = *std::move(maybe_env_cache_size); - if (!env_cache_size.empty()) { - capacity = std::atol(env_cache_size.c_str()); - if (capacity <= 0) { - ARROW_LOG(WARNING) << "Invalid cache size provided in GANDIVA_CACHE_SIZE. " - << "Using default cache size: " << DEFAULT_CACHE_SIZE; - capacity = DEFAULT_CACHE_SIZE; - } - } +constexpr auto kCacheCapacityEnvVar = "GANDIVA_CACHE_SIZE"; +constexpr auto kDefaultCacheSize = 5000; + +namespace internal { +int GetCacheCapacityFromEnvVar() { + auto maybe_env_value = ::arrow::internal::GetEnvVar(kCacheCapacityEnvVar); + if (!maybe_env_value.ok()) { + return kDefaultCacheSize; + } + const auto env_value = *std::move(maybe_env_value); + if (env_value.empty()) { + return kDefaultCacheSize; + } + int capacity = 0; + bool ok = ::arrow::internal::ParseValue<::arrow::Int32Type>( + env_value.c_str(), env_value.size(), &capacity); + if (!ok || capacity <= 0) { + ARROW_LOG(WARNING) << "Invalid cache size provided in " << kCacheCapacityEnvVar + << ". Using default cache size: " << kDefaultCacheSize; + return kDefaultCacheSize; } - return static_cast(capacity); + return capacity; +} +} // namespace internal + +// Deprecated in 17.0.0. Use GetCacheCapacity instead. +int GetCapacity() { return GetCacheCapacity(); } + +int GetCacheCapacity() { + static const int capacity = internal::GetCacheCapacityFromEnvVar(); + return capacity; } void LogCacheSize(size_t capacity) { diff --git a/cpp/src/gandiva/cache.h b/cpp/src/gandiva/cache.h index 7cff9b02692ae..c19dbb7a0e30e 100644 --- a/cpp/src/gandiva/cache.h +++ b/cpp/src/gandiva/cache.h @@ -20,14 +20,27 @@ #include #include +#include "arrow/util/macros.h" #include "gandiva/lru_cache.h" #include "gandiva/visibility.h" namespace gandiva { +namespace internal { +// Only called once by GetCacheCapacity(). +// Do the actual work of getting the cache capacity from env var. +// Also makes the testing easier. +GANDIVA_EXPORT +int GetCacheCapacityFromEnvVar(); +} // namespace internal + +ARROW_DEPRECATED("Deprecated in 17.0.0. 
Use GetCacheCapacity instead.") GANDIVA_EXPORT int GetCapacity(); +GANDIVA_EXPORT +int GetCacheCapacity(); + GANDIVA_EXPORT void LogCacheSize(size_t capacity); @@ -36,7 +49,7 @@ class Cache { public: explicit Cache(size_t capacity) : cache_(capacity) { LogCacheSize(capacity); } - Cache() : Cache(GetCapacity()) {} + Cache() : Cache(GetCacheCapacity()) {} ValueType GetObjectCode(const KeyType& cache_key) { std::optional result; diff --git a/cpp/src/gandiva/cache_test.cc b/cpp/src/gandiva/cache_test.cc index a146707079fa6..96cf4a12e587a 100644 --- a/cpp/src/gandiva/cache_test.cc +++ b/cpp/src/gandiva/cache_test.cc @@ -16,10 +16,14 @@ // under the License. #include "gandiva/cache.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/util/io_util.h" +#include "arrow/util/logging.h" #include namespace gandiva { + class TestCacheKey { public: explicit TestCacheKey(int value) : value_(value) {} @@ -38,5 +42,67 @@ TEST(TestCache, TestGetPut) { ASSERT_EQ(cache.GetObjectCode(TestCacheKey(2)), "world"); } -TEST(TestCache, TestGetCacheCapacity) { ASSERT_EQ(GetCapacity(), 5000); } +namespace { +constexpr auto cache_capacity_env_var = "GANDIVA_CACHE_SIZE"; +constexpr auto default_cache_capacity = 5000; +} // namespace + +TEST(TestCache, TestGetCacheCapacityDefault) { + ASSERT_EQ(GetCacheCapacity(), default_cache_capacity); +} + +TEST(TestCache, TestGetCacheCapacityEnvVar) { + using ::arrow::EnvVarGuard; + + // Empty. + { + EnvVarGuard guard(cache_capacity_env_var, ""); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } + + // Non-number. + { + EnvVarGuard guard(cache_capacity_env_var, "invalid"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } + + // Number with invalid suffix. + { + EnvVarGuard guard(cache_capacity_env_var, "42MB"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } + + // Valid positive number. + { + EnvVarGuard guard(cache_capacity_env_var, "42"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), 42); + } + + // Int max. + { + auto str = std::to_string(std::numeric_limits::max()); + EnvVarGuard guard(cache_capacity_env_var, str.c_str()); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), std::numeric_limits::max()); + } + + // Zero. + { + EnvVarGuard guard(cache_capacity_env_var, "0"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } + + // Negative number. + { + EnvVarGuard guard(cache_capacity_env_var, "-1"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } + + // Over int max. + { + auto str = std::to_string(static_cast(std::numeric_limits::max()) + 1); + EnvVarGuard guard(cache_capacity_env_var, str.c_str()); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } +} + } // namespace gandiva diff --git a/docs/source/cpp/env_vars.rst b/docs/source/cpp/env_vars.rst index 116c151824c75..0a082b0a5d859 100644 --- a/docs/source/cpp/env_vars.rst +++ b/docs/source/cpp/env_vars.rst @@ -181,6 +181,10 @@ that changing their value later will have an effect. The number of entries to keep in the Gandiva JIT compilation cache. The cache is in-memory and does not persist across processes. + The default cache size is 5000. The value of this environment variable + should be a positive integer and should not exceed the maximum value + of int32. Otherwise the default value is used. + .. envvar:: HADOOP_HOME The path to the Hadoop installation. 
From ada965ff8b93320105937f76815cb6ce6e5c855e Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Wed, 15 May 2024 00:03:24 +0800 Subject: [PATCH 087/105] GH-33484: [C++][Compute] Implement `Grouper::Reset` (#41352) ### Rationale for this change Recently I've been working on some improvement for `Grouper` and I found adding `Reset` function could be beneficial. Then I trace down to #33484 from a TODO in code. Here comes this PR. ### What changes are included in this PR? Add `Reset` function for all the concrete `Grouper` implementations, and eliminate the recreation of `Grouper` in `AnyKeysSegmenter`. Also add more `RowSegmenter` cases covering `AnyKeysSegmenter`. ### Are these changes tested? Yes. Legacy UTs should cover it well. Also added some new UTs. ### Are there any user-facing changes? None. * GitHub Issue: #33484 Lead-authored-by: Ruoxi Sun Co-authored-by: Rossi Sun Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/acero/hash_aggregate_test.cc | 172 ++++++++++++++++----- cpp/src/arrow/compute/row/grouper.cc | 43 ++++-- cpp/src/arrow/compute/row/grouper.h | 4 + 3 files changed, 168 insertions(+), 51 deletions(-) diff --git a/cpp/src/arrow/acero/hash_aggregate_test.cc b/cpp/src/arrow/acero/hash_aggregate_test.cc index 2626fd50379dd..d529f443319b9 100644 --- a/cpp/src/arrow/acero/hash_aggregate_test.cc +++ b/cpp/src/arrow/acero/hash_aggregate_test.cc @@ -592,6 +592,12 @@ void TestSegments(std::unique_ptr& segmenter, const ExecSpan& batc ASSERT_EQ(expected_segment, segment); offset = segment.offset + segment.length; } + // Assert next is the last (empty) segment. + ASSERT_OK_AND_ASSIGN(auto segment, segmenter->GetNextSegment(batch, offset)); + ASSERT_GE(segment.offset, batch.length); + ASSERT_EQ(segment.length, 0); + ASSERT_TRUE(segment.is_open); + ASSERT_TRUE(segment.extends); } Result> MakeGrouper(const std::vector& key_types) { @@ -682,48 +688,142 @@ TEST(RowSegmenter, Basics) { } TEST(RowSegmenter, NonOrdered) { - std::vector types = {int32()}; - auto batch = ExecBatchFromJSON(types, "[[1], [1], [2], [1], [2]]"); - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); - TestSegments(segmenter, ExecSpan(batch), - {{0, 2, false, true}, - {2, 1, false, false}, - {3, 1, false, false}, - {4, 1, true, false}, - {5, 0, true, true}}); + { + std::vector types = {int32()}; + auto batch = ExecBatchFromJSON(types, "[[1], [1], [2], [1], [2]]"); + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batch), + {{0, 2, false, true}, + {2, 1, false, false}, + {3, 1, false, false}, + {4, 1, true, false}, + {5, 0, true, true}}); + } + { + std::vector types = {int32(), int32()}; + auto batch = ExecBatchFromJSON(types, "[[1, 1], [1, 1], [2, 2], [1, 2], [2, 2]]"); + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batch), + {{0, 2, false, true}, + {2, 1, false, false}, + {3, 1, false, false}, + {4, 1, true, false}, + {5, 0, true, true}}); + } } TEST(RowSegmenter, EmptyBatches) { - std::vector types = {int32()}; - std::vector batches = { - ExecBatchFromJSON(types, "[]"), ExecBatchFromJSON(types, "[]"), - ExecBatchFromJSON(types, "[[1]]"), ExecBatchFromJSON(types, "[]"), - ExecBatchFromJSON(types, "[[1]]"), ExecBatchFromJSON(types, "[]"), - ExecBatchFromJSON(types, "[[2], [2]]"), ExecBatchFromJSON(types, "[]"), - }; - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); - TestSegments(segmenter, ExecSpan(batches[0]), {}); - TestSegments(segmenter, ExecSpan(batches[1]), 
{}); - TestSegments(segmenter, ExecSpan(batches[2]), {{0, 1, true, true}}); - TestSegments(segmenter, ExecSpan(batches[3]), {}); - TestSegments(segmenter, ExecSpan(batches[4]), {{0, 1, true, true}}); - TestSegments(segmenter, ExecSpan(batches[5]), {}); - TestSegments(segmenter, ExecSpan(batches[6]), {{0, 2, true, false}}); - TestSegments(segmenter, ExecSpan(batches[7]), {}); + { + std::vector types = {int32()}; + std::vector batches = { + ExecBatchFromJSON(types, "[]"), ExecBatchFromJSON(types, "[]"), + ExecBatchFromJSON(types, "[[1]]"), ExecBatchFromJSON(types, "[]"), + ExecBatchFromJSON(types, "[[1]]"), ExecBatchFromJSON(types, "[]"), + ExecBatchFromJSON(types, "[[2], [2]]"), ExecBatchFromJSON(types, "[]"), + }; + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batches[0]), {}); + TestSegments(segmenter, ExecSpan(batches[1]), {}); + TestSegments(segmenter, ExecSpan(batches[2]), {{0, 1, true, true}}); + TestSegments(segmenter, ExecSpan(batches[3]), {}); + TestSegments(segmenter, ExecSpan(batches[4]), {{0, 1, true, true}}); + TestSegments(segmenter, ExecSpan(batches[5]), {}); + TestSegments(segmenter, ExecSpan(batches[6]), {{0, 2, true, false}}); + TestSegments(segmenter, ExecSpan(batches[7]), {}); + } + { + std::vector types = {int32(), int32()}; + std::vector batches = { + ExecBatchFromJSON(types, "[]"), + ExecBatchFromJSON(types, "[]"), + ExecBatchFromJSON(types, "[[1, 1]]"), + ExecBatchFromJSON(types, "[]"), + ExecBatchFromJSON(types, "[[1, 1]]"), + ExecBatchFromJSON(types, "[]"), + ExecBatchFromJSON(types, "[[2, 2], [2, 2]]"), + ExecBatchFromJSON(types, "[]"), + }; + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batches[0]), {}); + TestSegments(segmenter, ExecSpan(batches[1]), {}); + TestSegments(segmenter, ExecSpan(batches[2]), {{0, 1, true, true}}); + TestSegments(segmenter, ExecSpan(batches[3]), {}); + TestSegments(segmenter, ExecSpan(batches[4]), {{0, 1, true, true}}); + TestSegments(segmenter, ExecSpan(batches[5]), {}); + TestSegments(segmenter, ExecSpan(batches[6]), {{0, 2, true, false}}); + TestSegments(segmenter, ExecSpan(batches[7]), {}); + } } TEST(RowSegmenter, MultipleSegments) { - std::vector types = {int32()}; - auto batch = ExecBatchFromJSON(types, "[[1], [1], [2], [5], [3], [3], [5], [5], [4]]"); - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); - TestSegments(segmenter, ExecSpan(batch), - {{0, 2, false, true}, - {2, 1, false, false}, - {3, 1, false, false}, - {4, 2, false, false}, - {6, 2, false, false}, - {8, 1, true, false}, - {9, 0, true, true}}); + { + std::vector types = {int32()}; + auto batch = + ExecBatchFromJSON(types, "[[1], [1], [2], [5], [3], [3], [5], [5], [4]]"); + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batch), + {{0, 2, false, true}, + {2, 1, false, false}, + {3, 1, false, false}, + {4, 2, false, false}, + {6, 2, false, false}, + {8, 1, true, false}, + {9, 0, true, true}}); + } + { + std::vector types = {int32(), int32()}; + auto batch = ExecBatchFromJSON( + types, + "[[1, 1], [1, 1], [2, 2], [5, 5], [3, 3], [3, 3], [5, 5], [5, 5], [4, 4]]"); + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batch), + {{0, 2, false, true}, + {2, 1, false, false}, + {3, 1, false, false}, + {4, 2, false, false}, + {6, 2, false, false}, + {8, 1, true, false}, + {9, 0, true, true}}); + } +} + +TEST(RowSegmenter, MultipleSegmentsMultipleBatches) { + { + 
std::vector types = {int32()}; + std::vector batches = { + ExecBatchFromJSON(types, "[[1]]"), ExecBatchFromJSON(types, "[[1], [2]]"), + ExecBatchFromJSON(types, "[[5], [3]]"), + ExecBatchFromJSON(types, "[[3], [5], [5]]"), ExecBatchFromJSON(types, "[[4]]")}; + + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batches[0]), {{0, 1, true, true}}); + TestSegments(segmenter, ExecSpan(batches[1]), + {{0, 1, false, true}, {1, 1, true, false}}); + TestSegments(segmenter, ExecSpan(batches[2]), + {{0, 1, false, false}, {1, 1, true, false}}); + TestSegments(segmenter, ExecSpan(batches[3]), + {{0, 1, false, true}, {1, 2, true, false}}); + TestSegments(segmenter, ExecSpan(batches[4]), {{0, 1, true, false}}); + } + { + std::vector types = {int32(), int32()}; + std::vector batches = { + ExecBatchFromJSON(types, "[[1, 1]]"), + ExecBatchFromJSON(types, "[[1, 1], [2, 2]]"), + ExecBatchFromJSON(types, "[[5, 5], [3, 3]]"), + ExecBatchFromJSON(types, "[[3, 3], [5, 5], [5, 5]]"), + ExecBatchFromJSON(types, "[[4, 4]]")}; + + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batches[0]), {{0, 1, true, true}}); + TestSegments(segmenter, ExecSpan(batches[1]), + {{0, 1, false, true}, {1, 1, true, false}}); + TestSegments(segmenter, ExecSpan(batches[2]), + {{0, 1, false, false}, {1, 1, true, false}}); + TestSegments(segmenter, ExecSpan(batches[3]), + {{0, 1, false, true}, {1, 2, true, false}}); + TestSegments(segmenter, ExecSpan(batches[4]), {{0, 1, true, false}}); + } } namespace { diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc index 756c70967ac6f..50ca20bd14f31 100644 --- a/cpp/src/arrow/compute/row/grouper.cc +++ b/cpp/src/arrow/compute/row/grouper.cc @@ -217,18 +217,18 @@ struct SimpleKeySegmenter : public BaseRowSegmenter { struct AnyKeysSegmenter : public BaseRowSegmenter { static Result> Make( const std::vector& key_types, ExecContext* ctx) { - ARROW_RETURN_NOT_OK(Grouper::Make(key_types, ctx)); // check types - return std::make_unique(key_types, ctx); + ARROW_ASSIGN_OR_RAISE(auto grouper, Grouper::Make(key_types, ctx)); // check types + return std::make_unique(key_types, ctx, std::move(grouper)); } - AnyKeysSegmenter(const std::vector& key_types, ExecContext* ctx) + AnyKeysSegmenter(const std::vector& key_types, ExecContext* ctx, + std::unique_ptr grouper) : BaseRowSegmenter(key_types), - ctx_(ctx), - grouper_(nullptr), + grouper_(std::move(grouper)), save_group_id_(kNoGroupId) {} Status Reset() override { - grouper_ = nullptr; + ARROW_RETURN_NOT_OK(grouper_->Reset()); save_group_id_ = kNoGroupId; return Status::OK(); } @@ -245,7 +245,6 @@ struct AnyKeysSegmenter : public BaseRowSegmenter { // first row of a new segment to see if it extends the previous segment. 
template Result MapGroupIdAt(const Batch& batch, int64_t offset) { - if (!grouper_) return kNoGroupId; ARROW_ASSIGN_OR_RAISE(auto datum, grouper_->Consume(batch, offset, /*length=*/1)); if (!datum.is_array()) { @@ -264,9 +263,6 @@ struct AnyKeysSegmenter : public BaseRowSegmenter { if (offset == batch.length) { return MakeSegment(batch.length, offset, 0, kEmptyExtends); } - // ARROW-18311: make Grouper support Reset() - // so it can be reset instead of recreated below - // // the group id must be computed prior to resetting the grouper, since it is compared // to save_group_id_, and after resetting the grouper produces incomparable group ids ARROW_ASSIGN_OR_RAISE(auto group_id, MapGroupIdAt(batch, offset)); @@ -276,7 +272,7 @@ struct AnyKeysSegmenter : public BaseRowSegmenter { return extends; }; // resetting drops grouper's group-ids, freeing-up memory for the next segment - ARROW_ASSIGN_OR_RAISE(grouper_, Grouper::Make(key_types_, ctx_)); // TODO: reset it + ARROW_RETURN_NOT_OK(grouper_->Reset()); // GH-34475: cache the grouper-consume result across invocations of GetNextSegment ARROW_ASSIGN_OR_RAISE(auto datum, grouper_->Consume(batch, offset)); if (datum.is_array()) { @@ -299,7 +295,6 @@ struct AnyKeysSegmenter : public BaseRowSegmenter { } private: - ExecContext* const ctx_; std::unique_ptr grouper_; group_id_t save_group_id_; }; @@ -354,6 +349,7 @@ struct GrouperNoKeysImpl : Grouper { RETURN_NOT_OK(builder->Finish(&array)); return std::move(array); } + Status Reset() override { return Status::OK(); } Result Consume(const ExecSpan& batch, int64_t offset, int64_t length) override { ARROW_ASSIGN_OR_RAISE(auto array, MakeConstantGroupIdArray(length, 0)); return Datum(array); @@ -419,6 +415,14 @@ struct GrouperImpl : public Grouper { return std::move(impl); } + Status Reset() override { + map_.clear(); + offsets_.clear(); + key_bytes_.clear(); + num_groups_ = 0; + return Status::OK(); + } + Result Consume(const ExecSpan& batch, int64_t offset, int64_t length) override { ARROW_RETURN_NOT_OK(CheckAndCapLengthForConsume(batch.length, offset, &length)); if (offset != 0 || length != batch.length) { @@ -595,7 +599,17 @@ struct GrouperFastImpl : public Grouper { return std::move(impl); } - ~GrouperFastImpl() { map_.cleanup(); } + Status Reset() override { + rows_.Clean(); + rows_minibatch_.Clean(); + map_.cleanup(); + RETURN_NOT_OK(map_.init(encode_ctx_.hardware_flags, ctx_->memory_pool())); + // TODO: It is now assumed that the dictionaries_ are identical to the first batch + // throughout the grouper's lifespan so no resetting is needed. But if we want to + // support different dictionaries for different batches, we need to reset the + // dictionaries_ here. 
+ return Status::OK(); + } Result Consume(const ExecSpan& batch, int64_t offset, int64_t length) override { ARROW_RETURN_NOT_OK(CheckAndCapLengthForConsume(batch.length, offset, &length)); @@ -838,8 +852,7 @@ struct GrouperFastImpl : public Grouper { return out; } - static constexpr int log_minibatch_max_ = 10; - static constexpr int minibatch_size_max_ = 1 << log_minibatch_max_; + static constexpr int minibatch_size_max_ = arrow::util::MiniBatch::kMiniBatchLength; static constexpr int minibatch_size_min_ = 128; int minibatch_size_; diff --git a/cpp/src/arrow/compute/row/grouper.h b/cpp/src/arrow/compute/row/grouper.h index 628a9c14f3e44..a883fb938ddaf 100644 --- a/cpp/src/arrow/compute/row/grouper.h +++ b/cpp/src/arrow/compute/row/grouper.h @@ -109,6 +109,10 @@ class ARROW_EXPORT Grouper { static Result> Make(const std::vector& key_types, ExecContext* ctx = default_exec_context()); + /// Reset all intermediate state, make the grouper logically as just `Make`ed. + /// The underlying buffers, if any, may or may not be released though. + virtual Status Reset() = 0; + /// Consume a batch of keys, producing the corresponding group ids as an integer array, /// over a slice defined by an offset and length, which defaults to the batch length. /// Currently only uint32 indices will be produced, eventually the bit width will only From a4a5cf1fbe804f5b47184afe91b3c243e0487ab2 Mon Sep 17 00:00:00 2001 From: David Sisson Date: Tue, 14 May 2024 09:28:50 -0700 Subject: [PATCH 088/105] GH-34484: [Substrait] add an option to disable augmented fields (#41583) ### Rationale for this change Augmented fields interfere with the schema passing between nodes. When enabled they cause names/schema mismatching at the end of the plan. ### What changes are included in this PR? Adds an option to disable augmented fields (defaulting to adding them), connects it everywhere it is called, and disables it in ReadRel conversion. ### Are these changes tested? Yes. ### Are there any user-facing changes? There are no API related changes however this will allow Substrait plans that consume local files to work without requiring a project/emit relation after the read relation to remove the unexpected fields. 
* GitHub Issue: #34484 Authored-by: David Sisson Signed-off-by: Matt Topol --- cpp/src/arrow/acero/sink_node.cc | 1 + cpp/src/arrow/dataset/discovery_test.cc | 3 +- cpp/src/arrow/dataset/file_parquet_test.cc | 5 +- cpp/src/arrow/dataset/scanner.cc | 35 +++++--- cpp/src/arrow/dataset/scanner.h | 9 ++- cpp/src/arrow/dataset/scanner_test.cc | 12 ++- cpp/src/arrow/dataset/test_util_internal.h | 18 +++-- .../engine/substrait/relation_internal.cc | 1 + cpp/src/arrow/engine/substrait/serde_test.cc | 81 +++++++++++++++++++ 9 files changed, 138 insertions(+), 27 deletions(-) diff --git a/cpp/src/arrow/acero/sink_node.cc b/cpp/src/arrow/acero/sink_node.cc index 4ab6b4537de02..66f447aa87f11 100644 --- a/cpp/src/arrow/acero/sink_node.cc +++ b/cpp/src/arrow/acero/sink_node.cc @@ -423,6 +423,7 @@ class ConsumingSinkNode : public ExecNode, std::atomic backpressure_counter_ = 0; std::unique_ptr sequencer_; }; + static Result MakeTableConsumingSinkNode(ExecPlan* plan, std::vector inputs, const ExecNodeOptions& options) { diff --git a/cpp/src/arrow/dataset/discovery_test.cc b/cpp/src/arrow/dataset/discovery_test.cc index 92cec7f324963..981146b7999ef 100644 --- a/cpp/src/arrow/dataset/discovery_test.cc +++ b/cpp/src/arrow/dataset/discovery_test.cc @@ -144,7 +144,8 @@ class FileSystemDatasetFactoryTest : public DatasetFactoryTest { } options_ = std::make_shared(); options_->dataset_schema = schema; - ASSERT_OK_AND_ASSIGN(auto projection, ProjectionDescr::Default(*schema)); + ASSERT_OK_AND_ASSIGN(auto projection, ProjectionDescr::Default( + *schema, options_->add_augmented_fields)); SetProjection(options_.get(), std::move(projection)); ASSERT_OK_AND_ASSIGN(dataset_, factory_->Finish(schema)); ASSERT_OK_AND_ASSIGN(auto fragment_it, dataset_->GetFragments()); diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc index 76cd0af3b835f..bf626826d4d1b 100644 --- a/cpp/src/arrow/dataset/file_parquet_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_test.cc @@ -330,8 +330,9 @@ TEST_F(TestParquetFileFormat, CachedMetadata) { // Read the file the first time, will read metadata auto options = std::make_shared(); options->filter = literal(true); - ASSERT_OK_AND_ASSIGN(auto projection_descr, - ProjectionDescr::FromNames({"x"}, *test_schema)); + ASSERT_OK_AND_ASSIGN( + auto projection_descr, + ProjectionDescr::FromNames({"x"}, *test_schema, options->add_augmented_fields)); options->projected_schema = projection_descr.schema; options->projection = projection_descr.expression; ASSERT_OK_AND_ASSIGN(auto generator, fragment->ScanBatchesAsync(options)); diff --git a/cpp/src/arrow/dataset/scanner.cc b/cpp/src/arrow/dataset/scanner.cc index 18981d1451980..a856a792a264f 100644 --- a/cpp/src/arrow/dataset/scanner.cc +++ b/cpp/src/arrow/dataset/scanner.cc @@ -211,7 +211,8 @@ Status NormalizeScanOptions(const std::shared_ptr& scan_options, // create the projected schema only if the provided expressions // produces valid set of fields. ARROW_ASSIGN_OR_RAISE(auto projection_descr, - ProjectionDescr::Default(*projected_schema)); + ProjectionDescr::Default( + *projected_schema, scan_options->add_augmented_fields)); scan_options->projected_schema = std::move(projection_descr.schema); scan_options->projection = projection_descr.expression; ARROW_ASSIGN_OR_RAISE(scan_options->projection, @@ -220,7 +221,8 @@ Status NormalizeScanOptions(const std::shared_ptr& scan_options, // if projected_fields are not found, we default to creating the projected_schema // and projection from the dataset_schema. 
ARROW_ASSIGN_OR_RAISE(auto projection_descr, - ProjectionDescr::Default(*dataset_schema)); + ProjectionDescr::Default( + *dataset_schema, scan_options->add_augmented_fields)); scan_options->projected_schema = std::move(projection_descr.schema); scan_options->projection = projection_descr.expression; } @@ -231,7 +233,7 @@ Status NormalizeScanOptions(const std::shared_ptr& scan_options, ARROW_ASSIGN_OR_RAISE( auto projection_descr, ProjectionDescr::FromNames(scan_options->projected_schema->field_names(), - *dataset_schema)); + *dataset_schema, scan_options->add_augmented_fields)); scan_options->projection = projection_descr.expression; } @@ -730,7 +732,8 @@ Future AsyncScanner::CountRowsAsync(Executor* executor) { const auto options = std::make_shared(*scan_options_); ARROW_ASSIGN_OR_RAISE(auto empty_projection, ProjectionDescr::FromNames(std::vector(), - *scan_options_->dataset_schema)); + *scan_options_->dataset_schema, + scan_options_->add_augmented_fields)); SetProjection(options.get(), empty_projection); auto total = std::make_shared>(0); @@ -828,7 +831,8 @@ Result ProjectionDescr::FromExpressions( } Result ProjectionDescr::FromNames(std::vector names, - const Schema& dataset_schema) { + const Schema& dataset_schema, + bool add_augmented_fields) { std::vector exprs(names.size()); for (size_t i = 0; i < exprs.size(); ++i) { // If name isn't in schema, try finding it by dotted path. @@ -846,15 +850,19 @@ Result ProjectionDescr::FromNames(std::vector name } } auto fields = dataset_schema.fields(); - for (const auto& aug_field : kAugmentedFields) { - fields.push_back(aug_field); + if (add_augmented_fields) { + for (const auto& aug_field : kAugmentedFields) { + fields.push_back(aug_field); + } } return ProjectionDescr::FromExpressions(std::move(exprs), std::move(names), Schema(fields, dataset_schema.metadata())); } -Result ProjectionDescr::Default(const Schema& dataset_schema) { - return ProjectionDescr::FromNames(dataset_schema.field_names(), dataset_schema); +Result ProjectionDescr::Default(const Schema& dataset_schema, + bool add_augmented_fields) { + return ProjectionDescr::FromNames(dataset_schema.field_names(), dataset_schema, + add_augmented_fields); } void SetProjection(ScanOptions* options, ProjectionDescr projection) { @@ -899,7 +907,8 @@ const std::shared_ptr& ScannerBuilder::projected_schema() const { Status ScannerBuilder::Project(std::vector columns) { ARROW_ASSIGN_OR_RAISE( auto projection, - ProjectionDescr::FromNames(std::move(columns), *scan_options_->dataset_schema)); + ProjectionDescr::FromNames(std::move(columns), *scan_options_->dataset_schema, + scan_options_->add_augmented_fields)); SetProjection(scan_options_.get(), std::move(projection)); return Status::OK(); } @@ -1052,8 +1061,10 @@ Result MakeScanNode(acero::ExecPlan* plan, }); auto fields = scan_options->dataset_schema->fields(); - for (const auto& aug_field : kAugmentedFields) { - fields.push_back(aug_field); + if (scan_options->add_augmented_fields) { + for (const auto& aug_field : kAugmentedFields) { + fields.push_back(aug_field); + } } return acero::MakeExecNode( diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h index 4479158ff20cc..d2de267897180 100644 --- a/cpp/src/arrow/dataset/scanner.h +++ b/cpp/src/arrow/dataset/scanner.h @@ -114,6 +114,9 @@ struct ARROW_DS_EXPORT ScanOptions { /// Note: This must be true in order for any readahead to happen bool use_threads = false; + /// If true the scanner will add augmented fields to the output schema. 
+ bool add_augmented_fields = true; + /// Fragment-specific scan options. std::shared_ptr fragment_scan_options; @@ -287,10 +290,12 @@ struct ARROW_DS_EXPORT ProjectionDescr { /// \brief Create a default projection referencing fields in the dataset schema static Result FromNames(std::vector names, - const Schema& dataset_schema); + const Schema& dataset_schema, + bool add_augmented_fields = true); /// \brief Make a projection that projects every field in the dataset schema - static Result Default(const Schema& dataset_schema); + static Result Default(const Schema& dataset_schema, + bool add_augmented_fields = true); }; /// \brief Utility method to set the projection expression and schema diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc index fccfc80032d31..58bc9c8c0ea6b 100644 --- a/cpp/src/arrow/dataset/scanner_test.cc +++ b/cpp/src/arrow/dataset/scanner_test.cc @@ -1103,7 +1103,8 @@ TEST_P(TestScanner, ProjectionDefaults) { } // If we only specify a projection expression then infer the projected schema // from the projection expression - auto projection_desc = ProjectionDescr::FromNames({"i32"}, *schema_); + auto projection_desc = + ProjectionDescr::FromNames({"i32"}, *schema_, /*add_augmented_fields=*/true); { ARROW_SCOPED_TRACE("User only specifies projection"); options_->projection = projection_desc->expression; @@ -1148,7 +1149,8 @@ TEST_P(TestScanner, ProjectedScanNestedFromNames) { }); ASSERT_OK_AND_ASSIGN(auto descr, ProjectionDescr::FromNames({".struct.i32", "nested.right.f64"}, - *options_->dataset_schema)) + *options_->dataset_schema, + options_->add_augmented_fields)) SetProjection(options_.get(), std::move(descr)); auto batch_in = ConstantArrayGenerator::Zeroes(GetParam().items_per_batch, schema_); auto batch_out = ConstantArrayGenerator::Zeroes( @@ -2106,7 +2108,8 @@ TEST(ScanOptions, TestMaterializedFields) { auto set_projection_from_names = [&opts](std::vector names) { ASSERT_OK_AND_ASSIGN(auto projection, ProjectionDescr::FromNames( - std::move(names), *opts->dataset_schema)); + std::move(names), *opts->dataset_schema, + opts->add_augmented_fields)); SetProjection(opts.get(), std::move(projection)); }; @@ -2160,7 +2163,8 @@ TEST(ScanOptions, TestMaterializedFields) { // project top-level field, filter nothing opts->filter = literal(true); ASSERT_OK_AND_ASSIGN(projection, - ProjectionDescr::FromNames({"nested"}, *opts->dataset_schema)); + ProjectionDescr::FromNames({"nested"}, *opts->dataset_schema, + opts->add_augmented_fields)); SetProjection(opts.get(), std::move(projection)); EXPECT_THAT(opts->MaterializedFields(), ElementsAre(FieldRef("nested"))); diff --git a/cpp/src/arrow/dataset/test_util_internal.h b/cpp/src/arrow/dataset/test_util_internal.h index de0519afac9e1..8195218b0cfe8 100644 --- a/cpp/src/arrow/dataset/test_util_internal.h +++ b/cpp/src/arrow/dataset/test_util_internal.h @@ -386,7 +386,8 @@ class DatasetFixtureMixin : public ::testing::Test { options_ = std::make_shared(); options_->dataset_schema = schema_; ASSERT_OK_AND_ASSIGN(auto projection, - ProjectionDescr::FromNames(schema_->field_names(), *schema_)); + ProjectionDescr::FromNames(schema_->field_names(), *schema_, + options_->add_augmented_fields)); SetProjection(options_.get(), std::move(projection)); SetFilter(literal(true)); } @@ -398,7 +399,8 @@ class DatasetFixtureMixin : public ::testing::Test { void SetProjectedColumns(std::vector column_names) { ASSERT_OK_AND_ASSIGN( auto projection, - ProjectionDescr::FromNames(std::move(column_names), 
*options_->dataset_schema)); + ProjectionDescr::FromNames(std::move(column_names), *options_->dataset_schema, + /*add_augmented_fields=*/true)); SetProjection(options_.get(), std::move(projection)); } @@ -502,7 +504,8 @@ class FileFormatFixtureMixin : public ::testing::Test { void SetSchema(std::vector> fields) { opts_->dataset_schema = schema(std::move(fields)); ASSERT_OK_AND_ASSIGN(auto projection, - ProjectionDescr::Default(*opts_->dataset_schema)); + ProjectionDescr::Default(*opts_->dataset_schema, + /*add_augmented_fields=*/true)); SetProjection(opts_.get(), std::move(projection)); } @@ -512,7 +515,8 @@ class FileFormatFixtureMixin : public ::testing::Test { void Project(std::vector names) { ASSERT_OK_AND_ASSIGN(auto projection, ProjectionDescr::FromNames( - std::move(names), *opts_->dataset_schema)); + std::move(names), *opts_->dataset_schema, + /*add_augmented_fields=*/true)); SetProjection(opts_.get(), std::move(projection)); } @@ -993,7 +997,8 @@ class FileFormatScanMixin : public FileFormatFixtureMixin, auto i64 = field("i64", int64()); this->opts_->dataset_schema = schema({i32, i32, i64}); ASSERT_RAISES(Invalid, - ProjectionDescr::FromNames({"i32"}, *this->opts_->dataset_schema)); + ProjectionDescr::FromNames({"i32"}, *this->opts_->dataset_schema, + /*add_augmented_fields=*/true)); } void TestScanWithPushdownNulls() { // Regression test for ARROW-15312 @@ -1933,7 +1938,8 @@ class WriteFileSystemDatasetMixin : public MakeFileSystemDatasetMixin { scan_options_->dataset_schema = dataset_->schema(); ASSERT_OK_AND_ASSIGN( auto projection, - ProjectionDescr::FromNames(source_schema_->field_names(), *dataset_->schema())); + ProjectionDescr::FromNames(source_schema_->field_names(), *dataset_->schema(), + scan_options_->add_augmented_fields)); SetProjection(scan_options_.get(), std::move(projection)); } diff --git a/cpp/src/arrow/engine/substrait/relation_internal.cc b/cpp/src/arrow/engine/substrait/relation_internal.cc index f15f1a5527b7b..7c462c418f81b 100644 --- a/cpp/src/arrow/engine/substrait/relation_internal.cc +++ b/cpp/src/arrow/engine/substrait/relation_internal.cc @@ -393,6 +393,7 @@ Result FromProto(const substrait::Rel& rel, const ExtensionSet& auto scan_options = std::make_shared(); scan_options->use_threads = true; + scan_options->add_augmented_fields = false; if (read.has_filter()) { ARROW_ASSIGN_OR_RAISE(scan_options->filter, diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 3e80192377937..6762d1e045450 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -1064,6 +1064,86 @@ NamedTableProvider AlwaysProvideSameTable(std::shared_ptr
table) { }; } +TEST(Substrait, ExecReadRelWithLocalFiles) { + ASSERT_OK_AND_ASSIGN(std::string dir_string, + arrow::internal::GetEnvVar("PARQUET_TEST_DATA")); + + std::string substrait_json = R"({ + "relations": [ + { + "root": { + "input": { + "read": { + "common": { + "direct": {} + }, + "baseSchema": { + "names": [ + "f32", + "f64" + ], + "struct": { + "types": [ + { + "fp32": { + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fp64": { + "nullability": "NULLABILITY_REQUIRED" + } + } + ], + "nullability": "NULLABILITY_REQUIRED" + } + }, + "localFiles": { + "items": [ + { + "uriFile": "file://[DIRECTORY_PLACEHOLDER]/byte_stream_split.zstd.parquet", + "parquet": {} + } + ] + } + } + }, + "names": [ + "f32", + "f64" + ] + } + } + ], + "version": { + "minorNumber": 42, + "producer": "my-producer" + } + })"; + const char* placeholder = "[DIRECTORY_PLACEHOLDER]"; + substrait_json.replace(substrait_json.find(placeholder), strlen(placeholder), + dir_string); + + ASSERT_OK_AND_ASSIGN(auto buf, + internal::SubstraitFromJSON("Plan", substrait_json, + /*ignore_unknown_fields=*/false)); + + ASSERT_OK_AND_ASSIGN(auto declarations, + DeserializePlans(*buf, acero::NullSinkNodeConsumer::Make)); + ASSERT_EQ(declarations.size(), 1); + acero::Declaration* decl = &declarations[0]; + ASSERT_EQ(decl->factory_name, "consuming_sink"); + ASSERT_OK_AND_ASSIGN(auto plan, acero::ExecPlan::Make()); + ASSERT_OK_AND_ASSIGN(auto sink_node, declarations[0].AddToPlan(plan.get())); + ASSERT_STREQ(sink_node->kind_name(), "ConsumingSinkNode"); + ASSERT_EQ(sink_node->num_inputs(), 1); + auto& prev_node = sink_node->inputs()[0]; + ASSERT_STREQ(prev_node->kind_name(), "SourceNode"); + + plan->StartProducing(); + ASSERT_FINISHES_OK(plan->finished()); +} + TEST(Substrait, RelWithHint) { ASSERT_OK_AND_ASSIGN(auto buf, internal::SubstraitFromJSON("Rel", R"({ @@ -2443,6 +2523,7 @@ TEST(SubstraitRoundTrip, BasicPlanEndToEnd) { auto scan_options = std::make_shared(); scan_options->projection = compute::project({}, {}); + scan_options->add_augmented_fields = false; const std::string filter_col_left = "shared"; const std::string filter_col_right = "distinct"; auto comp_left_value = compute::field_ref(filter_col_left); From 8f27e269cb4c9fc9b593177f30bf9a1ec6ef5cff Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Wed, 15 May 2024 00:40:53 +0800 Subject: [PATCH 089/105] GH-41149: [C++][Acero] Fix asof join race (#41614) ### Rationale for this change Sporadic asof join test failures have been frequently and annoyingly observed in pyarrow CI, as recorded in #40675 and #41149. Turns out the root causes are the same - a logical race (as opposed to physical race which can be detected by sanitizers). By injecting special delay in various places in asof join, as shown in https://github.com/zanmato1984/arrow/commit/ea3b24c5f7308fe42f60dad41f51dbcbc1a54929, the issue can be reproduced almost 100%. And I have put some descriptions in that commit to explain how the race happens. ### What changes are included in this PR? Eliminate the logical race of emptiness by combining multiple call-sites of `Empty()`. ### Are these changes tested? Include the UT to reproduce the issue. ### Are there any user-facing changes? None. **This PR contains a "Critical Fix".** In #40675 and #41149 , incorrect results are produced. 
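
To make the failure mode easier to see, here is a minimal, self-contained sketch (plain C++ with a toy `SharedQueue`; none of this is the actual node code, and the names are illustrative only) of how two independent emptiness checks can observe different states of a queue that another thread is concurrently pushing to. That inconsistency between call-sites is the kind of logical race this patch removes by sampling `Empty()` once and passing the result to both decisions:

```cpp
// A minimal, self-contained sketch of the race pattern (toy code, not the
// actual asof join node): two independent Empty() call-sites racing with a
// concurrent Push() can observe different states of the same queue.
#include <atomic>
#include <cstdio>
#include <mutex>
#include <queue>
#include <thread>

// Toy stand-in for an RHS input queue; only Empty() and Push() matter here.
struct SharedQueue {
  std::mutex mu;
  std::queue<int> q;
  bool Empty() {
    std::lock_guard<std::mutex> lock(mu);
    return q.empty();
  }
  void Push(int v) {
    std::lock_guard<std::mutex> lock(mu);
    q.push(v);
  }
};

int main() {
  SharedQueue rhs;
  std::atomic<bool> done{false};

  // Producer plays the role of Push() delivering new batches from another thread.
  std::thread producer([&] {
    for (int i = 0; i < 100000; ++i) rhs.Push(i);
    done = true;
  });

  long inconsistent = 0;
  while (!done) {
    bool advance_saw_empty = rhs.Empty();     // first call-site (cf. AdvanceAndMemoize)
    bool up_to_date_saw_empty = rhs.Empty();  // second call-site (cf. CurrentEmpty)
    if (advance_saw_empty != up_to_date_saw_empty) {
      ++inconsistent;  // the two decisions were made against different states
    }
  }
  producer.join();
  std::printf("observed %ld inconsistent emptiness pairs\n", inconsistent);
  // The fix follows the single-sample pattern:
  //   bool empty = rhs.Empty();
  //   ... use `empty` for both the advance decision and the up-to-date check ...
  return 0;
}
```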
* GitHub Issue: #41149 * Also closes #40675 Authored-by: Ruoxi Sun Signed-off-by: Antoine Pitrou --- cpp/src/arrow/acero/asof_join_node.cc | 73 +++++++++++++--------- cpp/src/arrow/acero/asof_join_node_test.cc | 54 ++++++++++++++++ 2 files changed, 98 insertions(+), 29 deletions(-) diff --git a/cpp/src/arrow/acero/asof_join_node.cc b/cpp/src/arrow/acero/asof_join_node.cc index 48cc83dd3d6a9..1d94467df9ee2 100644 --- a/cpp/src/arrow/acero/asof_join_node.cc +++ b/cpp/src/arrow/acero/asof_join_node.cc @@ -548,8 +548,10 @@ class InputState { // true when the queue is empty and, when memo may have future entries (the case of a // positive tolerance), when the memo is empty. // used when checking whether RHS is up to date with LHS. - bool CurrentEmpty() const { - return memo_.no_future_ ? Empty() : memo_.times_.empty() && Empty(); + // NOTE: The emptiness must be decided by a single call to Empty() in caller, due to the + // potential race with Push(), see GH-41614. + bool CurrentEmpty(bool empty) const { + return memo_.no_future_ ? empty : (memo_.times_.empty() && empty); } // in case memo may not have future entries (the case of a non-positive tolerance), @@ -650,13 +652,15 @@ class InputState { // timestamp, update latest_time and latest_ref_row to the value that immediately pass // the horizon. Update the memo-store with any entries or future entries so observed. // Returns true if updates were made, false if not. - Result AdvanceAndMemoize(OnType ts) { + // NOTE: The emptiness must be decided by a single call to Empty() in caller, due to the + // potential race with Push(), see GH-41614. + Result AdvanceAndMemoize(OnType ts, bool empty) { // Advance the right side row index until we reach the latest right row (for each key) // for the given left timestamp. DEBUG_SYNC(node_, "Advancing input ", index_, DEBUG_MANIP(std::endl)); // Check if already updated for TS (or if there is no latest) - if (Empty()) { // can't advance if empty and no future entries + if (empty) { // can't advance if empty and no future entries return memo_.no_future_ ? false : memo_.RemoveEntriesWithLesserTime(ts); } @@ -918,34 +922,46 @@ class CompositeTableBuilder { // guaranteeing this probability is below 1 in a billion. The fix is 128-bit hashing. // See ARROW-17653 class AsofJoinNode : public ExecNode { - // Advances the RHS as far as possible to be up to date for the current LHS timestamp - Result UpdateRhs() { + // A simple wrapper for the result of a single call to UpdateRhs(), identifying: + // 1) If any RHS has advanced. + // 2) If all RHS are up to date with LHS. + struct RhsUpdateState { + bool any_advanced; + bool all_up_to_date_with_lhs; + }; + // Advances the RHS as far as possible to be up to date for the current LHS timestamp, + // and checks if all RHS are up to date with LHS. The reason they have to be performed + // together is that they both depend on the emptiness of the RHS, which can be changed + // by Push() executing in another thread. 
+ Result UpdateRhs() { auto& lhs = *state_.at(0); auto lhs_latest_time = lhs.GetLatestTime(); - bool any_updated = false; - for (size_t i = 1; i < state_.size(); ++i) { - ARROW_ASSIGN_OR_RAISE(bool advanced, state_[i]->AdvanceAndMemoize(lhs_latest_time)); - any_updated |= advanced; - } - return any_updated; - } - - // Returns false if RHS not up to date for LHS - bool IsUpToDateWithLhsRow() const { - auto& lhs = *state_[0]; - if (lhs.Empty()) return false; // can't proceed if nothing on the LHS - OnType lhs_ts = lhs.GetLatestTime(); + RhsUpdateState update_state{/*any_advanced=*/false, /*all_up_to_date_with_lhs=*/true}; for (size_t i = 1; i < state_.size(); ++i) { auto& rhs = *state_[i]; - if (!rhs.Finished()) { + + // Obtain RHS emptiness once for subsequent AdvanceAndMemoize() and CurrentEmpty(). + bool rhs_empty = rhs.Empty(); + // Obtain RHS current time here because AdvanceAndMemoize() can change the + // emptiness. + OnType rhs_current_time = rhs_empty ? OnType{} : rhs.GetLatestTime(); + + ARROW_ASSIGN_OR_RAISE(bool advanced, + rhs.AdvanceAndMemoize(lhs_latest_time, rhs_empty)); + update_state.any_advanced |= advanced; + + if (update_state.all_up_to_date_with_lhs && !rhs.Finished()) { // If RHS is finished, then we know it's up to date - if (rhs.CurrentEmpty()) - return false; // RHS isn't finished, but is empty --> not up to date - if (lhs_ts > rhs.GetCurrentTime()) - return false; // RHS isn't up to date (and not finished) + if (rhs.CurrentEmpty(rhs_empty)) { + // RHS isn't finished, but is empty --> not up to date + update_state.all_up_to_date_with_lhs = false; + } else if (lhs_latest_time > rhs_current_time) { + // RHS isn't up to date (and not finished) + update_state.all_up_to_date_with_lhs = false; + } } } - return true; + return update_state; } Result> ProcessInner() { @@ -963,20 +979,19 @@ class AsofJoinNode : public ExecNode { // If LHS is finished or empty then there's nothing we can do here if (lhs.Finished() || lhs.Empty()) break; - // Advance each of the RHS as far as possible to be up to date for the LHS timestamp - ARROW_ASSIGN_OR_RAISE(bool any_rhs_advanced, UpdateRhs()); + ARROW_ASSIGN_OR_RAISE(auto rhs_update_state, UpdateRhs()); // If we have received enough inputs to produce the next output batch // (decided by IsUpToDateWithLhsRow), we will perform the join and // materialize the output batch. The join is done by advancing through // the LHS and adding joined row to rows_ (done by Emplace). Finally, // input batches that are no longer needed are removed to free up memory. - if (IsUpToDateWithLhsRow()) { + if (rhs_update_state.all_up_to_date_with_lhs) { dst.Emplace(state_, tolerance_); ARROW_ASSIGN_OR_RAISE(bool advanced, lhs.Advance()); if (!advanced) break; // if we can't advance LHS, we're done for this batch } else { - if (!any_rhs_advanced) break; // need to wait for new data + if (!rhs_update_state.any_advanced) break; // need to wait for new data } } diff --git a/cpp/src/arrow/acero/asof_join_node_test.cc b/cpp/src/arrow/acero/asof_join_node_test.cc index d95d2aaad3643..051e280a4c53c 100644 --- a/cpp/src/arrow/acero/asof_join_node_test.cc +++ b/cpp/src/arrow/acero/asof_join_node_test.cc @@ -1678,5 +1678,59 @@ TEST(AsofJoinTest, BackpressureWithBatchesGen) { /*slow_r0=*/false); } +// Reproduction of GH-40675: A logical race between Process() and Push() that can be more +// easily observed with single small batch. 
+TEST(AsofJoinTest, RhsEmptinessRace) { + auto left_batch = ExecBatchFromJSON( + {int64(), utf8()}, R"([[1, "a"], [1, "b"], [5, "a"], [6, "b"], [7, "f"]])"); + auto right_batch = ExecBatchFromJSON( + {int64(), utf8(), float64()}, R"([[2, "a", 1.0], [9, "b", 3.0], [15, "g", 5.0]])"); + + Declaration left{ + "exec_batch_source", + ExecBatchSourceNodeOptions(schema({field("colA", int64()), field("col2", utf8())}), + {std::move(left_batch)})}; + Declaration right{ + "exec_batch_source", + ExecBatchSourceNodeOptions(schema({field("colB", int64()), field("col3", utf8()), + field("colC", float64())}), + {std::move(right_batch)})}; + AsofJoinNodeOptions asof_join_opts({{{"colA"}, {{"col2"}}}, {{"colB"}, {{"col3"}}}}, 1); + Declaration asof_join{ + "asofjoin", {std::move(left), std::move(right)}, std::move(asof_join_opts)}; + + ASSERT_OK_AND_ASSIGN(auto result, DeclarationToExecBatches(std::move(asof_join))); + + auto exp_batch = ExecBatchFromJSON( + {int64(), utf8(), float64()}, + R"([[1, "a", 1.0], [1, "b", null], [5, "a", null], [6, "b", null], [7, "f", null]])"); + AssertExecBatchesEqualIgnoringOrder(result.schema, {exp_batch}, result.batches); +} + +// Reproduction of GH-41149: Another case of the same root cause as GH-40675, but with +// empty "by" columns. +TEST(AsofJoinTest, RhsEmptinessRaceEmptyBy) { + auto left_batch = ExecBatchFromJSON({int64()}, R"([[1], [2], [3]])"); + auto right_batch = + ExecBatchFromJSON({utf8(), int64()}, R"([["Z", 2], ["B", 3], ["A", 4]])"); + + Declaration left{"exec_batch_source", + ExecBatchSourceNodeOptions(schema({field("on", int64())}), + {std::move(left_batch)})}; + Declaration right{ + "exec_batch_source", + ExecBatchSourceNodeOptions(schema({field("colVals", utf8()), field("on", int64())}), + {std::move(right_batch)})}; + AsofJoinNodeOptions asof_join_opts({{{"on"}, {}}, {{"on"}, {}}}, 1); + Declaration asof_join{ + "asofjoin", {std::move(left), std::move(right)}, std::move(asof_join_opts)}; + + ASSERT_OK_AND_ASSIGN(auto result, DeclarationToExecBatches(std::move(asof_join))); + + auto exp_batch = + ExecBatchFromJSON({int64(), utf8()}, R"([[1, "Z"], [2, "Z"], [3, "B"]])"); + AssertExecBatchesEqualIgnoringOrder(result.schema, {exp_batch}, result.batches); +} + } // namespace acero } // namespace arrow From 6c386dab6760961160ddbfe7dcb6952943920828 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Wed, 15 May 2024 01:10:22 +0800 Subject: [PATCH 090/105] GH-41334: [C++][Acero] Use per-node basis temp vector stack to mitigate overflow (#41335) ### Rationale for this change The risk of temp vector stack overflow still exists as described in #41334 . Many people have agreed on a per-node basis approach: > 1) it doesn't introduce more performance penalty than shared stack; 2) it can mitigate the overflow in a natural way, i.e., expanding the stack size linear to the number of nodes; 3) it requires no more complexity to the existing stack implementation. The full (but long) story is also revealed in the subsequent discussion of this PR. Feel free to scroll down. ### What changes are included in this PR? 1. Change the current shared (per-thread) temp vector stack usage to per-node basis. 2. Make the stack size required by each stack user more explicit. ### Are these changes tested? UT included. ### Are there any user-facing changes? None. 
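
For illustration, here is a rough, hypothetical sketch (invented `NodeLocalStack` type and constants, not Arrow's `TempVectorStack`) of the sizing idea: each node reserves its own scratch stack up front from explicit per-helper usage constants, in the spirit of `Hashing32::kHashBatchTempStackUsage` and `KeyCompare::CompareColumnsToRowsTempStackUsage()`, so reserved space grows with the number of nodes instead of many nodes deepening one shared per-thread stack:

```cpp
// A hypothetical sketch of per-node scratch sizing (invented types and
// constants, not Arrow's TempVectorStack): each node reserves exactly the
// space its helpers declare they need, so capacity scales with node count.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr std::size_t kMiniBatchLength = 1024;  // assumed mini-batch granularity

// Per-helper usage constants, in the spirit of the constants this patch introduces.
constexpr std::size_t kHashTempUsage =
    (sizeof(uint32_t) + sizeof(uint16_t) + sizeof(uint32_t) + 1) * kMiniBatchLength;
constexpr std::size_t kCompareTempUsage = 3 * sizeof(uint8_t) * kMiniBatchLength;

// One scratch stack per node, sized up front from the constants above.
struct NodeLocalStack {
  std::vector<uint8_t> buffer;
  std::size_t top = 0;
  explicit NodeLocalStack(std::size_t capacity) : buffer(capacity) {}
  uint8_t* Alloc(std::size_t n) {
    // The real implementation checks for overflow and pads allocations;
    // this sketch only shows the bump-allocation shape.
    uint8_t* p = buffer.data() + top;
    top += n;
    return p;
  }
};

int main() {
  // Each node owns its own stack; adding nodes adds independent stacks instead
  // of deepening a single shared per-thread stack.
  NodeLocalStack per_node_stack(kHashTempUsage + kCompareTempUsage);
  uint8_t* hash_scratch = per_node_stack.Alloc(kHashTempUsage);
  uint8_t* compare_scratch = per_node_stack.Alloc(kCompareTempUsage);
  std::printf("reserved %zu bytes for this node (%p, %p)\n", per_node_stack.buffer.size(),
              static_cast<void*>(hash_scratch), static_cast<void*>(compare_scratch));
  return 0;
}
```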
* GitHub Issue: #41334 Authored-by: Ruoxi Sun Signed-off-by: Antoine Pitrou --- cpp/src/arrow/CMakeLists.txt | 3 +- cpp/src/arrow/acero/exec_plan.cc | 2 +- cpp/src/arrow/acero/hash_join_node.cc | 38 +++++++--- cpp/src/arrow/acero/hash_join_node_test.cc | 52 +++++++++++++ cpp/src/arrow/acero/query_context.cc | 12 +-- cpp/src/arrow/acero/query_context.h | 8 +- cpp/src/arrow/acero/swiss_join.cc | 16 ++-- cpp/src/arrow/compute/key_hash_internal.h | 19 +++++ cpp/src/arrow/compute/key_hash_test.cc | 59 ++++++++++++++- cpp/src/arrow/compute/key_map_internal.h | 1 + cpp/src/arrow/compute/light_array_internal.h | 1 + cpp/src/arrow/compute/light_array_test.cc | 1 + cpp/src/arrow/compute/row/compare_internal.h | 10 +++ cpp/src/arrow/compute/row/compare_test.cc | 62 ++++++++++++++- cpp/src/arrow/compute/row/grouper.cc | 1 + cpp/src/arrow/compute/util.cc | 31 -------- cpp/src/arrow/compute/util.h | 73 ------------------ cpp/src/arrow/compute/util_internal.cc | 79 ++++++++++++++++++++ cpp/src/arrow/compute/util_internal.h | 53 +++++++++++++ 19 files changed, 371 insertions(+), 150 deletions(-) create mode 100644 cpp/src/arrow/compute/util_internal.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 5d61112518f5e..0f4824ec99daa 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -716,7 +716,8 @@ set(ARROW_COMPUTE_SRCS compute/row/compare_internal.cc compute/row/grouper.cc compute/row/row_internal.cc - compute/util.cc) + compute/util.cc + compute/util_internal.cc) append_runtime_avx2_src(ARROW_COMPUTE_SRCS compute/key_hash_internal_avx2.cc) append_runtime_avx2_bmi2_src(ARROW_COMPUTE_SRCS compute/key_map_internal_avx2.cc) diff --git a/cpp/src/arrow/acero/exec_plan.cc b/cpp/src/arrow/acero/exec_plan.cc index 97119726d4b17..d9fb1942fccd8 100644 --- a/cpp/src/arrow/acero/exec_plan.cc +++ b/cpp/src/arrow/acero/exec_plan.cc @@ -128,7 +128,7 @@ struct ExecPlanImpl : public ExecPlan { Future<> scheduler_finished = arrow::util::AsyncTaskScheduler::Make( [this](arrow::util::AsyncTaskScheduler* async_scheduler) { QueryContext* ctx = query_context(); - RETURN_NOT_OK(ctx->Init(ctx->max_concurrency(), async_scheduler)); + RETURN_NOT_OK(ctx->Init(async_scheduler)); #ifdef ARROW_WITH_OPENTELEMETRY if (HasMetadata()) { diff --git a/cpp/src/arrow/acero/hash_join_node.cc b/cpp/src/arrow/acero/hash_join_node.cc index b49364300dac8..06405f16c8d4c 100644 --- a/cpp/src/arrow/acero/hash_join_node.cc +++ b/cpp/src/arrow/acero/hash_join_node.cc @@ -497,11 +497,11 @@ struct BloomFilterPushdownContext { using BuildFinishedCallback = std::function; using FiltersReceivedCallback = std::function; using FilterFinishedCallback = std::function; - void Init(HashJoinNode* owner, size_t num_threads, - RegisterTaskGroupCallback register_task_group_callback, - StartTaskGroupCallback start_task_group_callback, - FiltersReceivedCallback on_bloom_filters_received, bool disable_bloom_filter, - bool use_sync_execution); + Status Init(HashJoinNode* owner, size_t num_threads, + RegisterTaskGroupCallback register_task_group_callback, + StartTaskGroupCallback start_task_group_callback, + FiltersReceivedCallback on_bloom_filters_received, + bool disable_bloom_filter, bool use_sync_execution); Status StartProducing(size_t thread_index); @@ -559,8 +559,7 @@ struct BloomFilterPushdownContext { std::vector hashes(batch.length); std::vector bv(bit_vector_bytes); - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * stack, - ctx_->GetTempStack(thread_index)); + arrow::util::TempVectorStack* stack = 
&tld_[thread_index].stack; // Start with full selection for the current batch memset(selected.data(), 0xff, bit_vector_bytes); @@ -654,7 +653,17 @@ struct BloomFilterPushdownContext { FiltersReceivedCallback all_received_callback_; FilterFinishedCallback on_finished_; } eval_; + + static constexpr auto kTempStackUsage = + Hashing32::kHashBatchTempStackUsage + + (sizeof(uint32_t) + /*extra=*/1) * arrow::util::MiniBatch::kMiniBatchLength; + + struct ThreadLocalData { + arrow::util::TempVectorStack stack; + }; + std::vector tld_; }; + bool HashJoinSchema::HasDictionaries() const { for (int side = 0; side <= 1; ++side) { for (int icol = 0; icol < proj_maps[side].num_cols(HashJoinProjection::INPUT); @@ -930,7 +939,7 @@ class HashJoinNode : public ExecNode, public TracedNode { // we will change it back to just the CPU's thread pool capacity. size_t num_threads = (GetCpuThreadPoolCapacity() + io::GetIOThreadPoolCapacity() + 1); - pushdown_context_.Init( + RETURN_NOT_OK(pushdown_context_.Init( this, num_threads, [ctx](std::function fn, std::function on_finished) { @@ -940,7 +949,7 @@ class HashJoinNode : public ExecNode, public TracedNode { return ctx->StartTaskGroup(task_group_id, num_tasks); }, [this](size_t thread_index) { return OnFiltersReceived(thread_index); }, - disable_bloom_filter_, use_sync_execution); + disable_bloom_filter_, use_sync_execution)); RETURN_NOT_OK(impl_->Init( ctx, join_type_, num_threads, &(schema_mgr_->proj_maps[0]), @@ -1037,7 +1046,7 @@ class HashJoinNode : public ExecNode, public TracedNode { BloomFilterPushdownContext pushdown_context_; }; -void BloomFilterPushdownContext::Init( +Status BloomFilterPushdownContext::Init( HashJoinNode* owner, size_t num_threads, RegisterTaskGroupCallback register_task_group_callback, StartTaskGroupCallback start_task_group_callback, @@ -1074,6 +1083,12 @@ void BloomFilterPushdownContext::Init( return eval_.on_finished_(thread_index, std::move(eval_.batches_)); }); start_task_group_callback_ = std::move(start_task_group_callback); + tld_.resize(num_threads); + for (auto& local_data : tld_) { + RETURN_NOT_OK(local_data.stack.Init(ctx_->memory_pool(), kTempStackUsage)); + } + + return Status::OK(); } Status BloomFilterPushdownContext::StartProducing(size_t thread_index) { @@ -1124,8 +1139,7 @@ Status BloomFilterPushdownContext::BuildBloomFilter_exec_task(size_t thread_inde } ARROW_ASSIGN_OR_RAISE(ExecBatch key_batch, ExecBatch::Make(std::move(key_columns))); - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * stack, - ctx_->GetTempStack(thread_index)); + arrow::util::TempVectorStack* stack = &tld_[thread_index].stack; arrow::util::TempVectorHolder hash_holder( stack, arrow::util::MiniBatch::kMiniBatchLength); uint32_t* hashes = hash_holder.mutable_data(); diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 9c3dbc176ff4f..215b1e4d21125 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -28,6 +28,7 @@ #include "arrow/api.h" #include "arrow/compute/kernels/row_encoder_internal.h" #include "arrow/compute/kernels/test_util.h" +#include "arrow/compute/light_array_internal.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" @@ -41,6 +42,7 @@ namespace arrow { using compute::call; using compute::default_exec_context; +using compute::ExecBatchBuilder; using compute::ExecSpan; using compute::field_ref; using compute::SortIndices; @@ -3201,5 +3203,55 @@ TEST(HashJoin, 
ChainedIntegerHashJoins) { } } +// Test that a large number of joins don't overflow the temp vector stack, like GH-39582 +// and GH-39951. +TEST(HashJoin, ManyJoins) { + // The idea of this case is to create many nested join nodes that may possibly cause + // recursive usage of temp vector stack. To make sure that the recursion happens: + // 1. A left-deep join tree is created so that the left-most (the final probe side) + // table will go through all the hash tables from the right side. + // 2. Left-outer join is used so that every join will increase the cardinality. + // 3. The left-most table contains rows of unique integers from 0 to N. + // 4. Each right table at level i contains two rows of integer i, so that the probing of + // each level will increase the result by one row. + // 5. The left-most table is a single batch of enough rows, so that at each level, the + // probing will accumulate enough result rows to have to output to the subsequent level + // before finishing the current batch (releasing the buffer allocated on the temp vector + // stack), which is essentially the recursive usage of the temp vector stack. + + // A fair number of joins to guarantee temp vector stack overflow before GH-41335. + const int num_joins = 64; + + // `ExecBatchBuilder::num_rows_max()` is the number of rows for swiss join to accumulate + // before outputting. + const int num_left_rows = ExecBatchBuilder::num_rows_max(); + ASSERT_OK_AND_ASSIGN( + auto left_batches, + MakeIntegerBatches({[](int row_id) -> int64_t { return row_id; }}, + schema({field("l_key", int32())}), + /*num_batches=*/1, /*batch_size=*/num_left_rows)); + Declaration root{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(left_batches.schema), + std::move(left_batches.batches))}; + + HashJoinNodeOptions join_opts(JoinType::LEFT_OUTER, /*left_keys=*/{"l_key"}, + /*right_keys=*/{"r_key"}); + + for (int i = 0; i < num_joins; ++i) { + ASSERT_OK_AND_ASSIGN(auto right_batches, + MakeIntegerBatches({[i](int) -> int64_t { return i; }}, + schema({field("r_key", int32())}), + /*num_batches=*/1, /*batch_size=*/2)); + Declaration table{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(right_batches.schema), + std::move(right_batches.batches))}; + + Declaration new_root{"hashjoin", {std::move(root), std::move(table)}, join_opts}; + root = std::move(new_root); + } + + ASSERT_OK_AND_ASSIGN(std::ignore, DeclarationToTable(std::move(root))); +} + } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/query_context.cc b/cpp/src/arrow/acero/query_context.cc index a27397d12079d..18beb19ab7f8b 100644 --- a/cpp/src/arrow/acero/query_context.cc +++ b/cpp/src/arrow/acero/query_context.cc @@ -40,8 +40,7 @@ QueryContext::QueryContext(QueryOptions opts, ExecContext exec_context) const CpuInfo* QueryContext::cpu_info() const { return CpuInfo::GetInstance(); } int64_t QueryContext::hardware_flags() const { return cpu_info()->hardware_flags(); } -Status QueryContext::Init(size_t max_num_threads, util::AsyncTaskScheduler* scheduler) { - tld_.resize(max_num_threads); +Status QueryContext::Init(util::AsyncTaskScheduler* scheduler) { async_scheduler_ = scheduler; return Status::OK(); } @@ -50,15 +49,6 @@ size_t QueryContext::GetThreadIndex() { return thread_indexer_(); } size_t QueryContext::max_concurrency() const { return thread_indexer_.Capacity(); } -Result QueryContext::GetTempStack(size_t thread_index) { - if (!tld_[thread_index].is_init) { - RETURN_NOT_OK(tld_[thread_index].stack.Init( - memory_pool(), 32 * 
util::MiniBatch::kMiniBatchLength * sizeof(uint64_t))); - tld_[thread_index].is_init = true; - } - return &tld_[thread_index].stack; -} - Result> QueryContext::BeginExternalTask(std::string_view name) { Future<> completion_future = Future<>::Make(); if (async_scheduler_->AddSimpleTask([completion_future] { return completion_future; }, diff --git a/cpp/src/arrow/acero/query_context.h b/cpp/src/arrow/acero/query_context.h index 9ea11679cba05..3eff299439828 100644 --- a/cpp/src/arrow/acero/query_context.h +++ b/cpp/src/arrow/acero/query_context.h @@ -38,7 +38,7 @@ class ARROW_ACERO_EXPORT QueryContext { QueryContext(QueryOptions opts = {}, ExecContext exec_context = *default_exec_context()); - Status Init(size_t max_num_threads, arrow::util::AsyncTaskScheduler* scheduler); + Status Init(arrow::util::AsyncTaskScheduler* scheduler); const ::arrow::internal::CpuInfo* cpu_info() const; int64_t hardware_flags() const; @@ -52,7 +52,6 @@ class ARROW_ACERO_EXPORT QueryContext { size_t GetThreadIndex(); size_t max_concurrency() const; - Result GetTempStack(size_t thread_index); /// \brief Start an external task /// @@ -145,11 +144,6 @@ class ARROW_ACERO_EXPORT QueryContext { std::unique_ptr task_scheduler_ = TaskScheduler::Make(); ThreadIndexer thread_indexer_; - struct ThreadLocalData { - bool is_init = false; - arrow::util::TempVectorStack stack; - }; - std::vector tld_; std::atomic in_flight_bytes_to_disk_{0}; }; diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 542e943c4a82b..17c5212697339 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -2470,6 +2470,8 @@ Status JoinProbeProcessor::OnFinished() { class SwissJoin : public HashJoinImpl { public: + static constexpr auto kTempStackUsage = 64 * arrow::util::MiniBatch::kMiniBatchLength; + Status Init(QueryContext* ctx, JoinType join_type, size_t num_threads, const HashJoinProjectionMaps* proj_map_left, const HashJoinProjectionMaps* proj_map_right, @@ -2513,6 +2515,7 @@ class SwissJoin : public HashJoinImpl { local_states_.resize(num_threads_); for (int i = 0; i < num_threads_; ++i) { + RETURN_NOT_OK(local_states_[i].stack.Init(pool_, kTempStackUsage)); local_states_[i].hash_table_ready = false; local_states_[i].num_output_batches = 0; local_states_[i].materialize.Init(pool_, proj_map_left, proj_map_right); @@ -2566,8 +2569,7 @@ class SwissJoin : public HashJoinImpl { ExecBatch keypayload_batch; ARROW_ASSIGN_OR_RAISE(keypayload_batch, KeyPayloadFromInput(/*side=*/0, &batch)); - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * temp_stack, - ctx_->GetTempStack(thread_index)); + arrow::util::TempVectorStack* temp_stack = &local_states_[thread_index].stack; return CancelIfNotOK( probe_processor_.OnNextBatch(thread_index, keypayload_batch, temp_stack, @@ -2679,8 +2681,7 @@ class SwissJoin : public HashJoinImpl { input_batch.values[schema->num_cols(HashJoinProjection::KEY) + icol]; } } - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * temp_stack, - ctx_->GetTempStack(thread_id)); + arrow::util::TempVectorStack* temp_stack = &local_states_[thread_id].stack; RETURN_NOT_OK(CancelIfNotOK(hash_table_build_.PushNextBatch( static_cast(thread_id), key_batch, no_payload ? 
nullptr : &payload_batch, temp_stack))); @@ -2715,8 +2716,7 @@ class SwissJoin : public HashJoinImpl { Status MergeFinished(size_t thread_id) { RETURN_NOT_OK(status()); - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * temp_stack, - ctx_->GetTempStack(thread_id)); + arrow::util::TempVectorStack* temp_stack = &local_states_[thread_id].stack; hash_table_build_.FinishPrtnMerge(temp_stack); return CancelIfNotOK(OnBuildHashTableFinished(static_cast(thread_id))); } @@ -2771,8 +2771,7 @@ class SwissJoin : public HashJoinImpl { std::min((task_id + 1) * kNumRowsPerScanTask, hash_table_.num_rows()); // Get thread index and related temp vector stack // - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * temp_stack, - ctx_->GetTempStack(thread_id)); + arrow::util::TempVectorStack* temp_stack = &local_states_[thread_id].stack; // Split into mini-batches // @@ -2949,6 +2948,7 @@ class SwissJoin : public HashJoinImpl { FinishedCallback finished_callback_; struct ThreadLocalState { + arrow::util::TempVectorStack stack; JoinResultMaterialize materialize; std::vector temp_column_arrays; int64_t num_output_batches; diff --git a/cpp/src/arrow/compute/key_hash_internal.h b/cpp/src/arrow/compute/key_hash_internal.h index 7d226f52086b1..1f25beb0e1622 100644 --- a/cpp/src/arrow/compute/key_hash_internal.h +++ b/cpp/src/arrow/compute/key_hash_internal.h @@ -48,6 +48,16 @@ class ARROW_EXPORT Hashing32 { static void HashMultiColumn(const std::vector& cols, LightContext* ctx, uint32_t* out_hash); + // Clarify the max temp stack usage for HashBatch, which might be necessary for the + // caller to be aware of at compile time to reserve enough stack size in advance. The + // HashBatch implementation uses one uint32 temp vector as a buffer for hash, one uint16 + // temp vector as a buffer for null indices and one uint32 temp vector as a buffer for + // null hash, all are of size kMiniBatchLength. Plus extra kMiniBatchLength to cope with + // stack padding and aligning. + static constexpr auto kHashBatchTempStackUsage = + (sizeof(uint32_t) + sizeof(uint16_t) + sizeof(uint32_t) + /*extra=*/1) * + util::MiniBatch::kMiniBatchLength; + static Status HashBatch(const ExecBatch& key_batch, uint32_t* hashes, std::vector& column_arrays, int64_t hardware_flags, util::TempVectorStack* temp_stack, @@ -161,6 +171,15 @@ class ARROW_EXPORT Hashing64 { static void HashMultiColumn(const std::vector& cols, LightContext* ctx, uint64_t* hashes); + // Clarify the max temp stack usage for HashBatch, which might be necessary for the + // caller to be aware of at compile time to reserve enough stack size in advance. The + // HashBatch implementation uses one uint16 temp vector as a buffer for null indices and + // one uint64 temp vector as a buffer for null hash, all are of size kMiniBatchLength. + // Plus extra kMiniBatchLength to cope with stack padding and aligning. 
+ static constexpr auto kHashBatchTempStackUsage = + (sizeof(uint16_t) + sizeof(uint64_t) + /*extra=*/1) * + util::MiniBatch::kMiniBatchLength; + static Status HashBatch(const ExecBatch& key_batch, uint64_t* hashes, std::vector& column_arrays, int64_t hardware_flags, util::TempVectorStack* temp_stack, diff --git a/cpp/src/arrow/compute/key_hash_test.cc b/cpp/src/arrow/compute/key_hash_test.cc index 4e5d869cb7db6..fdf6d2125850a 100644 --- a/cpp/src/arrow/compute/key_hash_test.cc +++ b/cpp/src/arrow/compute/key_hash_test.cc @@ -25,12 +25,16 @@ #include "arrow/array/builder_binary.h" #include "arrow/compute/key_hash_internal.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" #include "arrow/testing/util.h" #include "arrow/util/cpu_info.h" #include "arrow/util/pcg_random.h" namespace arrow { +using arrow::random::RandomArrayGenerator; +using arrow::util::MiniBatch; +using arrow::util::TempVectorStack; using internal::checked_pointer_cast; using internal::CpuInfo; @@ -156,7 +160,7 @@ class TestVectorHash { std::vector temp_buffer; temp_buffer.resize(mini_batch_size * 4); - for (int i = 0; i < static_cast(hardware_flags_for_testing.size()); ++i) { + for (size_t i = 0; i < hardware_flags_for_testing.size(); ++i) { const auto hardware_flags = hardware_flags_for_testing[i]; if (use_32bit_hash) { if (!use_varlen_input) { @@ -192,7 +196,7 @@ class TestVectorHash { // Verify that all implementations (scalar, SIMD) give the same hashes // const auto& hashes_scalar64 = hashes64[0]; - for (int i = 0; i < static_cast(hardware_flags_for_testing.size()); ++i) { + for (size_t i = 0; i < hardware_flags_for_testing.size(); ++i) { for (int j = 0; j < num_rows; ++j) { ASSERT_EQ(hashes64[i][j], hashes_scalar64[j]) << "scalar and simd approaches yielded different hashes"; @@ -280,7 +284,7 @@ void HashFixedLengthFrom(int key_length, int num_rows, int start_row) { std::vector temp_buffer; temp_buffer.resize(mini_batch_size * 4); - for (int i = 0; i < static_cast(hardware_flags_for_testing.size()); ++i) { + for (size_t i = 0; i < hardware_flags_for_testing.size(); ++i) { const auto hardware_flags = hardware_flags_for_testing[i]; Hashing32::HashFixed(hardware_flags, /*combine_hashes=*/false, num_rows_to_hash, key_length, @@ -292,7 +296,7 @@ void HashFixedLengthFrom(int key_length, int num_rows, int start_row) { } // Verify that all implementations (scalar, SIMD) give the same hashes. - for (int i = 1; i < static_cast(hardware_flags_for_testing.size()); ++i) { + for (size_t i = 1; i < hardware_flags_for_testing.size(); ++i) { for (int j = 0; j < num_rows_to_hash; ++j) { ASSERT_EQ(hashes32[i][j], hashes32[0][j]) << "scalar and simd approaches yielded different 32-bit hashes"; @@ -311,5 +315,52 @@ TEST(VectorHash, FixedLengthTailByteSafety) { HashFixedLengthFrom(/*key_length=*/19, /*num_rows=*/64, /*start_row=*/63); } +// Make sure that Hashing32/64::HashBatch uses no more stack space than declared in +// Hashing32/64::kHashBatchTempStackUsage. 
+TEST(VectorHash, HashBatchTempStackUsage) { + for (auto num_rows : + {0, 1, MiniBatch::kMiniBatchLength, MiniBatch::kMiniBatchLength * 64}) { + SCOPED_TRACE("num_rows = " + std::to_string(num_rows)); + + MemoryPool* pool = default_memory_pool(); + RandomArrayGenerator gen(42); + + auto column = gen.Int8(num_rows, 0, 127); + ExecBatch batch({column}, num_rows); + + std::vector column_arrays; + ASSERT_OK(ColumnArraysFromExecBatch(batch, &column_arrays)); + + const auto hardware_flags_for_testing = HardwareFlagsForTesting(); + ASSERT_GT(hardware_flags_for_testing.size(), 0); + + { + std::vector hashes(num_rows); + TempVectorStack stack; + ASSERT_OK(stack.Init(pool, Hashing32::kHashBatchTempStackUsage)); + for (size_t i = 0; i < hardware_flags_for_testing.size(); ++i) { + SCOPED_TRACE("hashing32 for hardware flags = " + + std::to_string(hardware_flags_for_testing[i])); + ASSERT_OK(Hashing32::HashBatch(batch, hashes.data(), column_arrays, + hardware_flags_for_testing[i], &stack, + /*start_rows=*/0, num_rows)); + } + } + + { + std::vector hashes(num_rows); + TempVectorStack stack; + ASSERT_OK(stack.Init(pool, Hashing64::kHashBatchTempStackUsage)); + for (size_t i = 0; i < hardware_flags_for_testing.size(); ++i) { + SCOPED_TRACE("hashing64 for hardware flags = " + + std::to_string(hardware_flags_for_testing[i])); + ASSERT_OK(Hashing64::HashBatch(batch, hashes.data(), column_arrays, + hardware_flags_for_testing[i], &stack, + /*start_rows=*/0, num_rows)); + } + } + } +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/key_map_internal.h b/cpp/src/arrow/compute/key_map_internal.h index 8e06dc83483aa..a5e784a9e4463 100644 --- a/cpp/src/arrow/compute/key_map_internal.h +++ b/cpp/src/arrow/compute/key_map_internal.h @@ -21,6 +21,7 @@ #include #include "arrow/compute/util.h" +#include "arrow/compute/util_internal.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/type_fwd.h" diff --git a/cpp/src/arrow/compute/light_array_internal.h b/cpp/src/arrow/compute/light_array_internal.h index 67de71bf56c92..995c4211998e0 100644 --- a/cpp/src/arrow/compute/light_array_internal.h +++ b/cpp/src/arrow/compute/light_array_internal.h @@ -22,6 +22,7 @@ #include "arrow/array.h" #include "arrow/compute/exec.h" #include "arrow/compute/util.h" +#include "arrow/compute/util_internal.h" #include "arrow/type.h" #include "arrow/util/cpu_info.h" #include "arrow/util/logging.h" diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc index 08f36ee606025..cc02d489d138f 100644 --- a/cpp/src/arrow/compute/light_array_test.cc +++ b/cpp/src/arrow/compute/light_array_test.cc @@ -20,6 +20,7 @@ #include #include +#include "arrow/memory_pool.h" #include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" diff --git a/cpp/src/arrow/compute/row/compare_internal.h b/cpp/src/arrow/compute/row/compare_internal.h index 16002ee5184e9..a5a109b0b516a 100644 --- a/cpp/src/arrow/compute/row/compare_internal.h +++ b/cpp/src/arrow/compute/row/compare_internal.h @@ -32,6 +32,16 @@ namespace compute { class ARROW_EXPORT KeyCompare { public: + // Clarify the max temp stack usage for CompareColumnsToRows, which might be necessary + // for the caller to be aware of (possibly at compile time) to reserve enough stack size + // in advance. The CompareColumnsToRows implementation uses three uint8 temp vectors as + // buffers for match vectors, all are of size num_rows. 
Plus extra kMiniBatchLength to + // cope with stack padding and aligning. + constexpr static int64_t CompareColumnsToRowsTempStackUsage(int64_t num_rows) { + return (sizeof(uint8_t) + sizeof(uint8_t) + sizeof(uint8_t)) * num_rows + + /*extra=*/util::MiniBatch::kMiniBatchLength; + } + // Returns a single 16-bit selection vector of rows that failed comparison. // If there is input selection on the left, the resulting selection is a filtered image // of input selection. diff --git a/cpp/src/arrow/compute/row/compare_test.cc b/cpp/src/arrow/compute/row/compare_test.cc index 1d8562cd56d3c..4044049b10863 100644 --- a/cpp/src/arrow/compute/row/compare_test.cc +++ b/cpp/src/arrow/compute/row/compare_test.cc @@ -19,23 +19,26 @@ #include "arrow/compute/row/compare_internal.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" namespace arrow { namespace compute { using arrow::bit_util::BytesForBits; using arrow::internal::CpuInfo; +using arrow::random::RandomArrayGenerator; using arrow::util::MiniBatch; using arrow::util::TempVectorStack; // Specialized case for GH-39577. TEST(KeyCompare, CompareColumnsToRowsCuriousFSB) { int fsb_length = 9; + int num_rows = 7; + MemoryPool* pool = default_memory_pool(); TempVectorStack stack; - ASSERT_OK(stack.Init(pool, 8 * MiniBatch::kMiniBatchLength * sizeof(uint64_t))); + ASSERT_OK(stack.Init(pool, KeyCompare::CompareColumnsToRowsTempStackUsage(num_rows))); - int num_rows = 7; auto column_right = ArrayFromJSON(fixed_size_binary(fsb_length), R"([ "000000000", "111111111", @@ -106,5 +109,60 @@ TEST(KeyCompare, CompareColumnsToRowsCuriousFSB) { } } +// Make sure that KeyCompare::CompareColumnsToRows uses no more stack space than declared +// in KeyCompare::CompareColumnsToRowsTempStackUsage(). +TEST(KeyCompare, CompareColumnsToRowsTempStackUsage) { + for (auto num_rows : + {0, 1, MiniBatch::kMiniBatchLength, MiniBatch::kMiniBatchLength * 64}) { + SCOPED_TRACE("num_rows = " + std::to_string(num_rows)); + + MemoryPool* pool = default_memory_pool(); + TempVectorStack stack; + ASSERT_OK(stack.Init(pool, KeyCompare::CompareColumnsToRowsTempStackUsage(num_rows))); + + RandomArrayGenerator gen(42); + + auto column_right = gen.Int8(num_rows, 0, 127); + ExecBatch batch_right({column_right}, num_rows); + + std::vector column_metadatas_right; + ASSERT_OK(ColumnMetadatasFromExecBatch(batch_right, &column_metadatas_right)); + + RowTableMetadata table_metadata_right; + table_metadata_right.FromColumnMetadataVector(column_metadatas_right, + sizeof(uint64_t), sizeof(uint64_t)); + + std::vector column_arrays_right; + ASSERT_OK(ColumnArraysFromExecBatch(batch_right, &column_arrays_right)); + + RowTableImpl row_table; + ASSERT_OK(row_table.Init(pool, table_metadata_right)); + + RowTableEncoder row_encoder; + row_encoder.Init(column_metadatas_right, sizeof(uint64_t), sizeof(uint64_t)); + row_encoder.PrepareEncodeSelected(0, num_rows, column_arrays_right); + + std::vector row_ids_right(num_rows); + std::iota(row_ids_right.begin(), row_ids_right.end(), 0); + ASSERT_OK(row_encoder.EncodeSelected(&row_table, num_rows, row_ids_right.data())); + + auto column_left = gen.Int8(num_rows, 0, 127); + ExecBatch batch_left({column_left}, num_rows); + std::vector column_arrays_left; + ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &column_arrays_left)); + + std::vector row_ids_left(num_rows); + std::iota(row_ids_left.begin(), row_ids_left.end(), 0); + + LightContext ctx{CpuInfo::GetInstance()->hardware_flags(), &stack}; + + uint32_t num_rows_no_match; + std::vector 
row_ids_out(num_rows); + KeyCompare::CompareColumnsToRows(num_rows, NULLPTR, row_ids_left.data(), &ctx, + &num_rows_no_match, row_ids_out.data(), + column_arrays_left, row_table, true, NULLPTR); + } +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc index 50ca20bd14f31..3ed5411d0ba02 100644 --- a/cpp/src/arrow/compute/row/grouper.cc +++ b/cpp/src/arrow/compute/row/grouper.cc @@ -600,6 +600,7 @@ struct GrouperFastImpl : public Grouper { } Status Reset() override { + ARROW_DCHECK_EQ(temp_stack_.AllocatedSize(), 0); rows_.Clean(); rows_minibatch_.Clean(); map_.cleanup(); diff --git a/cpp/src/arrow/compute/util.cc b/cpp/src/arrow/compute/util.cc index b0c863b26a062..b90b3a64056bd 100644 --- a/cpp/src/arrow/compute/util.cc +++ b/cpp/src/arrow/compute/util.cc @@ -17,11 +17,7 @@ #include "arrow/compute/util.h" -#include "arrow/table.h" -#include "arrow/util/bit_util.h" -#include "arrow/util/bitmap_ops.h" #include "arrow/util/logging.h" -#include "arrow/util/tracing_internal.h" #include "arrow/util/ubsan.h" namespace arrow { @@ -31,33 +27,6 @@ using internal::CpuInfo; namespace util { -void TempVectorStack::alloc(uint32_t num_bytes, uint8_t** data, int* id) { - int64_t new_top = top_ + EstimatedAllocationSize(num_bytes); - // Stack overflow check (see GH-39582). - // XXX cannot return a regular Status because most consumers do not either. - ARROW_CHECK_LE(new_top, buffer_size_) << "TempVectorStack::alloc overflow"; - *data = buffer_->mutable_data() + top_ + sizeof(uint64_t); - // We set 8 bytes before the beginning of the allocated range and - // 8 bytes after the end to check for stack overflow (which would - // result in those known bytes being corrupted). - reinterpret_cast(buffer_->mutable_data() + top_)[0] = kGuard1; - reinterpret_cast(buffer_->mutable_data() + new_top)[-1] = kGuard2; - *id = num_vectors_++; - top_ = new_top; -} - -void TempVectorStack::release(int id, uint32_t num_bytes) { - ARROW_DCHECK(num_vectors_ == id + 1); - int64_t size = EstimatedAllocationSize(num_bytes); - ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[-1] == - kGuard2); - ARROW_DCHECK(top_ >= size); - top_ -= size; - ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[0] == - kGuard1); - --num_vectors_; -} - namespace bit_util { inline uint64_t SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes) { diff --git a/cpp/src/arrow/compute/util.h b/cpp/src/arrow/compute/util.h index 88dce160ce936..d56e398667f66 100644 --- a/cpp/src/arrow/compute/util.h +++ b/cpp/src/arrow/compute/util.h @@ -24,17 +24,10 @@ #include #include -#include "arrow/buffer.h" #include "arrow/compute/expression.h" #include "arrow/compute/type_fwd.h" -#include "arrow/memory_pool.h" #include "arrow/result.h" -#include "arrow/status.h" -#include "arrow/util/bit_util.h" #include "arrow/util/cpu_info.h" -#include "arrow/util/mutex.h" -#include "arrow/util/thread_pool.h" -#include "arrow/util/type_fwd.h" #if defined(__clang__) || defined(__GNUC__) #define BYTESWAP(x) __builtin_bswap64(x) @@ -77,72 +70,6 @@ class MiniBatch { static constexpr int kMiniBatchLength = 1 << kLogMiniBatchLength; }; -/// Storage used to allocate temporary vectors of a batch size. -/// Temporary vectors should resemble allocating temporary variables on the stack -/// but in the context of vectorized processing where we need to store a vector of -/// temporaries instead of a single value. 
-class ARROW_EXPORT TempVectorStack { - template - friend class TempVectorHolder; - - public: - Status Init(MemoryPool* pool, int64_t size) { - num_vectors_ = 0; - top_ = 0; - buffer_size_ = EstimatedAllocationSize(size); - ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool)); - // Ensure later operations don't accidentally read uninitialized memory. - std::memset(buffer->mutable_data(), 0xFF, size); - buffer_ = std::move(buffer); - return Status::OK(); - } - - private: - static int64_t EstimatedAllocationSize(int64_t size) { - return PaddedAllocationSize(size) + 2 * sizeof(uint64_t); - } - - static int64_t PaddedAllocationSize(int64_t num_bytes) { - // Round up allocation size to multiple of 8 bytes - // to avoid returning temp vectors with unaligned address. - // - // Also add padding at the end to facilitate loads and stores - // using SIMD when number of vector elements is not divisible - // by the number of SIMD lanes. - // - return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding; - } - void alloc(uint32_t num_bytes, uint8_t** data, int* id); - void release(int id, uint32_t num_bytes); - static constexpr uint64_t kGuard1 = 0x3141592653589793ULL; - static constexpr uint64_t kGuard2 = 0x0577215664901532ULL; - static constexpr int64_t kPadding = 64; - int num_vectors_; - int64_t top_; - std::unique_ptr buffer_; - int64_t buffer_size_; -}; - -template -class TempVectorHolder { - friend class TempVectorStack; - - public: - ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); } - T* mutable_data() { return reinterpret_cast(data_); } - TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) { - stack_ = stack; - num_elements_ = num_elements; - stack_->alloc(num_elements * sizeof(T), &data_, &id_); - } - - private: - TempVectorStack* stack_; - uint8_t* data_; - int id_; - uint32_t num_elements_; -}; - namespace bit_util { ARROW_EXPORT void bits_to_indexes(int bit_to_search, int64_t hardware_flags, diff --git a/cpp/src/arrow/compute/util_internal.cc b/cpp/src/arrow/compute/util_internal.cc new file mode 100644 index 0000000000000..cc26982fef110 --- /dev/null +++ b/cpp/src/arrow/compute/util_internal.cc @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/util_internal.h" + +#include "arrow/compute/util.h" +#include "arrow/memory_pool.h" + +namespace arrow { +namespace util { + +Status TempVectorStack::Init(MemoryPool* pool, int64_t size) { + num_vectors_ = 0; + top_ = 0; + buffer_size_ = EstimatedAllocationSize(size); + ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool)); + // Ensure later operations don't accidentally read uninitialized memory. 
+ std::memset(buffer->mutable_data(), 0xFF, size); + buffer_ = std::move(buffer); + return Status::OK(); +} + +int64_t TempVectorStack::PaddedAllocationSize(int64_t num_bytes) { + // Round up allocation size to multiple of 8 bytes + // to avoid returning temp vectors with unaligned address. + // + // Also add padding at the end to facilitate loads and stores + // using SIMD when number of vector elements is not divisible + // by the number of SIMD lanes. + // + return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding; +} + +void TempVectorStack::alloc(uint32_t num_bytes, uint8_t** data, int* id) { + int64_t estimated_alloc_size = EstimatedAllocationSize(num_bytes); + int64_t new_top = top_ + estimated_alloc_size; + // Stack overflow check (see GH-39582). + // XXX cannot return a regular Status because most consumers do not either. + ARROW_CHECK_LE(new_top, buffer_size_) + << "TempVectorStack::alloc overflow: allocating " << estimated_alloc_size + << " on top of " << top_ << " in stack of size " << buffer_size_; + *data = buffer_->mutable_data() + top_ + sizeof(uint64_t); + // We set 8 bytes before the beginning of the allocated range and + // 8 bytes after the end to check for stack overflow (which would + // result in those known bytes being corrupted). + reinterpret_cast(buffer_->mutable_data() + top_)[0] = kGuard1; + reinterpret_cast(buffer_->mutable_data() + new_top)[-1] = kGuard2; + *id = num_vectors_++; + top_ = new_top; +} + +void TempVectorStack::release(int id, uint32_t num_bytes) { + ARROW_DCHECK(num_vectors_ == id + 1); + int64_t size = EstimatedAllocationSize(num_bytes); + ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[-1] == + kGuard2); + ARROW_DCHECK(top_ >= size); + top_ -= size; + ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[0] == + kGuard1); + --num_vectors_; +} + +} // namespace util +} // namespace arrow diff --git a/cpp/src/arrow/compute/util_internal.h b/cpp/src/arrow/compute/util_internal.h index 87e89a3350721..043ff118062e4 100644 --- a/cpp/src/arrow/compute/util_internal.h +++ b/cpp/src/arrow/compute/util_internal.h @@ -17,6 +17,8 @@ #pragma once +#include "arrow/status.h" +#include "arrow/type_fwd.h" #include "arrow/util/logging.h" namespace arrow { @@ -27,5 +29,56 @@ void CheckAlignment(const void* ptr) { ARROW_DCHECK(reinterpret_cast(ptr) % sizeof(T) == 0); } +/// Storage used to allocate temporary vectors of a batch size. +/// Temporary vectors should resemble allocating temporary variables on the stack +/// but in the context of vectorized processing where we need to store a vector of +/// temporaries instead of a single value. 
+class ARROW_EXPORT TempVectorStack { + template + friend class TempVectorHolder; + + public: + Status Init(MemoryPool* pool, int64_t size); + + int64_t AllocatedSize() const { return top_; } + + private: + static int64_t EstimatedAllocationSize(int64_t size) { + return PaddedAllocationSize(size) + 2 * sizeof(uint64_t); + } + + static int64_t PaddedAllocationSize(int64_t num_bytes); + + void alloc(uint32_t num_bytes, uint8_t** data, int* id); + void release(int id, uint32_t num_bytes); + static constexpr uint64_t kGuard1 = 0x3141592653589793ULL; + static constexpr uint64_t kGuard2 = 0x0577215664901532ULL; + static constexpr int64_t kPadding = 64; + int num_vectors_; + int64_t top_; + std::unique_ptr buffer_; + int64_t buffer_size_; +}; + +template +class TempVectorHolder { + friend class TempVectorStack; + + public: + ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); } + T* mutable_data() { return reinterpret_cast(data_); } + TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) { + stack_ = stack; + num_elements_ = num_elements; + stack_->alloc(num_elements * sizeof(T), &data_, &id_); + } + + private: + TempVectorStack* stack_; + uint8_t* data_; + int id_; + uint32_t num_elements_; +}; + } // namespace util } // namespace arrow From cc1e1d87c11666830385332eef9e2a5a102ba1b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 14 May 2024 20:17:06 +0200 Subject: [PATCH 091/105] MINOR: [Release] Update versions for 17.0.0-SNAPSHOT --- ci/scripts/PKGBUILD | 2 +- docs/source/_static/versions.json | 7 ++++++- r/DESCRIPTION | 2 +- r/NEWS.md | 4 +++- r/pkgdown/assets/versions.json | 4 ++-- 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index b0905886dd50f..f6bbc78be710e 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -18,7 +18,7 @@ _realname=arrow pkgbase=mingw-w64-${_realname} pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}" -pkgver=16.0.0.9000 +pkgver=16.1.0.9000 pkgrel=8000 pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)" arch=("any") diff --git a/docs/source/_static/versions.json b/docs/source/_static/versions.json index f8ff19095b3fd..e879fc69138d0 100644 --- a/docs/source/_static/versions.json +++ b/docs/source/_static/versions.json @@ -5,11 +5,16 @@ "url": "https://arrow.apache.org/docs/dev/" }, { - "name": "16.0 (stable)", + "name": "16.1 (stable)", "version": "", "url": "https://arrow.apache.org/docs/", "preferred": true }, + { + "name": "16.0", + "version": "16.0/", + "url": "https://arrow.apache.org/docs/16.0/" + }, { "name": "15.0", "version": "15.0/", diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 38cbaa94a3c25..bb4470e29037d 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -1,6 +1,6 @@ Package: arrow Title: Integration to 'Apache' 'Arrow' -Version: 16.0.0.9000 +Version: 16.1.0.9000 Authors@R: c( person("Neal", "Richardson", email = "neal.p.richardson@gmail.com", role = c("aut")), person("Ian", "Cook", email = "ianmcook@gmail.com", role = c("aut")), diff --git a/r/NEWS.md b/r/NEWS.md index 05f934dac68f3..47c4ac1571dad 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -17,7 +17,9 @@ under the License. --> -# arrow 16.0.0.9000 +# arrow 16.1.0.9000 + +# arrow 16.1.0 * R functions that users write that use functions that Arrow supports in dataset queries now can be used in queries too. Previously, only functions that used arithmetic operators worked. 
For example, `time_hours <- function(mins) mins / 60` worked, but `time_hours_rounded <- function(mins) round(mins / 60)` did not; now both work. These are automatic translations rather than true user-defined functions (UDFs); for UDFs, see `register_scalar_function()`. (#41223) * `summarize()` supports more complex expressions, and correctly handles cases where column names are reused in expressions. diff --git a/r/pkgdown/assets/versions.json b/r/pkgdown/assets/versions.json index 75d179f240515..43f0b3fac62a1 100644 --- a/r/pkgdown/assets/versions.json +++ b/r/pkgdown/assets/versions.json @@ -1,10 +1,10 @@ [ { - "name": "16.0.0.9000 (dev)", + "name": "16.1.0.9000 (dev)", "version": "dev/" }, { - "name": "16.0.0 (release)", + "name": "16.1.0 (release)", "version": "" }, { From bd89c4298612d37bb752f7823356c489b1e79162 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 14 May 2024 20:17:07 +0200 Subject: [PATCH 092/105] MINOR: [Release] Update .deb/.rpm changelogs for 16.1.0 --- .../linux-packages/apache-arrow-apt-source/debian/changelog | 6 ++++++ .../apache-arrow-release/yum/apache-arrow-release.spec.in | 3 +++ dev/tasks/linux-packages/apache-arrow/debian/changelog | 6 ++++++ dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in | 3 +++ 4 files changed, 18 insertions(+) diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog index 60e745301d9db..04aa586dc3c96 100644 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow-apt-source (16.1.0-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Thu, 09 May 2024 07:21:29 -0000 + apache-arrow-apt-source (16.0.0-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in index 676c9e0d16dea..f0eb785dd6bc7 100644 --- a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in +++ b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in @@ -102,6 +102,9 @@ else fi %changelog +* Thu May 09 2024 Raúl Cumplido - 16.1.0-1 +- New upstream release. + * Tue Apr 16 2024 Raúl Cumplido - 16.0.0-1 - New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/debian/changelog b/dev/tasks/linux-packages/apache-arrow/debian/changelog index e255e84096e4e..35cc598fe6f87 100644 --- a/dev/tasks/linux-packages/apache-arrow/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow (16.1.0-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Thu, 09 May 2024 07:21:29 -0000 + apache-arrow (16.0.0-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in index 3ede1814b865d..c6148e9260586 100644 --- a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in +++ b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in @@ -881,6 +881,9 @@ Documentation for Apache Parquet GLib. %endif %changelog +* Thu May 09 2024 Raúl Cumplido - 16.1.0-1 +- New upstream release. + * Tue Apr 16 2024 Raúl Cumplido - 16.0.0-1 - New upstream release. 
From e411e0e211206dd7040668dac08ae935e8037aa9 Mon Sep 17 00:00:00 2001
From: Curt Hagenlocher
Date: Tue, 14 May 2024 12:52:59 -0700
Subject: [PATCH 093/105] GH-41602: [C#] Resolve build warnings (#41645)

### What changes are included in this PR?

Adds annotations or suppressions to disable build warnings. Configures projects to produce an error on warnings.

### Are these changes tested?

Changes are covered by existing tests.

Closes #41602

* GitHub Issue: #41602

Authored-by: Curt Hagenlocher
Signed-off-by: Curt Hagenlocher
---
 csharp/Directory.Build.props                  |  4 ++-
 csharp/feather.png                            | Bin 0 -> 40042 bytes
 csharp/src/Apache.Arrow/Arrays/BinaryArray.cs |  4 +--
 .../Apache.Arrow/Arrays/Decimal256Array.cs    | 14 ++++----
 .../Memory/NativeMemoryManager.cs             |  2 ++
 .../FlightSqlServerTests.cs                   | 25 ++++++++------
 .../Apache.Arrow.Flight.Tests/FlightTests.cs  | 30 +++++++++---------
 .../Apache.Arrow.Tests/ArrowArrayTests.cs     |  6 ++--
 .../Apache.Arrow.Tests/DurationArrayTests.cs  |  2 +-
 9 files changed, 47 insertions(+), 40 deletions(-)
 create mode 100644 csharp/feather.png

diff --git a/csharp/Directory.Build.props b/csharp/Directory.Build.props
index f6d42241f95cf..3c06d3cd31d90 100644
--- a/csharp/Directory.Build.props
+++ b/csharp/Directory.Build.props
@@ -37,12 +37,13 @@ latest true $(CSharpDir)ApacheArrow.snk + true The Apache Software Foundation - https://www.apache.org/images/feather.png + feather.png LICENSE.txt https://arrow.apache.org/ @@ -55,6 +56,7 @@ +
diff --git a/csharp/feather.png b/csharp/feather.png
new file mode 100644
index 0000000000000000000000000000000000000000..7b596e6683ddbb083f672c3b5d9270a9a3035ec0
GIT binary patch
[literal 40042: base85-encoded binary image data for csharp/feather.png omitted]
diff --git a/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs b/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs
index 0c84fa2be23d9..bd5d9315e9fc4 100644
--- a/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs
+++ b/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs
@@ -383,8 +383,8 @@ IEnumerator IEnumerable.GetEnumerator()
         int ICollection.Count => Length;
         bool ICollection.IsReadOnly => true;
-        void ICollection.Add(byte[]? item) => throw new NotSupportedException("Collection is read-only.");
-        bool ICollection.Remove(byte[]? item) => throw new NotSupportedException("Collection is read-only.");
+        void ICollection.Add(byte[] item) => throw new NotSupportedException("Collection is read-only.");
+        bool ICollection.Remove(byte[] item) => throw new NotSupportedException("Collection is read-only.");
         void ICollection.Clear() => throw new NotSupportedException("Collection is read-only.");
         bool ICollection.Contains(byte[] item)
diff --git a/csharp/src/Apache.Arrow/Arrays/Decimal256Array.cs b/csharp/src/Apache.Arrow/Arrays/Decimal256Array.cs
index fa6f765475240..52bfb9eb20768 100644
--- a/csharp/src/Apache.Arrow/Arrays/Decimal256Array.cs
+++ b/csharp/src/Apache.Arrow/Arrays/Decimal256Array.cs
@@ -13,6 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#nullable enable + using System; using System.Collections; using System.Collections.Generic; @@ -23,7 +25,7 @@ namespace Apache.Arrow { - public class Decimal256Array : FixedSizeBinaryArray, IReadOnlyList, IReadOnlyList + public class Decimal256Array : FixedSizeBinaryArray, IReadOnlyList, IReadOnlyList { public class Builder : BuilderBase { @@ -178,7 +180,7 @@ public Decimal256Array(ArrayData data) return list; } - public string GetString(int index) + public string? GetString(int index) { if (IsNull(index)) { @@ -230,10 +232,10 @@ public bool TryGetSqlDecimal(int index, out SqlDecimal? value) } } - int IReadOnlyCollection.Count => Length; - string? IReadOnlyList.this[int index] => GetString(index); + int IReadOnlyCollection.Count => Length; + string? IReadOnlyList.this[int index] => GetString(index); - IEnumerator IEnumerable.GetEnumerator() + IEnumerator IEnumerable.GetEnumerator() { for (int index = 0; index < Length; index++) { @@ -241,6 +243,6 @@ IEnumerator IEnumerable.GetEnumerator() } } - IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); } } diff --git a/csharp/src/Apache.Arrow/Memory/NativeMemoryManager.cs b/csharp/src/Apache.Arrow/Memory/NativeMemoryManager.cs index 8f0210b28240f..d42ee5279e795 100644 --- a/csharp/src/Apache.Arrow/Memory/NativeMemoryManager.cs +++ b/csharp/src/Apache.Arrow/Memory/NativeMemoryManager.cs @@ -40,10 +40,12 @@ internal NativeMemoryManager(INativeAllocationOwner owner, IntPtr ptr, int offse _owner = owner; } +#pragma warning disable CA2015 // TODO: is this correct? ~NativeMemoryManager() { Dispose(false); } +#pragma warning restore CA2015 public override unsafe Span GetSpan() { diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/FlightSqlServerTests.cs b/csharp/test/Apache.Arrow.Flight.Sql.Tests/FlightSqlServerTests.cs index 4ad5bde0874a8..e5e64b073f799 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/FlightSqlServerTests.cs +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/FlightSqlServerTests.cs @@ -14,6 +14,7 @@ // limitations under the License. 
#nullable enable + using System; using System.Collections.Generic; using System.Collections.ObjectModel; @@ -65,7 +66,7 @@ public async Task EnsureTheCorrectActionsAreGiven() var streamWriter = new MockServerStreamWriter(); //When - await producer.ListActions(streamWriter, new MockServerCallContext()).ConfigureAwait(false); + await producer.ListActions(streamWriter, new MockServerCallContext()); var actions = streamWriter.Messages.ToArray(); Assert.Equal(FlightSqlUtils.FlightSqlActions, actions); @@ -115,7 +116,7 @@ public void EnsureTableSchemaIsCorrectWithoutTableSchema(bool includeTableSchema [InlineData(typeof(CommandGetImportedKeys), "GetImportedKeysFlightInfo")] [InlineData(typeof(CommandGetCrossReference), "GetCrossReferenceFlightInfo")] [InlineData(typeof(CommandGetXdbcTypeInfo), "GetXdbcTypeFlightInfo")] - public async void EnsureGetFlightInfoIsCorrectlyRoutedForCommand(Type commandType, string expectedResult) + public async Task EnsureGetFlightInfoIsCorrectlyRoutedForCommand(Type commandType, string expectedResult) { //Given var command = (IMessage) Activator.CreateInstance(commandType)!; @@ -131,7 +132,7 @@ public async void EnsureGetFlightInfoIsCorrectlyRoutedForCommand(Type commandTyp [Fact] - public async void EnsureAnInvalidOperationExceptionIsThrownWhenACommandIsNotSupportedAndHasNoDescriptor() + public async Task EnsureAnInvalidOperationExceptionIsThrownWhenACommandIsNotSupportedAndHasNoDescriptor() { //Given var producer = new TestFlightSqlSever(); @@ -145,7 +146,7 @@ public async void EnsureAnInvalidOperationExceptionIsThrownWhenACommandIsNotSupp } [Fact] - public async void EnsureAnInvalidOperationExceptionIsThrownWhenACommandIsNotSupported() + public async Task EnsureAnInvalidOperationExceptionIsThrownWhenACommandIsNotSupported() { //Given var producer = new TestFlightSqlSever(); @@ -175,7 +176,7 @@ public async void EnsureAnInvalidOperationExceptionIsThrownWhenACommandIsNotSupp [InlineData(typeof(CommandGetImportedKeys), "DoGetImportedKeys")] [InlineData(typeof(CommandGetCrossReference), "DoGetCrossReference")] [InlineData(typeof(CommandGetXdbcTypeInfo), "DoGetXbdcTypeInfo")] - public async void EnsureDoGetIsCorrectlyRoutedForADoGetCommand(Type commandType, string expectedResult) + public async Task EnsureDoGetIsCorrectlyRoutedForADoGetCommand(Type commandType, string expectedResult) { //Given var producer = new TestFlightSqlSever(); @@ -192,7 +193,7 @@ public async void EnsureDoGetIsCorrectlyRoutedForADoGetCommand(Type commandType, } [Fact] - public async void EnsureAnInvalidOperationExceptionIsThrownWhenADoGetCommandIsNotSupported() + public async Task EnsureAnInvalidOperationExceptionIsThrownWhenADoGetCommandIsNotSupported() { //Given var producer = new TestFlightSqlSever(); @@ -213,7 +214,7 @@ public async void EnsureAnInvalidOperationExceptionIsThrownWhenADoGetCommandIsNo [InlineData(SqlAction.CloseRequest, typeof(ActionClosePreparedStatementRequest), "ClosePreparedStatement")] [InlineData(SqlAction.CreateRequest, typeof(ActionCreatePreparedStatementRequest), "CreatePreparedStatement")] [InlineData("BadCommand", typeof(ActionCreatePreparedStatementRequest), "Action type BadCommand not supported", true)] - public async void EnsureDoActionIsCorrectlyRoutedForAnActionRequest(string actionType, Type actionBodyType, string expectedResponse, bool isException = false) + public async Task EnsureDoActionIsCorrectlyRoutedForAnActionRequest(string actionType, Type actionBodyType, string expectedResponse, bool isException = false) { //Given var producer = new 
TestFlightSqlSever(); @@ -237,19 +238,19 @@ public async void EnsureDoActionIsCorrectlyRoutedForAnActionRequest(string actio [InlineData(typeof(CommandPreparedStatementQuery), "PutPreparedStatementQuery")] [InlineData(typeof(CommandPreparedStatementUpdate), "PutPreparedStatementUpdate")] [InlineData(typeof(CommandGetXdbcTypeInfo), "Command CommandGetXdbcTypeInfo not supported", true)] - public async void EnsureDoPutIsCorrectlyRoutedForTheCommand(Type commandType, string expectedResponse, bool isException = false) + public async Task EnsureDoPutIsCorrectlyRoutedForTheCommand(Type commandType, string expectedResponse, bool isException = false) { //Given var command = (IMessage) Activator.CreateInstance(commandType)!; var producer = new TestFlightSqlSever(); var descriptor = FlightDescriptor.CreateCommandDescriptor(command.PackAndSerialize().ToArray()); var recordBatch = new RecordBatch(new Schema(new List(), null), System.Array.Empty(), 0); - var reader = new MockStreamReader(await recordBatch.ToFlightData(descriptor).ConfigureAwait(false)); + var reader = new MockStreamReader(await recordBatch.ToFlightData(descriptor)); var batchReader = new FlightServerRecordBatchStreamReader(reader); var mockStreamWriter = new MockServerStreamWriter(); //When - async Task Act() => await producer.DoPut(batchReader, mockStreamWriter, new MockServerCallContext()).ConfigureAwait(false); + async Task Act() => await producer.DoPut(batchReader, mockStreamWriter, new MockServerCallContext()); var exception = await Record.ExceptionAsync(Act); string? actualMessage = isException ? exception?.Message : mockStreamWriter.Messages[0].ApplicationMetadata.ToStringUtf8(); @@ -271,7 +272,7 @@ private class MockServerCallContext : ServerCallContext protected override CancellationToken CancellationTokenCore => default; protected override Metadata ResponseTrailersCore => new(); protected override Status StatusCore { get; set; } - protected override WriteOptions WriteOptionsCore { get; set; } = WriteOptions.Default; + protected override WriteOptions? WriteOptionsCore { get; set; } = WriteOptions.Default; protected override AuthContext AuthContextCore => new("", new Dictionary>()); } } @@ -325,7 +326,7 @@ public static async Task GetSchema(this IEnumerable flightDa public static async Task> ToFlightData(this RecordBatch recordBatch, FlightDescriptor? 
descriptor = null) { var responseStream = new MockFlightServerRecordBatchStreamWriter(); - await responseStream.WriteRecordBatchAsync(recordBatch).ConfigureAwait(false); + await responseStream.WriteRecordBatchAsync(recordBatch); if (descriptor == null) { return responseStream.FlightData; diff --git a/csharp/test/Apache.Arrow.Flight.Tests/FlightTests.cs b/csharp/test/Apache.Arrow.Flight.Tests/FlightTests.cs index ebc38354b5c28..aac4e4209240a 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/FlightTests.cs +++ b/csharp/test/Apache.Arrow.Flight.Tests/FlightTests.cs @@ -288,9 +288,9 @@ public async Task TestHandshake() { var duplexStreamingCall = _flightClient.Handshake(); - await duplexStreamingCall.RequestStream.WriteAsync(new FlightHandshakeRequest(ByteString.Empty)).ConfigureAwait(false); - await duplexStreamingCall.RequestStream.CompleteAsync().ConfigureAwait(false); - var results = await duplexStreamingCall.ResponseStream.ToListAsync().ConfigureAwait(false); + await duplexStreamingCall.RequestStream.WriteAsync(new FlightHandshakeRequest(ByteString.Empty)); + await duplexStreamingCall.RequestStream.CompleteAsync(); + var results = await duplexStreamingCall.ResponseStream.ToListAsync(); Assert.Single(results); Assert.Equal("Done", results.First().Payload.ToStringUtf8()); @@ -303,10 +303,10 @@ public async Task TestSingleExchange() var duplexStreamingCall = _flightClient.DoExchange(flightDescriptor); var expectedBatch = CreateTestBatch(0, 100); - await duplexStreamingCall.RequestStream.WriteAsync(expectedBatch).ConfigureAwait(false); - await duplexStreamingCall.RequestStream.CompleteAsync().ConfigureAwait(false); + await duplexStreamingCall.RequestStream.WriteAsync(expectedBatch); + await duplexStreamingCall.RequestStream.CompleteAsync(); - var results = await duplexStreamingCall.ResponseStream.ToListAsync().ConfigureAwait(false); + var results = await duplexStreamingCall.ResponseStream.ToListAsync(); Assert.Single(results); ArrowReaderVerifier.CompareBatches(expectedBatch, results.FirstOrDefault()); @@ -320,11 +320,11 @@ public async Task TestMultipleExchange() var expectedBatch1 = CreateTestBatch(0, 100); var expectedBatch2 = CreateTestBatch(100, 100); - await duplexStreamingCall.RequestStream.WriteAsync(expectedBatch1).ConfigureAwait(false); - await duplexStreamingCall.RequestStream.WriteAsync(expectedBatch2).ConfigureAwait(false); - await duplexStreamingCall.RequestStream.CompleteAsync().ConfigureAwait(false); + await duplexStreamingCall.RequestStream.WriteAsync(expectedBatch1); + await duplexStreamingCall.RequestStream.WriteAsync(expectedBatch2); + await duplexStreamingCall.RequestStream.CompleteAsync(); - var results = await duplexStreamingCall.ResponseStream.ToListAsync().ConfigureAwait(false); + var results = await duplexStreamingCall.ResponseStream.ToListAsync(); ArrowReaderVerifier.CompareBatches(expectedBatch1, results[0]); ArrowReaderVerifier.CompareBatches(expectedBatch2, results[1]); @@ -338,8 +338,8 @@ public async Task TestExchangeWithMetadata() var expectedBatch = CreateTestBatch(0, 100); var expectedMetadata = ByteString.CopyFromUtf8("test metadata"); - await duplexStreamingCall.RequestStream.WriteAsync(expectedBatch, expectedMetadata).ConfigureAwait(false); - await duplexStreamingCall.RequestStream.CompleteAsync().ConfigureAwait(false); + await duplexStreamingCall.RequestStream.WriteAsync(expectedBatch, expectedMetadata); + await duplexStreamingCall.RequestStream.CompleteAsync(); List actualMetadata = new List(); List actualBatch = new List(); @@ -358,9 +358,9 @@ public 
async Task TestHandshakeWithSpecificMessage() { var duplexStreamingCall = _flightClient.Handshake(); - await duplexStreamingCall.RequestStream.WriteAsync(new FlightHandshakeRequest(ByteString.CopyFromUtf8("Hello"))).ConfigureAwait(false); - await duplexStreamingCall.RequestStream.CompleteAsync().ConfigureAwait(false); - var results = await duplexStreamingCall.ResponseStream.ToListAsync().ConfigureAwait(false); + await duplexStreamingCall.RequestStream.WriteAsync(new FlightHandshakeRequest(ByteString.CopyFromUtf8("Hello"))); + await duplexStreamingCall.RequestStream.CompleteAsync(); + var results = await duplexStreamingCall.ResponseStream.ToListAsync(); Assert.Single(results); Assert.Equal("Hello handshake", results.First().Payload.ToStringUtf8()); diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs index d3032b8d4ac40..c3c21c412d20d 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs @@ -212,11 +212,11 @@ private static void TestPrimitiveArrayAsCollection(IRe // Parameter 'values' must contain four values. The last value must be distinct from the rest. private static void TestObjectArrayAsCollection(TArray array, T nullValue, IReadOnlyList values) where T : class - where TArray : IArrowArray, ICollection + where TArray : IArrowArray, ICollection { Assert.NotNull(array); Assert.Equal(4, values.Count); - var collection = (ICollection)array; + var collection = (ICollection)array; Assert.Equal(array.Length, collection.Count); Assert.Equal(4, collection.Count); @@ -232,7 +232,7 @@ private static void TestObjectArrayAsCollection(TArray array, T nullV Assert.False(collection.Contains(values[3])); T sentinel = values[2]; - T?[] destArr = { sentinel, sentinel, sentinel, sentinel, sentinel, sentinel }; + T[] destArr = { sentinel, sentinel, sentinel, sentinel, sentinel, sentinel }; collection.CopyTo(destArr, 1); Assert.Equal(sentinel, destArr[0]); Assert.Equal(values[0], destArr[1]); diff --git a/csharp/test/Apache.Arrow.Tests/DurationArrayTests.cs b/csharp/test/Apache.Arrow.Tests/DurationArrayTests.cs index 59080d739b10b..412f67de5f0fb 100644 --- a/csharp/test/Apache.Arrow.Tests/DurationArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/DurationArrayTests.cs @@ -115,7 +115,7 @@ public void AppendTimeSpanGivesSameTimeSpan(TimeSpan? timeSpan, DurationType typ Assert.Equal(timeSpan, array.GetTimeSpan(0)); IReadOnlyList asList = array; - Assert.Equal(1, asList.Count); + Assert.Single(asList); Assert.Equal(timeSpan, asList[0]); } } From 657c4faf21700c0899703a4759bde76235c38199 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 14 May 2024 17:54:40 -0300 Subject: [PATCH 094/105] GH-41596: [C++] fixed_width_internal.h: Simplify docstring and support bit-sized types (BOOL) (#41597) ### Rationale for this change Post-merge feedback from #41297. ### What changes are included in this PR? - Supporting `BOOL` as both a top-level and nested in FSL types - Removing the long example from the docstring of `IsFixedWidthLike` These changes don't affect users because this header was added recently and not released. ### Are these changes tested? Yes, by existing and new test cases. 
* GitHub Issue: #41596 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- .../vector_selection_filter_internal.cc | 4 +- .../kernels/vector_selection_internal.cc | 4 +- .../kernels/vector_selection_take_internal.cc | 5 +- cpp/src/arrow/util/fixed_width_internal.cc | 100 +++--- cpp/src/arrow/util/fixed_width_internal.h | 286 +++++++++--------- cpp/src/arrow/util/fixed_width_test.cc | 21 +- 6 files changed, 212 insertions(+), 208 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc index 8d43c65668d4b..5e24331fe96f2 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc @@ -164,7 +164,7 @@ class PrimitiveFilterImpl { values_is_valid_(values.buffers[0].data), // No offset applied for boolean because it's a bitmap values_data_(kIsBoolean ? values.buffers[1].data - : util::OffsetPointerOfFixedWidthValues(values)), + : util::OffsetPointerOfFixedByteWidthValues(values)), values_null_count_(values.null_count), values_offset_(values.offset), values_length_(values.length), @@ -470,7 +470,7 @@ Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult // validity bitmap. const bool allocate_validity = values.null_count != 0 || !filter_null_count_is_zero; - DCHECK(util::IsFixedWidthLike(values, /*force_null_count=*/false)); + DCHECK(util::IsFixedWidthLike(values)); const int64_t bit_width = util::FixedWidthInBits(*values.type); RETURN_NOT_OK(util::internal::PreallocateFixedWidthArrayData( ctx, output_length, /*source=*/values, allocate_validity, out_arr)); diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc index 93cd5060348db..2ba660e49ac38 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc @@ -898,7 +898,7 @@ Status FSLFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) // PrimitiveFilterExec for a fixed-size list array. if (util::IsFixedWidthLike(values, /*force_null_count=*/true, - /*exclude_dictionary=*/true)) { + /*exclude_bool_and_dictionary=*/true)) { const auto byte_width = util::FixedWidthInBytes(*values.type); // 0 is a valid byte width for FixedSizeList, but PrimitiveFilterExec // might not handle it correctly. @@ -971,7 +971,7 @@ Status FSLTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { // PrimitiveTakeExec for a fixed-size list array. if (util::IsFixedWidthLike(values, /*force_null_count=*/true, - /*exclude_dictionary=*/true)) { + /*exclude_bool_and_dictionary=*/true)) { const auto byte_width = util::FixedWidthInBytes(*values.type); // Additionally, PrimitiveTakeExec is only implemented for specific byte widths. // TODO(GH-41301): Extend PrimitiveTakeExec for any fixed-width type. 
diff --git a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc index 48a2de9936cd4..1a9af0efcd700 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc @@ -347,7 +347,7 @@ struct PrimitiveTakeImpl { static void Exec(const ArraySpan& values, const ArraySpan& indices, ArrayData* out_arr) { DCHECK_EQ(util::FixedWidthInBytes(*values.type), kValueWidth); - const auto* values_data = util::OffsetPointerOfFixedWidthValues(values); + const auto* values_data = util::OffsetPointerOfFixedByteWidthValues(values); const uint8_t* values_is_valid = values.buffers[0].data; auto values_offset = values.offset; @@ -588,8 +588,7 @@ Status PrimitiveTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ArrayData* out_arr = out->array_data().get(); - DCHECK(util::IsFixedWidthLike(values, /*force_null_count=*/false, - /*exclude_dictionary=*/true)); + DCHECK(util::IsFixedWidthLike(values)); const int64_t bit_width = util::FixedWidthInBits(*values.type); // TODO: When neither values nor indices contain nulls, we can skip diff --git a/cpp/src/arrow/util/fixed_width_internal.cc b/cpp/src/arrow/util/fixed_width_internal.cc index 164af3cff66b3..3f12fafb54f0f 100644 --- a/cpp/src/arrow/util/fixed_width_internal.cc +++ b/cpp/src/arrow/util/fixed_width_internal.cc @@ -33,11 +33,12 @@ namespace arrow::util { using ::arrow::internal::checked_cast; bool IsFixedWidthLike(const ArraySpan& source, bool force_null_count, - bool exclude_dictionary) { - return IsFixedWidthLike(source, force_null_count, - [exclude_dictionary](const DataType& type) { - return !exclude_dictionary || type.id() != Type::DICTIONARY; - }); + bool exclude_bool_and_dictionary) { + return IsFixedWidthLike( + source, force_null_count, [exclude_bool_and_dictionary](const DataType& type) { + return !exclude_bool_and_dictionary || + (type.id() != Type::DICTIONARY && type.id() != Type::BOOL); + }); } static int64_t FixedWidthInBytesFallback(const FixedSizeListType& fixed_size_list_type) { @@ -73,16 +74,37 @@ int64_t FixedWidthInBytes(const DataType& type) { return -1; } +static int64_t FixedWidthInBitsFallback(const FixedSizeListType& fixed_size_list_type) { + auto* fsl = &fixed_size_list_type; + int64_t list_size = fsl->list_size(); + for (auto type = fsl->value_type().get();;) { + auto type_id = type->id(); + if (type_id == Type::FIXED_SIZE_LIST) { + fsl = checked_cast(type); + list_size *= fsl->list_size(); + type = fsl->value_type().get(); + continue; + } + if (is_fixed_width(type_id)) { + const int64_t flat_bit_width = list_size * type->bit_width(); + DCHECK_GE(flat_bit_width, 0); + return flat_bit_width; + } + break; + } + return -1; +} + int64_t FixedWidthInBits(const DataType& type) { auto type_id = type.id(); if (is_fixed_width(type_id)) { return type.bit_width(); } - const int64_t byte_width = FixedWidthInBytes(type); - if (ARROW_PREDICT_FALSE(byte_width < 0)) { - return -1; + if (type_id == Type::FIXED_SIZE_LIST) { + auto& fsl = ::arrow::internal::checked_cast(type); + return FixedWidthInBitsFallback(fsl); } - return byte_width * 8; + return -1; } namespace internal { @@ -121,9 +143,6 @@ Status PreallocateFixedWidthArrayData(::arrow::compute::KernelContext* ctx, if (type->id() == Type::FIXED_SIZE_LIST) { auto& fsl_type = checked_cast(*type); auto& value_type = fsl_type.value_type(); - if (ARROW_PREDICT_FALSE(value_type->id() == Type::BOOL)) { - return 
Status::Invalid("PreallocateFixedWidthArrayData: Invalid type: ", fsl_type); - } if (ARROW_PREDICT_FALSE(value_type->id() == Type::DICTIONARY)) { return Status::NotImplemented( "PreallocateFixedWidthArrayData: DICTIONARY type allocation: ", *type); @@ -146,16 +165,13 @@ Status PreallocateFixedWidthArrayData(::arrow::compute::KernelContext* ctx, } // namespace internal -/// \pre same as OffsetPointerOfFixedWidthValues -/// \pre source.type->id() != Type::BOOL -static const uint8_t* OffsetPointerOfFixedWidthValuesFallback(const ArraySpan& source) { +std::pair OffsetPointerOfFixedBitWidthValues( + const ArraySpan& source) { using OffsetAndListSize = std::pair; auto get_offset = [](auto pair) { return pair.first; }; auto get_list_size = [](auto pair) { return pair.second; }; ::arrow::internal::SmallVector stack; - DCHECK_NE(source.type->id(), Type::BOOL); - int64_t list_size = 1; auto* array = &source; while (array->type->id() == Type::FIXED_SIZE_LIST) { @@ -166,31 +182,25 @@ static const uint8_t* OffsetPointerOfFixedWidthValuesFallback(const ArraySpan& s // Now that innermost values were reached, pop the stack and calculate the offset // in bytes of the innermost values buffer by considering the offset at each // level of nesting. - DCHECK(array->type->id() != Type::BOOL && is_fixed_width(*array->type)); + DCHECK(is_fixed_width(*array->type)); DCHECK(array == &source || !array->MayHaveNulls()) << "OffsetPointerOfFixedWidthValues: array is expected to be flat or have no " "nulls in the arrays nested by FIXED_SIZE_LIST."; - int64_t value_width = array->type->byte_width(); - int64_t offset_in_bytes = array->offset * value_width; + int64_t value_width_in_bits = array->type->bit_width(); + int64_t offset_in_bits = array->offset * value_width_in_bits; for (auto it = stack.rbegin(); it != stack.rend(); ++it) { - value_width *= get_list_size(*it); - offset_in_bytes += get_offset(*it) * value_width; + value_width_in_bits *= get_list_size(*it); + offset_in_bits += get_offset(*it) * value_width_in_bits; } - return value_width < 0 ? nullptr : array->GetValues(1, offset_in_bytes); + DCHECK_GE(value_width_in_bits, 0); + const auto* values_ptr = array->GetValues(1, 0); + return {static_cast(offset_in_bits % 8), values_ptr + (offset_in_bits / 8)}; } -const uint8_t* OffsetPointerOfFixedWidthValues(const ArraySpan& source) { - auto type_id = source.type->id(); - if (is_fixed_width(type_id)) { - if (ARROW_PREDICT_FALSE(type_id == Type::BOOL)) { - // BOOL arrays are bit-packed, thus a byte-aligned pointer cannot be produced in the - // general case. Returning something for BOOL arrays that happen to byte-align - // because offset=0 would create too much confusion. - return nullptr; - } - return source.GetValues(1, 0) + source.offset * source.type->byte_width(); - } - return OffsetPointerOfFixedWidthValuesFallback(source); +const uint8_t* OffsetPointerOfFixedByteWidthValues(const ArraySpan& source) { + DCHECK(IsFixedWidthLike(source, /*force_null_count=*/false, + [](const DataType& type) { return type.id() != Type::BOOL; })); + return OffsetPointerOfFixedBitWidthValues(source).second; } /// \brief Get the mutable pointer to the fixed-width values of an array @@ -203,24 +213,20 @@ const uint8_t* OffsetPointerOfFixedWidthValues(const ArraySpan& source) { /// \return The mutable pointer to the fixed-width byte blocks of the array. If /// pre-conditions are not satisfied, the return values is undefined. 
uint8_t* MutableFixedWidthValuesPointer(ArrayData* mutable_array) { - auto type_id = mutable_array->type->id(); - if (type_id == Type::FIXED_SIZE_LIST) { - auto* array = mutable_array; - do { - DCHECK_EQ(array->offset, 0); - DCHECK_EQ(array->child_data.size(), 1) << array->type->ToString(true) << " part of " - << mutable_array->type->ToString(true); - array = array->child_data[0].get(); - } while (array->type->id() == Type::FIXED_SIZE_LIST); + auto* array = mutable_array; + auto type_id = array->type->id(); + while (type_id == Type::FIXED_SIZE_LIST) { DCHECK_EQ(array->offset, 0); - DCHECK(array->type->id() != Type::BOOL && is_fixed_width(*array->type)); - return array->GetMutableValues(1, 0); + DCHECK_EQ(array->child_data.size(), 1) << array->type->ToString(true) << " part of " + << mutable_array->type->ToString(true); + array = array->child_data[0].get(); + type_id = array->type->id(); } DCHECK_EQ(mutable_array->offset, 0); // BOOL is allowed here only because the offset is expected to be 0, // so the byte-aligned pointer also points to the first *bit* of the buffer. DCHECK(is_fixed_width(type_id)); - return mutable_array->GetMutableValues(1, 0); + return array->GetMutableValues(1, 0); } } // namespace arrow::util diff --git a/cpp/src/arrow/util/fixed_width_internal.h b/cpp/src/arrow/util/fixed_width_internal.h index f6959485fbd01..232411f4c4a56 100644 --- a/cpp/src/arrow/util/fixed_width_internal.h +++ b/cpp/src/arrow/util/fixed_width_internal.h @@ -56,146 +56,140 @@ namespace arrow::util { /// Additionally, we say that a type is "fixed-width like" if it's a fixed-width as /// defined above, or if it's a fixed-size list (or nested fixed-size lists) and /// the innermost type is fixed-width and the following restrictions also apply: -/// - The value type of the innermost fixed-size list is not BOOL (it has to be excluded -/// because a 1-bit type doesn't byte-align) /// - Only the top-level array may have nulls, all the inner array have to be completely /// free of nulls so we don't need to manage internal validity bitmaps. /// -/// Take the following `fixed_size_list, 3>` array as an -/// example: -/// -/// [ -/// [[1, 2], [3, 4], [ 5, 6]], -/// null, -/// [[7, 8], [9, 10], [11, 12]] -/// ] -/// -/// in memory, it would look like: -/// -/// { -/// type: fixed_size_list, 3>, -/// length: 3, -/// null_count: 1, -/// offset: 0, -/// buffers: [ -/// 0: [0b00000101] -/// ], -/// child_data: [ -/// 0: { -/// type: fixed_size_list, -/// length: 9, -/// null_count: 0, -/// offset: 0, -/// buffers: [0: NULL], -/// child_data: [ -/// 0: { -/// type: int32, -/// length: 18, -/// null_count: 0, -/// offset: 0, -/// buffers: [ -/// 0: NULL, -/// 1: [ 1, 2, 3, 4, 5, 6, -/// 0, 0, 0, 0, 0, 0 -/// 7, 8, 9, 10, 11, 12 ] -/// ], -/// child_data: [] -/// } -/// ] -/// } -/// ] -/// } -/// -/// This layout fits the fixed-width like definition because the innermost type -/// is byte-aligned fixed-width (int32 = 4 bytes) and the internal arrays don't -/// have nulls. The validity bitmap is only needed at the top-level array. -/// -/// Writing to this array can be done in the same way writing to a flat fixed-width -/// array is done, by: -/// 1. Updating the validity bitmap at the top-level array if nulls are present. -/// 2. Updating a continuous fixed-width block of memory through a single pointer. 
-/// -/// The length of this block of memory is the product of the list sizes in the -/// `FixedSizeList` types and the byte width of the innermost fixed-width type: -/// -/// 3 * 2 * 4 = 24 bytes -/// -/// Writing the `[[1, 2], [3, 4], [5, 6]]` value at a given index can be done by -/// simply setting the validity bit to 1 and writing the 24-byte sequence of -/// integers `[1, 2, 3, 4, 5, 6]` to the memory block at `byte_ptr + index * 24`. -/// -/// The length of the top-level array fully defines the lengths that all the nested -/// arrays must have, which makes defining all the lengths as easy as defining the -/// length of the top-level array. -/// -/// length = 3 -/// child_data[0].length == 3 * 3 == 9 -/// child_data[0].child_data[0].length == 3 * 3 * 2 == 18 -/// -/// child_data[0].child_data[0].buffers[1].size() >= -/// (3 * (3 * 2 * sizeof(int32)) == 3 * 24 == 72) -/// -/// Dealing with offsets is a bit involved. Let's say the array described above has -/// the offsets 2, 5, and 7: -/// -/// { -/// type: fixed_size_list, 3>, -/// offset: 2, -/// ... -/// child_data: [ -/// 0: { -/// type: fixed_size_list, -/// offset: 5, -/// ... -/// child_data: [ -/// 0: { -/// type: int32, -/// offset: 7, -/// buffers: [ -/// 0: NULL, -/// 1: [ 1, 1, 1, 1, 1, 1, 1, // 7 values skipped -/// 0,1, 0,1, 0,1, 0,1, 0,1, // 5 [x,x] values skipped -/// -/// 0,0,0,0,0,1, // -/// 0,0,0,0,0,1, // 2 [[x,x], [x,x], [x,x]] values skipped -/// -/// 1, 2, 3, 4, 5, 6, // -/// 0, 0, 0, 0, 0, 0 // the actual values -/// 7, 8, 9, 10, 11, 12 // -/// ] -/// ], -/// } -/// ] -/// } -/// ] -/// } -/// -/// The offset of the innermost values buffer, in bytes, is calculated as: -/// -/// ((2 * 3) + (5 * 2) + 7) * sizeof(int32) = 29 * 4 bytes = 116 bytes -/// -/// In general, the formula to calculate the offset of the innermost values buffer is: -/// -/// ((off_0 * fsl_size_0) + (off_1 * fsl_size_1) + ... + innermost_off) -/// * sizeof(innermost_type) -/// -/// `OffsetPointerOfFixedWidthValues()` can calculate this byte offset and return the -/// pointer to the first relevant byte of the innermost values buffer. -/// /// \param source The array to check /// \param force_null_count If true, GetNullCount() is used instead of null_count -/// \param exclude_dictionary If true, DICTIONARY is excluded from the -/// is_fixed_width() types. Default: false. +/// \param exclude_bool_and_dictionary If true, BOOL and DICTIONARY are excluded from +/// the is_fixed_width() types. Default: false. 
ARROW_EXPORT bool IsFixedWidthLike(const ArraySpan& source, bool force_null_count = false, - bool exclude_dictionary = false); + bool exclude_bool_and_dictionary = false); + +// Take the following `fixed_size_list, 3>` array as an +// example: +// +// [ +// [[1, 2], [3, 4], [ 5, 6]], +// null, +// [[7, 8], [9, 10], [11, 12]] +// ] +// +// in memory, it would look like: +// +// { +// type: fixed_size_list, 3>, +// length: 3, +// null_count: 1, +// offset: 0, +// buffers: [ +// 0: [0b00000101] +// ], +// child_data: [ +// 0: { +// type: fixed_size_list, +// length: 9, +// null_count: 0, +// offset: 0, +// buffers: [0: NULL], +// child_data: [ +// 0: { +// type: int32, +// length: 18, +// null_count: 0, +// offset: 0, +// buffers: [ +// 0: NULL, +// 1: [ 1, 2, 3, 4, 5, 6, +// 0, 0, 0, 0, 0, 0 +// 7, 8, 9, 10, 11, 12 ] +// ], +// child_data: [] +// } +// ] +// } +// ] +// } +// +// This layout fits the fixed-width like definition because the innermost type +// is byte-aligned fixed-width (int32 = 4 bytes) and the internal arrays don't +// have nulls. The validity bitmap is only needed at the top-level array. +// +// Writing to this array can be done in the same way writing to a flat fixed-width +// array is done, by: +// 1. Updating the validity bitmap at the top-level array if nulls are present. +// 2. Updating a continuous fixed-width block of memory through a single pointer. +// +// The length of this block of memory is the product of the list sizes in the +// `FixedSizeList` types and the byte width of the innermost fixed-width type: +// +// 3 * 2 * 4 = 24 bytes +// +// Writing the `[[1, 2], [3, 4], [5, 6]]` value at a given index can be done by +// simply setting the validity bit to 1 and writing the 24-byte sequence of +// integers `[1, 2, 3, 4, 5, 6]` to the memory block at `byte_ptr + index * 24`. +// +// The length of the top-level array fully defines the lengths that all the nested +// arrays must have, which makes defining all the lengths as easy as defining the +// length of the top-level array. +// +// length = 3 +// child_data[0].length == 3 * 3 == 9 +// child_data[0].child_data[0].length == 3 * 3 * 2 == 18 +// +// child_data[0].child_data[0].buffers[1].size() >= +// (3 * (3 * 2 * sizeof(int32)) == 3 * 24 == 72) +// +// Dealing with offsets is a bit involved. Let's say the array described above has +// the offsets 2, 5, and 7: +// +// { +// type: fixed_size_list, 3>, +// offset: 2, +// ... +// child_data: [ +// 0: { +// type: fixed_size_list, +// offset: 5, +// ... +// child_data: [ +// 0: { +// type: int32, +// offset: 7, +// buffers: [ +// 0: NULL, +// 1: [ 1, 1, 1, 1, 1, 1, 1, // 7 values skipped +// 0,1, 0,1, 0,1, 0,1, 0,1, // 5 [x,x] values skipped +// +// 0,0,0,0,0,1, // +// 0,0,0,0,0,1, // 2 [[x,x], [x,x], [x,x]] values skipped +// +// 1, 2, 3, 4, 5, 6, // +// 0, 0, 0, 0, 0, 0 // the actual values +// 7, 8, 9, 10, 11, 12 // +// ] +// ], +// } +// ] +// } +// ] +// } +// +// The offset of the innermost values buffer, in bytes, is calculated as: +// +// ((2 * 3) + (5 * 2) + 7) * sizeof(int32) = 29 * 4 bytes = 116 bytes +// +// In general, the formula to calculate the offset of the innermost values buffer is: +// +// ((off_0 * fsl_size_0) + (off_1 * fsl_size_1) + ... + innermost_off) +// * sizeof(innermost_type) +// +// `OffsetPointerOfFixedByteWidthValues()` can calculate this byte offset and return +// the pointer to the first relevant byte of the innermost values buffer. 
/// \brief Checks if the given array has a fixed-width type or if it's an array of /// fixed-size list that can be flattened to an array of fixed-width values. /// -/// This function is a more general version of -/// `IsFixedWidthLike(const ArraySpan&, bool)` that allows the caller to further -/// restrict the inner value types that should be considered fixed-width. -/// /// \param source The array to check /// \param force_null_count If true, GetNullCount() is used instead of null_count /// \param extra_predicate A DataType predicate that can be used to further @@ -217,9 +211,7 @@ inline bool IsFixedWidthLike(const ArraySpan& source, bool force_null_count, values = &values->child_data[0]; continue; } - // BOOL has to be excluded because it's not byte-aligned. - return type->id() != Type::BOOL && is_fixed_width(type->id()) && - extra_predicate(*type); + return is_fixed_width(type->id()) && extra_predicate(*type); } } return false; @@ -251,6 +243,10 @@ ARROW_EXPORT int64_t FixedWidthInBytes(const DataType& type); /// \brief Get the fixed-width in bits of a type if it is a fixed-width like /// type. /// +/// If the array is a FixedSizeList (of any level of nesting), the bit width of +/// the values is the product of all fixed-list sizes and the bit width of the +/// innermost fixed-width value type. +/// /// \return The bit-width of the values or -1 /// \see FixedWidthInBytes ARROW_EXPORT int64_t FixedWidthInBits(const DataType& type); @@ -260,7 +256,7 @@ namespace internal { /// \brief Allocate an ArrayData for a type that is fixed-width like. /// /// This function performs the same checks performed by -/// `IsFixedWidthLike(source, false)`. If `source.type` is not a simple +/// `IsFixedWidthLike(source, false, false)`. If `source.type` is not a simple /// fixed-width type, caller should make sure it passes the /// `IsFixedWidthLike(source)` checks. That guarantees that it's possible to /// allocate an array that can serve as a destination for a kernel that writes values @@ -280,18 +276,24 @@ ARROW_EXPORT Status PreallocateFixedWidthArrayData(::arrow::compute::KernelConte } // namespace internal -/// \brief Get the pointer to the fixed-width values of a fixed-width like array. +/// \brief Get the 0-7 residual offset in bits and the pointer to the fixed-width +/// values of a fixed-width like array. /// -/// This function might return NULLPTR if the type of the array is BOOL or -/// if the pre-conditions listed are not satisfied. The converse is not true -/// (i.e. not getting NULLPTR doesn't guarantee that source is a fixed-width -/// like array). +/// For byte-aligned types, the offset is always 0. /// /// \pre `IsFixedWidthLike(source)` or the more restrictive /// is_fixed_width(*mutable_array->type) SHOULD be true -/// \return The pointer to the fixed-width values of an array or NULLPTR -/// if pre-conditions are not satisfied. -ARROW_EXPORT const uint8_t* OffsetPointerOfFixedWidthValues(const ArraySpan& source); +/// \return A pair with the residual offset in bits (0-7) and the pointer +/// to the fixed-width values. +ARROW_EXPORT std::pair OffsetPointerOfFixedBitWidthValues( + const ArraySpan& source); + +/// \brief Get the pointer to the fixed-width values of a fixed-width like array. +/// +/// \pre `IsFixedWidthLike(source)` should be true and BOOL should be excluded +/// as each bool is 1-bit width making it impossible to produce a +/// byte-aligned pointer to the values in the general case. 
+ARROW_EXPORT const uint8_t* OffsetPointerOfFixedByteWidthValues(const ArraySpan& source); /// \brief Get the mutable pointer to the fixed-width values of an array /// allocated by PreallocateFixedWidthArrayData. diff --git a/cpp/src/arrow/util/fixed_width_test.cc b/cpp/src/arrow/util/fixed_width_test.cc index 2f05221ed6535..3b35de1b6bbeb 100644 --- a/cpp/src/arrow/util/fixed_width_test.cc +++ b/cpp/src/arrow/util/fixed_width_test.cc @@ -80,10 +80,7 @@ TEST_F(TestFixedWidth, IsFixedWidth) { TEST_F(TestFixedWidth, IsFixedWidthLike) { auto arr = ArraySpan{*fsl_bool_array_->data()}; - // bools wrapped by fixed-size-list are not fixed-width because the - // innermost data buffer is a bitmap and won't byte-align. - ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/false)); - ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/true)); + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false)); arr = ArraySpan{*fsl_int_array_->data()}; ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false)); @@ -114,12 +111,12 @@ TEST_F(TestFixedWidth, IsFixedWidthLike) { arr = ArraySpan{*dict_string_array_->data()}; // Dictionaries are considered fixed-width by is_fixed_width(), but excluded - // by IsFixedWidthLike if exclude_dictionary=true. + // by IsFixedWidthLike if exclude_bool_and_dictionary=true. ASSERT_TRUE(IsFixedWidthLike(arr)); - ASSERT_TRUE( - IsFixedWidthLike(arr, /*force_null_count=*/false, /*exclude_dictionary=*/false)); - ASSERT_FALSE( - IsFixedWidthLike(arr, /*force_null_count=*/false, /*exclude_dictionary=*/true)); + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false, + /*exclude_bool_and_dictionary=*/false)); + ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/false, + /*exclude_bool_and_dictionary=*/true)); } TEST_F(TestFixedWidth, MeasureWidthInBytes) { @@ -184,9 +181,9 @@ TEST_F(TestFixedWidth, MeasureWidthInBits) { ASSERT_EQ(FixedWidthInBits(*varlen), -1); ASSERT_EQ(FixedWidthInBits(*varlen), -1); - ASSERT_EQ(FixedWidthInBits(*fsl(0, b)), -1); - ASSERT_EQ(FixedWidthInBits(*fsl(3, b)), -1); - ASSERT_EQ(FixedWidthInBits(*fsl(5, b)), -1); + ASSERT_EQ(FixedWidthInBits(*fsl(0, b)), 0); + ASSERT_EQ(FixedWidthInBits(*fsl(3, b)), 3); + ASSERT_EQ(FixedWidthInBits(*fsl(5, b)), 5); ASSERT_EQ(FixedWidthInBits(*fsl(0, i8)), 0); ASSERT_EQ(FixedWidthInBits(*fsl(3, i8)), 3 * 8); From 2ca9ad2861387a08244427eb1a2457c32a8ed31a Mon Sep 17 00:00:00 2001 From: Sarah Gilmore <74676073+sgilmore10@users.noreply.github.com> Date: Wed, 15 May 2024 10:00:48 -0400 Subject: [PATCH 095/105] GH-41653: [MATLAB] Add new `arrow.c.Array` MATLAB class which wraps a C Data Interface format `ArrowArray` C struct (#41655) ### Rationale for this change Now that the MATLAB interface has support for `arrow.tabular.RecordBatch` and `arrow.array.Array`, we should add support for the [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) format. The C Data Interface is based around two C struct definitions: (1) `ArrowArray` and (2) `ArrowSchema`. We should start by adding a new MATLAB class (e.g. `arrow.c.Array`) which wraps the underlying `ArrowArray` C struct. Later, we can add another new MATLAB class (e.g. `arrow.c.Schema`) which wraps the `ArrowSchema` C struct. Once we have added these two MATLAB classes, we can then add import and export functionality to share the Arrow memory between multiple language runtimes running in the same process. 
This would help enable workflows like sharing Arrow data between the MATLAB Interface to Arrow and `pyarrow` running within the MATLAB process via the [MATLAB interface to Python](https://www.mathworks.com/help/matlab/call-python-libraries.html).

### What changes are included in this PR?

1. Added a new C++ proxy class called `arrow::matlab::c::proxy::Array` which wraps an `ArrowArray` `struct` pointer. This class is registered as the proxy `arrow.c.proxy.Array` in order to make it accessible to MATLAB.
2. Added a new MATLAB class called `arrow.c.Array` that has an `arrow.c.proxy.Array` instance. It has one public property named `Address`, which is a scalar `uint64`. This property is the memory address of the `ArrowArray` `struct` pointer owned by `arrow.c.proxy.Array`.

### Are these changes tested?

Yes.
1. Added a new test class called `test/arrow/c/tArray.m`.
2. @ kevingurney and I created a prototype for importing and exporting arrow `Array`s via the C Data Interface format [here](https://github.com/mathworks/arrow/tree/arrow-array-address). We were able to share arrow `Array`s and `RecordBatch`es between mlarrow and pyarrow. Our plan now is to submit the necessary MATLAB code incrementally.

### Are there any user-facing changes?

Yes. The `arrow.c.Array` class is user-facing. However, it's only intended for "advanced" use-cases. In the future, we may add higher-level functionality on top of the C Data Interface so that users don't need to interact with it directly.

**NOTE:** On destruction, `arrow.c.proxy.Array` will check to see if the `ArrowArray` has already been consumed by an importer. If not, `arrow.c.proxy.Array`'s destructor will call the `release` callback on the `ArrowArray` to avoid memory leaks. To the best of our knowledge, this is similar to how the [Arrow PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html) works.

### Future Directions

1. #41654
2. #41656

* GitHub Issue: #41653

Lead-authored-by: Sarah Gilmore
Co-authored-by: Sarah Gilmore <74676073+sgilmore10@users.noreply.github.com>
Co-authored-by: Kevin Gurney
Signed-off-by: Sarah Gilmore
---
 matlab/src/cpp/arrow/matlab/c/proxy/array.cc | 49 +++++++++++++++++++
 matlab/src/cpp/arrow/matlab/c/proxy/array.h | 41 ++++++++++++++++
 matlab/src/cpp/arrow/matlab/proxy/factory.cc | 2 +
 matlab/src/matlab/+arrow/+c/Array.m | 37 ++++++++++++++
 matlab/test/arrow/c/tArray.m | 48 ++++++++++++++++++
 .../cmake/BuildMatlabArrowInterface.cmake | 3 +-
 6 files changed, 179 insertions(+), 1 deletion(-)
 create mode 100644 matlab/src/cpp/arrow/matlab/c/proxy/array.cc
 create mode 100644 matlab/src/cpp/arrow/matlab/c/proxy/array.h
 create mode 100644 matlab/src/matlab/+arrow/+c/Array.m
 create mode 100644 matlab/test/arrow/c/tArray.m
diff --git a/matlab/src/cpp/arrow/matlab/c/proxy/array.cc b/matlab/src/cpp/arrow/matlab/c/proxy/array.cc
new file mode 100644
index 0000000000000..a5f3418f1bcfa
--- /dev/null
+++ b/matlab/src/cpp/arrow/matlab/c/proxy/array.cc
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include "arrow/c/abi.h" + +#include "arrow/matlab/c/proxy/array.h" + +#include "libmexclass/proxy/Proxy.h" + +namespace arrow::matlab::c::proxy { + +Array::Array() : arrowArray{} { REGISTER_METHOD(Array, getAddress); } + +Array::~Array() { + if (arrowArray.release != NULL) { + arrowArray.release(&arrowArray); + arrowArray.release = NULL; + } +} + +libmexclass::proxy::MakeResult Array::make( + const libmexclass::proxy::FunctionArguments& constructor_arguments) { + return std::make_shared(); +} + +void Array::getAddress(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + + mda::ArrayFactory factory; + auto address = reinterpret_cast(&arrowArray); + context.outputs[0] = factory.createScalar(address); +} + +} // namespace arrow::matlab::c::proxy \ No newline at end of file diff --git a/matlab/src/cpp/arrow/matlab/c/proxy/array.h b/matlab/src/cpp/arrow/matlab/c/proxy/array.h new file mode 100644 index 0000000000000..b42b2dcd9cfa8 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/c/proxy/array.h @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/c/abi.h" + +#include "libmexclass/proxy/Proxy.h" + +namespace arrow::matlab::c::proxy { + +class Array : public libmexclass::proxy::Proxy { + public: + Array(); + + ~Array(); + + static libmexclass::proxy::MakeResult make( + const libmexclass::proxy::FunctionArguments& constructor_arguments); + + protected: + void getAddress(libmexclass::proxy::method::Context& context); + + struct ArrowArray arrowArray; + + // struct ArrowArray* arrowArray; +}; + +} // namespace arrow::matlab::c::proxy \ No newline at end of file diff --git a/matlab/src/cpp/arrow/matlab/proxy/factory.cc b/matlab/src/cpp/arrow/matlab/proxy/factory.cc index 23492f75deacc..cf13ed6aa57fa 100644 --- a/matlab/src/cpp/arrow/matlab/proxy/factory.cc +++ b/matlab/src/cpp/arrow/matlab/proxy/factory.cc @@ -25,6 +25,7 @@ #include "arrow/matlab/array/proxy/time64_array.h" #include "arrow/matlab/array/proxy/timestamp_array.h" #include "arrow/matlab/buffer/proxy/buffer.h" +#include "arrow/matlab/c/proxy/array.h" #include "arrow/matlab/error/error.h" #include "arrow/matlab/io/csv/proxy/table_reader.h" #include "arrow/matlab/io/csv/proxy/table_writer.h" @@ -99,6 +100,7 @@ libmexclass::proxy::MakeResult Factory::make_proxy( REGISTER_PROXY(arrow.io.feather.proxy.Reader , arrow::matlab::io::feather::proxy::Reader); REGISTER_PROXY(arrow.io.csv.proxy.TableWriter , arrow::matlab::io::csv::proxy::TableWriter); REGISTER_PROXY(arrow.io.csv.proxy.TableReader , arrow::matlab::io::csv::proxy::TableReader); + REGISTER_PROXY(arrow.c.proxy.Array , arrow::matlab::c::proxy::Array); // clang-format on return libmexclass::error::Error{error::UNKNOWN_PROXY_ERROR_ID, diff --git a/matlab/src/matlab/+arrow/+c/Array.m b/matlab/src/matlab/+arrow/+c/Array.m new file mode 100644 index 0000000000000..574fca9afebd8 --- /dev/null +++ b/matlab/src/matlab/+arrow/+c/Array.m @@ -0,0 +1,37 @@ +%ARRAY Wrapper for an Arrow C Data Interface format ArrowArray C struct pointer. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. +classdef Array < matlab.mixin.Scalar + + properties (Hidden, SetAccess=private, GetAccess=public) + Proxy + end + + properties(Dependent, GetAccess=public, SetAccess=private) + Address(1, 1) uint64 + end + + methods + function obj = Array() + proxyName = "arrow.c.proxy.Array"; + obj.Proxy = arrow.internal.proxy.create(proxyName); + end + + function address = get.Address(obj) + address = obj.Proxy.getAddress(); + end + end +end \ No newline at end of file diff --git a/matlab/test/arrow/c/tArray.m b/matlab/test/arrow/c/tArray.m new file mode 100644 index 0000000000000..f8caf48065114 --- /dev/null +++ b/matlab/test/arrow/c/tArray.m @@ -0,0 +1,48 @@ +%TARRAY Defines unit tests for arrow.c.Array. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. 
See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. +classdef tArray < matlab.unittest.TestCase + + methods (Test) + function TestClassStructure(testCase) + array = arrow.c.Array(); + + % Verify array is an instance of arrow.c.Array. + testCase.verifyInstanceOf(array, "arrow.c.Array"); + + % Verify array has one public property named Address. + props = properties(array); + testCase.verifyEqual(props, {'Address'}); + end + + function TestAddressProperty(testCase) + array = arrow.c.Array(); + + % It's impossible to know what the value of Address will be. + % Just verify Address is a scalar uint64. + address = array.Address; + testCase.verifyInstanceOf(address, "uint64"); + testCase.verifyTrue(isscalar(address)); + end + + function TestAddressNoSetter(testCase) + % Verify the Address property is read-only. + array = arrow.c.Array(); + fcn = @() setfield(array, "Address", uint64(10)); + testCase.verifyError(fcn, "MATLAB:class:SetProhibited"); + end + end +end \ No newline at end of file diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake index e1641842ca8b9..7a8cf8f40358b 100644 --- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake +++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake @@ -75,7 +75,8 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/a "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/index/validate.cc" - "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/buffer/proxy/buffer.cc") + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/buffer/proxy/buffer.cc" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/c/proxy/array.cc") set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy") From 82045527b775d3847d3f34ebb51af852c76a2e44 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Wed, 15 May 2024 10:49:29 -0400 Subject: [PATCH 096/105] GH-41654: [MATLAB] Add new `arrow.c.Schema` MATLAB class which wraps a C Data Interface format `ArrowSchema` C struct (#41674) ### Rationale for this change Now that the MATLAB interface has support for `arrow.tabular.RecordBatch` and `arrow.array.Array`, we should add support for the [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) format. The C Data Interface is based around two C struct definitions: (1) `ArrowArray` and (2) `ArrowSchema`. Now that #41653 (add support for `arrow.c.Array`) has been addressed, we should add another new MATLAB class (e.g. `arrow.c.Schema`) which wraps the underlying `ArrowSchema` C struct. Once we have added these two MATLAB classes, we can then add import and export functionality to share the Arrow memory between multiple language runtimes running in the same process. 
This would help enable workflows like sharing Arrow data between the MATLAB Interface to Arrow and `pyarrow` running within the MATLAB process via the [MATLAB interface to Python](https://www.mathworks.com/help/matlab/call-python-libraries.html).

### What changes are included in this PR?

1. Added a new C++ proxy class called `arrow::matlab::c::proxy::Schema` which wraps an `ArrowSchema` struct pointer. This class is registered as the proxy `arrow.c.proxy.Schema` in order to make it accessible to MATLAB.
2. Added a new MATLAB class called `arrow.c.Schema` that has an `arrow.c.proxy.Schema` instance. It has one public property named `Address`, which is a scalar `uint64`. This property is the memory address of the `ArrowSchema` struct pointer owned by `arrow.c.proxy.Schema`.

### Are these changes tested?

Yes.
1. Added a new test class called `test/arrow/c/tSchema.m`.
2. @ sgilmore10 and I created a prototype for importing and exporting arrow `Array`s via the C Data Interface format [here](https://github.com/mathworks/arrow/tree/arrow-array-address). We were able to share arrow `Array`s and `RecordBatch`es between `mlarrow` and `pyarrow`. Our plan now is to submit the necessary MATLAB code incrementally.

### Are there any user-facing changes?

Yes.
1. The `arrow.c.Schema` class is user-facing. However, it's only intended for "advanced" use-cases. In the future, we may add higher-level functionality on top of the C Data Interface so that users don't need to interact with it directly.
2. **NOTE**: On destruction, `arrow.c.proxy.Schema` will check to see if the `ArrowSchema` has already been consumed by an importer. If not, `arrow.c.proxy.Schema`'s destructor will call the release callback on the `ArrowSchema` to avoid memory leaks. To the best of our knowledge, this is similar to how the [Arrow PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html) works.

### Future Directions

1. #41656
2. We should probably follow up with a PR to create shared infrastructure for `arrow.c.Array` and `arrow.c.Schema`, since they are almost identical in design and implementation.

### Notes

1. Thank you @ sgilmore10 for your help with this pull request!
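As a rough illustration of the cross-runtime workflow described above, the sketch below shows how another Arrow C++ consumer running in the same process could import from the `Address` values exposed by `arrow.c.Array` and `arrow.c.Schema`. This is an editorial sketch, not part of the patch: it assumes the Arrow C++ C Data Interface bridge (`arrow/c/bridge.h`), and the wrapper function name and the idea of passing the addresses around as `uint64` values are illustrative assumptions only.

```cpp
// Editorial sketch, not part of this patch: importing from the raw addresses
// exposed by arrow.c.Array / arrow.c.Schema using the Arrow C++ bridge.
#include <cstdint>
#include <memory>

#include "arrow/array.h"
#include "arrow/c/abi.h"
#include "arrow/c/bridge.h"
#include "arrow/result.h"

// Hypothetical helper; the name and signature are invented for illustration.
arrow::Result<std::shared_ptr<arrow::Array>> ImportFromAddresses(
    uint64_t schema_address, uint64_t array_address) {
  auto* c_schema = reinterpret_cast<struct ArrowSchema*>(schema_address);
  auto* c_array = reinterpret_cast<struct ArrowArray*>(array_address);
  // Importing moves ownership out of the C structs and marks them released,
  // so the MATLAB proxies' destructors will not call release() a second time.
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::DataType> type,
                        arrow::ImportType(c_schema));
  return arrow::ImportArray(c_array, std::move(type));
}
```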
* GitHub Issue: #41654 Authored-by: Kevin Gurney Signed-off-by: Kevin Gurney --- matlab/src/cpp/arrow/matlab/c/proxy/array.h | 4 +- matlab/src/cpp/arrow/matlab/c/proxy/schema.cc | 49 +++++++++++++++++++ matlab/src/cpp/arrow/matlab/c/proxy/schema.h | 39 +++++++++++++++ matlab/src/cpp/arrow/matlab/proxy/factory.cc | 2 + matlab/src/matlab/+arrow/+c/Schema.m | 37 ++++++++++++++ matlab/test/arrow/c/tSchema.m | 48 ++++++++++++++++++ .../cmake/BuildMatlabArrowInterface.cmake | 3 +- 7 files changed, 178 insertions(+), 4 deletions(-) create mode 100644 matlab/src/cpp/arrow/matlab/c/proxy/schema.cc create mode 100644 matlab/src/cpp/arrow/matlab/c/proxy/schema.h create mode 100644 matlab/src/matlab/+arrow/+c/Schema.m create mode 100644 matlab/test/arrow/c/tSchema.m diff --git a/matlab/src/cpp/arrow/matlab/c/proxy/array.h b/matlab/src/cpp/arrow/matlab/c/proxy/array.h index b42b2dcd9cfa8..bb35807fcd015 100644 --- a/matlab/src/cpp/arrow/matlab/c/proxy/array.h +++ b/matlab/src/cpp/arrow/matlab/c/proxy/array.h @@ -34,8 +34,6 @@ class Array : public libmexclass::proxy::Proxy { void getAddress(libmexclass::proxy::method::Context& context); struct ArrowArray arrowArray; - - // struct ArrowArray* arrowArray; }; -} // namespace arrow::matlab::c::proxy \ No newline at end of file +} // namespace arrow::matlab::c::proxy diff --git a/matlab/src/cpp/arrow/matlab/c/proxy/schema.cc b/matlab/src/cpp/arrow/matlab/c/proxy/schema.cc new file mode 100644 index 0000000000000..7f239f5628720 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/c/proxy/schema.cc @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include "arrow/c/abi.h" + +#include "arrow/matlab/c/proxy/schema.h" + +#include "libmexclass/proxy/Proxy.h" + +namespace arrow::matlab::c::proxy { + +Schema::Schema() : arrowSchema{} { REGISTER_METHOD(Schema, getAddress); } + +Schema::~Schema() { + if (arrowSchema.release != NULL) { + arrowSchema.release(&arrowSchema); + arrowSchema.release = NULL; + } +} + +libmexclass::proxy::MakeResult Schema::make( + const libmexclass::proxy::FunctionArguments& constructor_arguments) { + return std::make_shared(); +} + +void Schema::getAddress(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + + mda::ArrayFactory factory; + auto address = reinterpret_cast(&arrowSchema); + context.outputs[0] = factory.createScalar(address); +} + +} // namespace arrow::matlab::c::proxy diff --git a/matlab/src/cpp/arrow/matlab/c/proxy/schema.h b/matlab/src/cpp/arrow/matlab/c/proxy/schema.h new file mode 100644 index 0000000000000..8f781ea9c7341 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/c/proxy/schema.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/c/abi.h" + +#include "libmexclass/proxy/Proxy.h" + +namespace arrow::matlab::c::proxy { + +class Schema : public libmexclass::proxy::Proxy { + public: + Schema(); + + ~Schema(); + + static libmexclass::proxy::MakeResult make( + const libmexclass::proxy::FunctionArguments& constructor_arguments); + + protected: + void getAddress(libmexclass::proxy::method::Context& context); + + struct ArrowSchema arrowSchema; +}; + +} // namespace arrow::matlab::c::proxy diff --git a/matlab/src/cpp/arrow/matlab/proxy/factory.cc b/matlab/src/cpp/arrow/matlab/proxy/factory.cc index cf13ed6aa57fa..d7a8fa9ac2e74 100644 --- a/matlab/src/cpp/arrow/matlab/proxy/factory.cc +++ b/matlab/src/cpp/arrow/matlab/proxy/factory.cc @@ -26,6 +26,7 @@ #include "arrow/matlab/array/proxy/timestamp_array.h" #include "arrow/matlab/buffer/proxy/buffer.h" #include "arrow/matlab/c/proxy/array.h" +#include "arrow/matlab/c/proxy/schema.h" #include "arrow/matlab/error/error.h" #include "arrow/matlab/io/csv/proxy/table_reader.h" #include "arrow/matlab/io/csv/proxy/table_writer.h" @@ -101,6 +102,7 @@ libmexclass::proxy::MakeResult Factory::make_proxy( REGISTER_PROXY(arrow.io.csv.proxy.TableWriter , arrow::matlab::io::csv::proxy::TableWriter); REGISTER_PROXY(arrow.io.csv.proxy.TableReader , arrow::matlab::io::csv::proxy::TableReader); REGISTER_PROXY(arrow.c.proxy.Array , arrow::matlab::c::proxy::Array); + REGISTER_PROXY(arrow.c.proxy.Schema , arrow::matlab::c::proxy::Schema); // clang-format on return libmexclass::error::Error{error::UNKNOWN_PROXY_ERROR_ID, diff --git a/matlab/src/matlab/+arrow/+c/Schema.m b/matlab/src/matlab/+arrow/+c/Schema.m new file mode 100644 index 0000000000000..29eba59016044 --- /dev/null +++ b/matlab/src/matlab/+arrow/+c/Schema.m @@ -0,0 +1,37 @@ +%SCHEMA Wrapper for an Arrow C Data Interface format ArrowSchema C struct pointer. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. 
+classdef Schema < matlab.mixin.Scalar + + properties (Hidden, SetAccess=private, GetAccess=public) + Proxy + end + + properties(Dependent, GetAccess=public, SetAccess=private) + Address(1, 1) uint64 + end + + methods + function obj = Schema() + proxyName = "arrow.c.proxy.Schema"; + obj.Proxy = arrow.internal.proxy.create(proxyName); + end + + function address = get.Address(obj) + address = obj.Proxy.getAddress(); + end + end +end \ No newline at end of file diff --git a/matlab/test/arrow/c/tSchema.m b/matlab/test/arrow/c/tSchema.m new file mode 100644 index 0000000000000..16dcf1965b463 --- /dev/null +++ b/matlab/test/arrow/c/tSchema.m @@ -0,0 +1,48 @@ +%TSCHEMA Defines unit tests for arrow.c.Schema. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. +classdef tSchema < matlab.unittest.TestCase + + methods (Test) + function TestClassStructure(testCase) + schema = arrow.c.Schema(); + + % Verify schema is an instance of arrow.c.Schema. + testCase.verifyInstanceOf(schema, "arrow.c.Schema"); + + % Verify schema has one public property named Address. + props = properties(schema); + testCase.verifyEqual(props, {'Address'}); + end + + function TestAddressProperty(testCase) + schema = arrow.c.Schema(); + + % It's impossible to know what the value of Address will be. + % Just verify Address is a scalar uint64. + address = schema.Address; + testCase.verifyInstanceOf(address, "uint64"); + testCase.verifyTrue(isscalar(address)); + end + + function TestAddressNoSetter(testCase) + % Verify the Address property is read-only. 
+ schema = arrow.c.Schema(); + fcn = @() setfield(schema, "Address", uint64(10)); + testCase.verifyError(fcn, "MATLAB:class:SetProhibited"); + end + end +end \ No newline at end of file diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake index 7a8cf8f40358b..8f37bef77b859 100644 --- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake +++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake @@ -76,7 +76,8 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/a "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/index/validate.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/buffer/proxy/buffer.cc" - "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/c/proxy/array.cc") + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/c/proxy/array.cc" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/c/proxy/schema.cc") set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy") From edd62f75326c86edc22e705e13b0674acd7cc1c1 Mon Sep 17 00:00:00 2001 From: h-vetinari Date: Thu, 16 May 2024 02:07:45 +1100 Subject: [PATCH 097/105] GH-41581: [C++][CMake] correctly use Protobuf_PROTOC_EXECUTABLE (#41582) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #41581 * GitHub Issue: #41581 Lead-authored-by: H. Vetinari Co-authored-by: Raúl Cumplido Signed-off-by: Raúl Cumplido --- cpp/cmake_modules/FindProtobufAlt.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/cmake_modules/FindProtobufAlt.cmake b/cpp/cmake_modules/FindProtobufAlt.cmake index f343b42f2b762..703e05c4731b6 100644 --- a/cpp/cmake_modules/FindProtobufAlt.cmake +++ b/cpp/cmake_modules/FindProtobufAlt.cmake @@ -31,6 +31,11 @@ endif() find_package(protobuf CONFIG ${find_package_args}) set(ProtobufAlt_FOUND ${protobuf_FOUND}) if(ProtobufAlt_FOUND) + if(Protobuf_PROTOC_EXECUTABLE) + # work around https://github.com/protocolbuffers/protobuf/issues/14576 + set_target_properties(protobuf::protoc PROPERTIES IMPORTED_LOCATION_RELEASE + "${Protobuf_PROTOC_EXECUTABLE}") + endif() set(ProtobufAlt_VERSION ${protobuf_VERSION}) set(ProtobufAlt_VERSION_MAJOR ${protobuf_VERSION_MAJOR}) set(ProtobufAlt_VERSION_MINOR ${protobuf_VERSION_MINOR}) From 63fddd7b2f12fb65ed5feff820a1913931773968 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 16 May 2024 00:54:12 +0900 Subject: [PATCH 098/105] GH-41660: [CI][Java] Restore devtoolset relatead GANDIVA_CXX_FLAGS (#41661) ### Rationale for this change Because #41451 removed devtoolset related flags unexpectedly. ### What changes are included in this PR? Restore devtoolset related flags. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
* GitHub Issue: #41660 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- ci/scripts/java_jni_manylinux_build.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/scripts/java_jni_manylinux_build.sh b/ci/scripts/java_jni_manylinux_build.sh index 4921ce170b7a9..6f3769751af42 100755 --- a/ci/scripts/java_jni_manylinux_build.sh +++ b/ci/scripts/java_jni_manylinux_build.sh @@ -35,6 +35,9 @@ echo "=== Clear output directories and leftovers ===" rm -rf ${build_dir} echo "=== Building Arrow C++ libraries ===" +devtoolset_version=$(rpm -qa "devtoolset-*-gcc" --queryformat %{VERSION} | \ + grep -o "^[0-9]*") +devtoolset_include_cpp="/opt/rh/devtoolset-${devtoolset_version}/root/usr/include/c++/${devtoolset_version}" : ${ARROW_ACERO:=ON} export ARROW_ACERO : ${ARROW_BUILD_TESTS:=ON} @@ -55,7 +58,7 @@ export ARROW_ORC : ${VCPKG_ROOT:=/opt/vcpkg} : ${VCPKG_FEATURE_FLAGS:=-manifests} : ${VCPKG_TARGET_TRIPLET:=${VCPKG_DEFAULT_TRIPLET:-x64-linux-static-${CMAKE_BUILD_TYPE}}} -: ${GANDIVA_CXX_FLAGS:=-isystem;${devtoolset_include_cpp};-isystem;${devtoolset_include_cpp}/x86_64-redhat-linux;-isystem;-lpthread} +: ${GANDIVA_CXX_FLAGS:=-isystem;${devtoolset_include_cpp};-isystem;${devtoolset_include_cpp}/x86_64-redhat-linux;-lpthread} if [ "${ARROW_USE_CCACHE}" == "ON" ]; then echo "=== ccache statistics before build ===" From e1de9c52d5a60b2e2a314b8589170467fe36415d Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 15 May 2024 12:35:04 -0400 Subject: [PATCH 099/105] GH-41541: [Go][Parquet] Fix writer performance regression (#41638) ### Rationale for this change A performance regression was reported for the parquet writer since v14. Profiling revealed excessive allocations. This was due to us always adding the current offset to the current capacity when reserving, resulting in Reserve always performing a reallocate even when it didn't need to. ### What changes are included in this PR? `PooledBufferWriter` should only pass `nbytes` to the `Reserve` call, not `byteoffset + nbytes`. `BitWriter` should not be adding `b.offset` to the capacity when determining the new capacity. ### Are these changes tested? Yes. ### Are there any user-facing changes? 
No, only performance changes: Before: ```shell goos: linux goarch: amd64 pkg: github.com/apache/arrow/go/v17/parquet/pqarrow cpu: 12th Gen Intel(R) Core(TM) i7-12700H BenchmarkWriteColumn/int32_not_nullable-20 514 2127175 ns/op 1971.77 MB/s 5425676 B/op 239 allocs/op BenchmarkWriteColumn/int32_nullable-20 31 467352621 ns/op 8.97 MB/s 2210271923 B/op2350 allocs/op BenchmarkWriteColumn/int64_not_nullable-20 326 4132204 ns/op 2030.06 MB/s 5442976 B/op 265 allocs/op BenchmarkWriteColumn/int64_nullable-20 33 432764687 ns/op 19.38 MB/s 2100068812 B/op2384 allocs/op BenchmarkWriteColumn/float32_not_nullable-20 334 3540566 ns/op 1184.64 MB/s 5453079 B/op 1263 allocs/op BenchmarkWriteColumn/float32_nullable-20 6 492103646 ns/op 8.52 MB/s 2283305841 B/op3371 allocs/op BenchmarkWriteColumn/float64_not_nullable-20 241 4783268 ns/op 1753.74 MB/s 5498759 B/op 1292 allocs/op BenchmarkWriteColumn/float64_nullable-20 4 369619096 ns/op 22.70 MB/s 1725354454 B/op3401 allocs/op PASS ok github.com/apache/arrow/go/v17/parquet/pqarrow 40.862s ``` After: ```shell goos: linux goarch: amd64 pkg: github.com/apache/arrow/go/v17/parquet/pqarrow cpu: 12th Gen Intel(R) Core(TM) i7-12700H BenchmarkWriteColumn/int32_not_nullable-20 500 2136823 ns/op 1962.87 MB/s 5410591 B/op 240 allocs/op BenchmarkWriteColumn/int32_nullable-20 48 26604880 ns/op 157.65 MB/s 12053510 B/op 250 allocs/op BenchmarkWriteColumn/int64_not_nullable-20 340 3530509 ns/op 2376.03 MB/s 5439578 B/op 265 allocs/op BenchmarkWriteColumn/int64_nullable-20 44 27387334 ns/op 306.30 MB/s 11870305 B/op 260 allocs/op BenchmarkWriteColumn/float32_not_nullable-20 316 3479312 ns/op 1205.50 MB/s 5456685 B/op 1263 allocs/op BenchmarkWriteColumn/float32_nullable-20 50 25910872 ns/op 161.87 MB/s 12054582 B/op 1271 allocs/op BenchmarkWriteColumn/float64_not_nullable-20 249 4769664 ns/op 1758.74 MB/s 5486020 B/op 1292 allocs/op BenchmarkWriteColumn/float64_nullable-20 51 25496256 ns/op 329.01 MB/s 12140753 B/op 1284 allocs/op PASS ok github.com/apache/arrow/go/v17/parquet/pqarrow 11.492s ``` All of the nullable column cases average around a 16x-17x performance improvement. * GitHub Issue: #41541 Authored-by: Matt Topol Signed-off-by: Matt Topol --- go/parquet/internal/encoding/types.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/parquet/internal/encoding/types.go b/go/parquet/internal/encoding/types.go index 51f48c797488f..147c1746c515a 100644 --- a/go/parquet/internal/encoding/types.go +++ b/go/parquet/internal/encoding/types.go @@ -185,7 +185,7 @@ func (b *PooledBufferWriter) Reserve(nbytes int) { b.buf = bufferPool.Get().(*memory.Buffer) } - newCap := utils.Max(b.buf.Cap()+b.offset, 256) + newCap := utils.Max(b.buf.Cap(), 256) for newCap < b.pos+nbytes { newCap = bitutil.NextPowerOf2(newCap) } From e04f5b4b905cfc37b5eaeea2c34e51349ae562b9 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 15 May 2024 20:59:49 -0300 Subject: [PATCH 100/105] GH-41560: [C++] ChunkResolver: Implement ResolveMany and add unit tests (#41561) ### Rationale for this change I want `ResolveMany` to support me in the implementation of `Take` that doesn't `Concatenate` all the chunks from a `ChunkedArray` `values` parameter. ### What changes are included in this PR? - Implementation of `ChunkResolver::ResolveMany()` - Addition of missing unit tests for `ChunkResolver` ### Are these changes tested? Yes. By new unit tests. ### Are there any user-facing changes? No. 
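Although there are no user-facing changes, a minimal editorial sketch of how the new `ResolveMany` API can be called may help when reading the diff; it is not part of the patch, it reuses the chunk offsets from the unit tests below (`[[0, 1], [2], [3..9]]`), and the wrapper function is invented for illustration.

```cpp
// Editorial sketch: resolving a batch of logical indices into chunk indices
// and indices within each chunk with ChunkResolver::ResolveMany.
#include <cstdint>
#include <vector>

#include "arrow/chunk_resolver.h"

void ResolveManyExample() {
  // Offsets {0, 2, 3, 10} describe three chunks of lengths 2, 1 and 7.
  arrow::internal::ChunkResolver resolver(std::vector<int64_t>{0, 2, 3, 10});
  std::vector<uint32_t> logical = {0, 1, 2, 3, 9};
  std::vector<uint32_t> chunk_index(logical.size());
  std::vector<uint32_t> index_in_chunk(logical.size());
  const bool ok = resolver.ResolveMany<uint32_t>(
      static_cast<int64_t>(logical.size()), logical.data(), chunk_index.data(),
      /*chunk_hint=*/0, index_in_chunk.data());
  // ResolveMany only fails when the number of chunks does not fit in the
  // index type; here ok is true and the outputs are
  // chunk_index == {0, 0, 1, 2, 2} and index_in_chunk == {0, 1, 0, 0, 6}.
  (void)ok;
}
```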
`ChunkResolver` is an internal API at the moment (see #34535 for future plans). * GitHub Issue: #41560 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/chunk_resolver.cc | 80 +++++++- cpp/src/arrow/chunk_resolver.h | 128 +++++++++++- cpp/src/arrow/chunked_array_test.cc | 200 +++++++++++++++++++ cpp/src/arrow/compute/kernels/vector_sort.cc | 30 ++- 4 files changed, 407 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc index 29bccb52658f8..55eec53ced1c7 100644 --- a/cpp/src/arrow/chunk_resolver.cc +++ b/cpp/src/arrow/chunk_resolver.cc @@ -19,14 +19,14 @@ #include #include +#include #include #include #include "arrow/array.h" #include "arrow/record_batch.h" -namespace arrow { -namespace internal { +namespace arrow::internal { namespace { template @@ -54,6 +54,51 @@ inline std::vector MakeChunksOffsets(const std::vector& chunks) { offsets[chunks.size()] = offset; return offsets; } + +/// \pre all the pre-conditions of ChunkResolver::ResolveMany() +/// \pre num_offsets - 1 <= std::numeric_limits::max() +template +void ResolveManyInline(size_t num_offsets, const int64_t* signed_offsets, + int64_t n_indices, const IndexType* logical_index_vec, + IndexType* out_chunk_index_vec, IndexType chunk_hint, + IndexType* out_index_in_chunk_vec) { + auto* offsets = reinterpret_cast(signed_offsets); + const auto num_chunks = static_cast(num_offsets - 1); + // chunk_hint in [0, num_offsets) per the precondition. + for (int64_t i = 0; i < n_indices; i++) { + const auto index = static_cast(logical_index_vec[i]); + if (index >= offsets[chunk_hint] && + (chunk_hint == num_chunks || index < offsets[chunk_hint + 1])) { + out_chunk_index_vec[i] = chunk_hint; // hint is correct! + continue; + } + // lo < hi is guaranteed by `num_offsets = chunks.size() + 1` + auto chunk_index = + ChunkResolver::Bisect(index, offsets, /*lo=*/0, /*hi=*/num_offsets); + chunk_hint = static_cast(chunk_index); + out_chunk_index_vec[i] = chunk_hint; + } + if (out_index_in_chunk_vec != NULLPTR) { + for (int64_t i = 0; i < n_indices; i++) { + auto logical_index = logical_index_vec[i]; + auto chunk_index = out_chunk_index_vec[i]; + // chunk_index is in [0, chunks.size()] no matter what the + // value of logical_index is, so it's always safe to dereference + // offset_ as it contains chunks.size()+1 values. + out_index_in_chunk_vec[i] = + logical_index - static_cast(offsets[chunk_index]); +#if defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER) + // Make it more likely that Valgrind/ASAN can catch an invalid memory + // access by poisoning out_index_in_chunk_vec[i] when the logical + // index is out-of-bounds. 
+ if (chunk_index == num_chunks) { + out_index_in_chunk_vec[i] = std::numeric_limits::max(); + } +#endif + } + } +} + } // namespace ChunkResolver::ChunkResolver(const ArrayVector& chunks) noexcept @@ -84,5 +129,32 @@ ChunkResolver& ChunkResolver::operator=(const ChunkResolver& other) noexcept { return *this; } -} // namespace internal -} // namespace arrow +void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint8_t* logical_index_vec, + uint8_t* out_chunk_index_vec, uint8_t chunk_hint, + uint8_t* out_index_in_chunk_vec) const { + ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec, + out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec); +} + +void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint32_t* logical_index_vec, + uint32_t* out_chunk_index_vec, uint32_t chunk_hint, + uint32_t* out_index_in_chunk_vec) const { + ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec, + out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec); +} + +void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint16_t* logical_index_vec, + uint16_t* out_chunk_index_vec, uint16_t chunk_hint, + uint16_t* out_index_in_chunk_vec) const { + ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec, + out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec); +} + +void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint64_t* logical_index_vec, + uint64_t* out_chunk_index_vec, uint64_t chunk_hint, + uint64_t* out_index_in_chunk_vec) const { + ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec, + out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec); +} + +} // namespace arrow::internal diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index c5dad1a17b18e..a2a3d5a864243 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include "arrow/type_fwd.h" @@ -27,6 +29,8 @@ namespace arrow::internal { +struct ChunkResolver; + struct ChunkLocation { /// \brief Index of the chunk in the array of chunks /// @@ -36,8 +40,17 @@ struct ChunkLocation { /// \brief Index of the value in the chunk /// - /// The value is undefined if chunk_index >= chunks.size() + /// The value is UNDEFINED if chunk_index >= chunks.size() int64_t index_in_chunk = 0; + + ChunkLocation() = default; + + ChunkLocation(int64_t chunk_index, int64_t index_in_chunk) + : chunk_index(chunk_index), index_in_chunk(index_in_chunk) {} + + bool operator==(ChunkLocation other) const { + return chunk_index == other.chunk_index && index_in_chunk == other.index_in_chunk; + } }; /// \brief An utility that incrementally resolves logical indices into @@ -60,12 +73,35 @@ struct ARROW_EXPORT ChunkResolver { explicit ChunkResolver(const std::vector& chunks) noexcept; explicit ChunkResolver(const RecordBatchVector& batches) noexcept; + /// \brief Construct a ChunkResolver from a vector of chunks.size() + 1 offsets. + /// + /// The first offset must be 0 and the last offset must be the logical length of the + /// chunked array. Each offset before the last represents the starting logical index of + /// the corresponding chunk. 
+ explicit ChunkResolver(std::vector offsets) noexcept + : offsets_(std::move(offsets)), cached_chunk_(0) { +#ifndef NDEBUG + assert(offsets_.size() >= 1); + assert(offsets_[0] == 0); + for (size_t i = 1; i < offsets_.size(); i++) { + assert(offsets_[i] >= offsets_[i - 1]); + } +#endif + } + ChunkResolver(ChunkResolver&& other) noexcept; ChunkResolver& operator=(ChunkResolver&& other) noexcept; ChunkResolver(const ChunkResolver& other) noexcept; ChunkResolver& operator=(const ChunkResolver& other) noexcept; + int64_t logical_array_length() const { return offsets_.back(); } + int64_t num_chunks() const { return static_cast(offsets_.size()) - 1; } + + int64_t chunk_length(int64_t chunk_index) const { + return offsets_[chunk_index + 1] - offsets_[chunk_index]; + } + /// \brief Resolve a logical index to a ChunkLocation. /// /// The returned ChunkLocation contains the chunk index and the within-chunk index @@ -81,7 +117,7 @@ struct ARROW_EXPORT ChunkResolver { const auto cached_chunk = cached_chunk_.load(std::memory_order_relaxed); const auto chunk_index = ResolveChunkIndex(index, cached_chunk); - return {chunk_index, index - offsets_[chunk_index]}; + return ChunkLocation{chunk_index, index - offsets_[chunk_index]}; } /// \brief Resolve a logical index to a ChunkLocation. @@ -97,12 +133,70 @@ struct ARROW_EXPORT ChunkResolver { /// \return ChunkLocation with a valid chunk_index if index is within /// bounds, or with chunk_index == chunks.size() if logical index is /// `>= chunked_array.length()`. - inline ChunkLocation ResolveWithChunkIndexHint(int64_t index, - ChunkLocation hint) const { + inline ChunkLocation ResolveWithHint(int64_t index, ChunkLocation hint) const { assert(hint.chunk_index < static_cast(offsets_.size())); const auto chunk_index = ResolveChunkIndex(index, hint.chunk_index); - return {chunk_index, index - offsets_[chunk_index]}; + return ChunkLocation{chunk_index, index - offsets_[chunk_index]}; + } + + /// \brief Resolve `n_indices` logical indices to chunk indices. + /// + /// \pre 0 <= logical_index_vec[i] < logical_array_length() + /// (for well-defined and valid chunk index results) + /// \pre out_chunk_index_vec has space for `n_indices` + /// \pre chunk_hint in [0, chunks.size()] + /// \post out_chunk_index_vec[i] in [0, chunks.size()] for i in [0, n) + /// \post if logical_index_vec[i] >= chunked_array.length(), then + /// out_chunk_index_vec[i] == chunks.size() + /// and out_index_in_chunk_vec[i] is UNDEFINED (can be out-of-bounds) + /// \post if logical_index_vec[i] < 0, then both out_chunk_index_vec[i] and + /// out_index_in_chunk_vec[i] are UNDEFINED + /// + /// \param n_indices The number of logical indices to resolve + /// \param logical_index_vec The logical indices to resolve + /// \param out_chunk_index_vec The output array where the chunk indices will be written + /// \param chunk_hint 0 or the last chunk_index produced by ResolveMany + /// \param out_index_in_chunk_vec If not NULLPTR, the output array where the + /// within-chunk indices will be written + /// \return false iff chunks.size() > std::numeric_limits::max() + template + [[nodiscard]] bool ResolveMany(int64_t n_indices, const IndexType* logical_index_vec, + IndexType* out_chunk_index_vec, IndexType chunk_hint = 0, + IndexType* out_index_in_chunk_vec = NULLPTR) const { + if constexpr (sizeof(IndexType) < sizeof(uint64_t)) { + // The max value returned by Bisect is `offsets.size() - 1` (= chunks.size()). 
+ constexpr uint64_t kMaxIndexTypeValue = std::numeric_limits::max(); + // A ChunkedArray with enough empty chunks can make the index of a chunk + // exceed the logical index and thus the maximum value of IndexType. + const bool chunk_index_fits_on_type = + static_cast(offsets_.size() - 1) <= kMaxIndexTypeValue; + if (ARROW_PREDICT_FALSE(!chunk_index_fits_on_type)) { + return false; + } + // Since an index-in-chunk cannot possibly exceed the logical index being + // queried, we don't have to worry about these values not fitting on IndexType. + } + if constexpr (std::is_signed_v) { + // We interpret signed integers as unsigned and avoid having to generate double + // the amount of binary code to handle each integer width. + // + // Negative logical indices can become large values when cast to unsigned, and + // they are gracefully handled by ResolveManyImpl, but both the chunk index + // and the index in chunk values will be undefined in these cases. This + // happend because int8_t(-1) == uint8_t(255) and 255 could be a valid + // logical index in the chunked array. + using U = std::make_unsigned_t; + ResolveManyImpl(n_indices, reinterpret_cast(logical_index_vec), + reinterpret_cast(out_chunk_index_vec), + static_cast(chunk_hint), + reinterpret_cast(out_index_in_chunk_vec)); + } else { + static_assert(std::is_unsigned_v); + ResolveManyImpl(n_indices, logical_index_vec, out_chunk_index_vec, chunk_hint, + out_index_in_chunk_vec); + } + return true; } private: @@ -130,17 +224,33 @@ struct ARROW_EXPORT ChunkResolver { return chunk_index; } + /// \pre all the pre-conditions of ChunkResolver::ResolveMany() + /// \pre num_offsets - 1 <= std::numeric_limits::max() + void ResolveManyImpl(int64_t, const uint8_t*, uint8_t*, uint8_t, uint8_t*) const; + void ResolveManyImpl(int64_t, const uint16_t*, uint16_t*, uint16_t, uint16_t*) const; + void ResolveManyImpl(int64_t, const uint32_t*, uint32_t*, uint32_t, uint32_t*) const; + void ResolveManyImpl(int64_t, const uint64_t*, uint64_t*, uint64_t, uint64_t*) const; + + public: /// \brief Find the index of the chunk that contains the logical index. /// /// Any non-negative index is accepted. When `hi=num_offsets`, the largest /// possible return value is `num_offsets-1` which is equal to - /// `chunks.size()`. The is returned when the logical index is out-of-bounds. + /// `chunks.size()`. Which is returned when the logical index is greater or + /// equal the logical length of the chunked array. /// - /// \pre index >= 0 + /// \pre index >= 0 (otherwise, when index is negative, hi-1 is returned) /// \pre lo < hi /// \pre lo >= 0 && hi <= offsets_.size() static inline int64_t Bisect(int64_t index, const int64_t* offsets, int64_t lo, int64_t hi) { + return Bisect(static_cast(index), + reinterpret_cast(offsets), static_cast(lo), + static_cast(hi)); + } + + static inline int64_t Bisect(uint64_t index, const uint64_t* offsets, uint64_t lo, + uint64_t hi) { // Similar to std::upper_bound(), but slightly different as our offsets // array always starts with 0. auto n = hi - lo; @@ -148,8 +258,8 @@ struct ARROW_EXPORT ChunkResolver { // (lo < hi is guaranteed by the precondition). 
assert(n > 1 && "lo < hi is a precondition of Bisect"); do { - const int64_t m = n >> 1; - const int64_t mid = lo + m; + const uint64_t m = n >> 1; + const uint64_t mid = lo + m; if (index >= offsets[mid]) { lo = mid; n -= m; diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc index 6ca52ab46ca68..e9cc283b53cd5 100644 --- a/cpp/src/arrow/chunked_array_test.cc +++ b/cpp/src/arrow/chunked_array_test.cc @@ -23,6 +23,7 @@ #include #include +#include "arrow/chunk_resolver.h" #include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/testing/builder.h" @@ -34,6 +35,9 @@ namespace arrow { +using internal::ChunkLocation; +using internal::ChunkResolver; + class TestChunkedArray : public ::testing::Test { protected: virtual void Construct() { @@ -310,4 +314,200 @@ TEST_F(TestChunkedArray, GetScalar) { ASSERT_RAISES(IndexError, carr.GetScalar(7)); } +// ChunkResolver tests + +using IndexTypes = ::testing::Types; + +TEST(TestChunkResolver, Resolve) { + ChunkResolver empty(std::vector({0})); // [] + // ChunkLocation::index_in_chunk is undefined when chunk_index==chunks.size(), + // so only chunk_index is compared in these cases. + ASSERT_EQ(empty.Resolve(0).chunk_index, 0); + ASSERT_EQ(empty.Resolve(0).chunk_index, 0); + + ChunkResolver one(std::vector({0, 1})); // [[0]] + ASSERT_EQ(one.Resolve(1).chunk_index, 1); + ASSERT_EQ(one.Resolve(0), (ChunkLocation(0, 0))); + ASSERT_EQ(one.Resolve(1).chunk_index, 1); + + ChunkResolver one_and_empty(std::vector({0, 1, 1, 1})); // [[0], [], []] + ASSERT_EQ(one_and_empty.Resolve(3).chunk_index, 3); + ASSERT_EQ(one_and_empty.Resolve(2).chunk_index, 3); + ASSERT_EQ(one_and_empty.Resolve(1).chunk_index, 3); + ASSERT_EQ(one_and_empty.Resolve(0), (ChunkLocation(0, 0))); + ASSERT_EQ(one_and_empty.Resolve(1).chunk_index, 3); + ASSERT_EQ(one_and_empty.Resolve(2).chunk_index, 3); + ASSERT_EQ(one_and_empty.Resolve(3).chunk_index, 3); + + ChunkResolver one_one_one(std::vector({0, 1, 2, 3})); // [[0], [1], [2]] + ASSERT_EQ(one_one_one.Resolve(3).chunk_index, 3); + ASSERT_EQ(one_one_one.Resolve(2), (ChunkLocation(2, 0))); + ASSERT_EQ(one_one_one.Resolve(1), (ChunkLocation(1, 0))); + ASSERT_EQ(one_one_one.Resolve(0), (ChunkLocation(0, 0))); + ASSERT_EQ(one_one_one.Resolve(1), (ChunkLocation(1, 0))); + ASSERT_EQ(one_one_one.Resolve(2), (ChunkLocation(2, 0))); + ASSERT_EQ(one_one_one.Resolve(3).chunk_index, 3); + + ChunkResolver resolver(std::vector({0, 2, 3, 10})); // [[0, 1], [2], [3..9]] + ASSERT_EQ(resolver.Resolve(10).chunk_index, 3); + ASSERT_EQ(resolver.Resolve(9), (ChunkLocation(2, 6))); + ASSERT_EQ(resolver.Resolve(8), (ChunkLocation(2, 5))); + ASSERT_EQ(resolver.Resolve(4), (ChunkLocation(2, 1))); + ASSERT_EQ(resolver.Resolve(3), (ChunkLocation(2, 0))); + ASSERT_EQ(resolver.Resolve(2), (ChunkLocation(1, 0))); + ASSERT_EQ(resolver.Resolve(1), (ChunkLocation(0, 1))); + ASSERT_EQ(resolver.Resolve(0), (ChunkLocation(0, 0))); + ASSERT_EQ(resolver.Resolve(1), (ChunkLocation(0, 1))); + ASSERT_EQ(resolver.Resolve(2), (ChunkLocation(1, 0))); + ASSERT_EQ(resolver.Resolve(3), (ChunkLocation(2, 0))); + ASSERT_EQ(resolver.Resolve(4), (ChunkLocation(2, 1))); + ASSERT_EQ(resolver.Resolve(8), (ChunkLocation(2, 5))); + ASSERT_EQ(resolver.Resolve(9), (ChunkLocation(2, 6))); + ASSERT_EQ(resolver.Resolve(10).chunk_index, 3); +} + +template +class TestChunkResolverMany : public ::testing::Test { + public: + using IndexType = T; + + Result> ResolveMany( + const ChunkResolver& resolver, const std::vector& logical_index_vec) { + const size_t n = 
logical_index_vec.size(); + std::vector chunk_index_vec; + chunk_index_vec.resize(n); + std::vector index_in_chunk_vec; + index_in_chunk_vec.resize(n); + bool valid = resolver.ResolveMany( + static_cast(n), logical_index_vec.data(), chunk_index_vec.data(), 0, + index_in_chunk_vec.data()); + if (ARROW_PREDICT_FALSE(!valid)) { + return Status::Invalid("index type doesn't fit possible chunk indexes"); + } + std::vector locations; + locations.reserve(n); + for (size_t i = 0; i < n; i++) { + auto chunk_index = static_cast(chunk_index_vec[i]); + auto index_in_chunk = static_cast(index_in_chunk_vec[i]); + locations.emplace_back(chunk_index, index_in_chunk); + } + return locations; + } + + void CheckResolveMany(const ChunkResolver& resolver, + const std::vector& logical_index_vec) { + ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, logical_index_vec)); + EXPECT_EQ(logical_index_vec.size(), locations.size()); + for (size_t i = 0; i < logical_index_vec.size(); i++) { + IndexType logical_index = logical_index_vec[i]; + const auto expected = resolver.Resolve(logical_index); + ASSERT_LE(expected.chunk_index, resolver.num_chunks()); + if (expected.chunk_index == resolver.num_chunks()) { + // index_in_chunk is undefined in this case + ASSERT_EQ(locations[i].chunk_index, expected.chunk_index); + } else { + ASSERT_EQ(locations[i], expected); + } + } + } + + void TestBasics() { + std::vector logical_index_vec; + + ChunkResolver empty(std::vector({0})); // [] + logical_index_vec = {0, 0}; + CheckResolveMany(empty, logical_index_vec); + + ChunkResolver one(std::vector({0, 1})); // [[0]] + logical_index_vec = {1, 0, 1}; + CheckResolveMany(one, logical_index_vec); + + ChunkResolver one_and_empty(std::vector({0, 1, 1, 1})); // [[0], [], []] + logical_index_vec = {3, 2, 1, 0, 1, 2, 3}; + CheckResolveMany(one_and_empty, logical_index_vec); + + ChunkResolver one_one_one(std::vector({0, 1, 2, 3})); // [[0], [1], [2]] + logical_index_vec = {3, 2, 1, 0, 1, 2, 3}; + CheckResolveMany(one_one_one, logical_index_vec); + + ChunkResolver resolver(std::vector({0, 2, 3, 10})); // [[0, 1], [2], [3..9]] + logical_index_vec = {10, 9, 8, 4, 3, 2, 1, 0, 1, 2, 3, 4, 8, 9, 10}; + CheckResolveMany(resolver, logical_index_vec); + } + + void TestOutOfBounds() { + ChunkResolver resolver(std::vector({0, 2, 3, 10})); // [[0, 1], [2], [3..9]] + + std::vector logical_index_vec = {10, 11, 12, 13, 14, 13, 11, 10}; + ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, logical_index_vec)); + EXPECT_EQ(logical_index_vec.size(), locations.size()); + for (size_t i = 0; i < logical_index_vec.size(); i++) { + ASSERT_EQ(locations[i].chunk_index, resolver.num_chunks()); + } + + if constexpr (std::is_signed_v) { + std::vector logical_index_vec = {-1, -2, -3, -4, INT8_MIN}; + + ChunkResolver resolver(std::vector({0, 2, 128})); // [[0, 1], [2..127]] + ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, logical_index_vec)); + EXPECT_EQ(logical_index_vec.size(), locations.size()); + for (size_t i = 0; i < logical_index_vec.size(); i++) { + // All the negative indices are greater than resolver.logical_array_length()-1 + // when cast to uint8_t. 
+ ASSERT_EQ(locations[i].chunk_index, resolver.num_chunks()); + } + + if constexpr (sizeof(IndexType) == 1) { + ChunkResolver resolver(std::vector( + {0, 2, 128, 129, 256})); // [[0, 1], [2..127], [128], [129, 255]] + ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, logical_index_vec)); + EXPECT_EQ(logical_index_vec.size(), locations.size()); + for (size_t i = 0; i < logical_index_vec.size(); i++) { + if constexpr (sizeof(IndexType) == 1) { + // All the negative 8-bit indices are SMALLER than + // resolver.logical_array_length()=256 when cast to 8-bit unsigned integers. + // So the resolved locations might look valid, but they should not be trusted. + ASSERT_LT(locations[i].chunk_index, resolver.num_chunks()); + } else { + // All the negative indices are greater than resolver.logical_array_length() + // when cast to 16/32/64-bit unsigned integers. + ASSERT_EQ(locations[i].chunk_index, resolver.num_chunks()); + } + } + } + } + } + + void TestOverflow() { + const int64_t kMaxIndex = std::is_signed_v ? 127 : 255; + std::vector logical_index_vec = {0, 1, 2, + static_cast(kMaxIndex)}; + + // Overflows are rare because to make them possible, we need more chunks + // than logical elements in the ChunkedArray. That requires at least one + // empty chunk. + std::vector offsets; + for (int64_t i = 0; i <= kMaxIndex; i++) { + offsets.push_back(i); + } + ChunkResolver resolver{offsets}; + ASSERT_OK(ResolveMany(resolver, logical_index_vec)); + + offsets.push_back(kMaxIndex); // adding an empty chunk + ChunkResolver resolver_with_empty{offsets}; + if (sizeof(IndexType) == 1) { + ASSERT_NOT_OK(ResolveMany(resolver_with_empty, logical_index_vec)); + } else { + ASSERT_OK(ResolveMany(resolver_with_empty, logical_index_vec)); + } + } +}; + +TYPED_TEST_SUITE(TestChunkResolverMany, IndexTypes); + +TYPED_TEST(TestChunkResolverMany, Basics) { this->TestBasics(); } +TYPED_TEST(TestChunkResolverMany, OutOfBounds) { this->TestOutOfBounds(); } +TYPED_TEST(TestChunkResolverMany, Overflow) { this->TestOverflow(); } + } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index db2023ef04cad..ad22fa8d365c4 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -747,15 +747,13 @@ class TableSorter { auto& comparator = comparator_; const auto& first_sort_key = sort_keys_[0]; - ChunkLocation left_loc{0, 0}; - ChunkLocation right_loc{0, 0}; + ChunkLocation left_loc; + ChunkLocation right_loc; std::merge(nulls_begin, nulls_middle, nulls_middle, nulls_end, temp_indices, [&](uint64_t left, uint64_t right) { // First column is either null or nan - left_loc = - left_resolver_.ResolveWithChunkIndexHint(left, /*hint=*/left_loc); - right_loc = - right_resolver_.ResolveWithChunkIndexHint(right, /*hint=*/right_loc); + left_loc = left_resolver_.ResolveWithHint(left, /*hint=*/left_loc); + right_loc = right_resolver_.ResolveWithHint(right, /*hint=*/right_loc); auto chunk_left = first_sort_key.GetChunk(left_loc); auto chunk_right = first_sort_key.GetChunk(right_loc); const auto left_is_null = chunk_left.IsNull(); @@ -786,15 +784,13 @@ class TableSorter { // Untyped implementation auto& comparator = comparator_; - ChunkLocation left_loc{0, 0}; - ChunkLocation right_loc{0, 0}; + ChunkLocation left_loc; + ChunkLocation right_loc; std::merge(nulls_begin, nulls_middle, nulls_middle, nulls_end, temp_indices, [&](uint64_t left, uint64_t right) { // First column is always null - left_loc = - 
left_resolver_.ResolveWithChunkIndexHint(left, /*hint=*/left_loc); - right_loc = - right_resolver_.ResolveWithChunkIndexHint(right, /*hint=*/right_loc); + left_loc = left_resolver_.ResolveWithHint(left, /*hint=*/left_loc); + right_loc = right_resolver_.ResolveWithHint(right, /*hint=*/right_loc); return comparator.Compare(left_loc, right_loc, 1); }); // Copy back temp area into main buffer @@ -812,15 +808,13 @@ class TableSorter { auto& comparator = comparator_; const auto& first_sort_key = sort_keys_[0]; - ChunkLocation left_loc{0, 0}; - ChunkLocation right_loc{0, 0}; + ChunkLocation left_loc; + ChunkLocation right_loc; std::merge(range_begin, range_middle, range_middle, range_end, temp_indices, [&](uint64_t left, uint64_t right) { // Both values are never null nor NaN. - left_loc = - left_resolver_.ResolveWithChunkIndexHint(left, /*hint=*/left_loc); - right_loc = - right_resolver_.ResolveWithChunkIndexHint(right, /*hint=*/right_loc); + left_loc = left_resolver_.ResolveWithHint(left, /*hint=*/left_loc); + right_loc = right_resolver_.ResolveWithHint(right, /*hint=*/right_loc); auto chunk_left = first_sort_key.GetChunk(left_loc); auto chunk_right = first_sort_key.GetChunk(right_loc); DCHECK(!chunk_left.IsNull()); From 084387c56e45bf7e8335c28e14a2e61b16515ad5 Mon Sep 17 00:00:00 2001 From: James Duong Date: Wed, 15 May 2024 17:22:34 -0700 Subject: [PATCH 101/105] GH-39204: [Format][FlightRPC][Docs] Stabilize Flight SQL (#41657) Update documentation, protobufs, and class documentation to remove experimental tags from Flight and Flight SQL documentation. ### Rationale for this change Flight SQL has been used by multiple databases now and has been voted as stable per the mailing list discussion: [https://lists.apache.org/thread/qoshg8mln3t2ovr90o1yklz4yrpv503h](url) ### What changes are included in this PR? Update protobuf, class comments, and user documentation to remove references to Flight and Flight SQL being experimental. This change excludes the UCX transport and the session option messages ### Are these changes tested? No, documentation only. ### Are there any user-facing changes? User documentation. * GitHub Issue: #39204 Authored-by: James Duong Signed-off-by: Sutou Kouhei --- cpp/src/arrow/flight/client.h | 8 +---- cpp/src/arrow/flight/cookie_internal.cc | 3 +- cpp/src/arrow/flight/middleware.h | 2 +- cpp/src/arrow/flight/server.h | 3 +- cpp/src/arrow/flight/server_middleware.h | 3 +- cpp/src/arrow/flight/sql/server.cc | 3 +- cpp/src/arrow/flight/sql/server.h | 3 +- .../flight/sql/server_session_middleware.h | 1 - cpp/src/arrow/flight/transport.h | 4 --- cpp/src/arrow/flight/types.h | 4 +-- cpp/src/arrow/flight/types_async.h | 4 --- docs/source/cpp/api/flightsql.rst | 2 -- docs/source/format/FlightSql.rst | 3 -- docs/source/java/overview.rst | 4 +-- format/FlightSql.proto | 32 ------------------- java/flight/flight-core/pom.xml | 2 +- .../arrow/flight/ServerSessionMiddleware.java | 2 -- java/flight/flight-sql/pom.xml | 2 +- 18 files changed, 12 insertions(+), 73 deletions(-) diff --git a/cpp/src/arrow/flight/client.h b/cpp/src/arrow/flight/client.h index 330fa8bad730d..613903108949e 100644 --- a/cpp/src/arrow/flight/client.h +++ b/cpp/src/arrow/flight/client.h @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -/// \brief Implementation of Flight RPC client. API should be -/// considered experimental for now +/// \brief Implementation of Flight RPC client. 
#pragma once @@ -177,7 +176,6 @@ class ARROW_FLIGHT_EXPORT FlightMetadataReader { }; /// \brief Client class for Arrow Flight RPC services. -/// API experimental for now class ARROW_FLIGHT_EXPORT FlightClient { public: ~FlightClient(); @@ -275,8 +273,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { /// \param[in] options Per-RPC options /// \param[in] descriptor the dataset request /// \param[in] listener Callbacks for response and RPC completion - /// - /// This API is EXPERIMENTAL. void GetFlightInfoAsync(const FlightCallOptions& options, const FlightDescriptor& descriptor, std::shared_ptr> listener); @@ -288,8 +284,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { /// \brief Asynchronous GetFlightInfo returning a Future. /// \param[in] options Per-RPC options /// \param[in] descriptor the dataset request - /// - /// This API is EXPERIMENTAL. arrow::Future GetFlightInfoAsync(const FlightCallOptions& options, const FlightDescriptor& descriptor); arrow::Future GetFlightInfoAsync(const FlightDescriptor& descriptor) { diff --git a/cpp/src/arrow/flight/cookie_internal.cc b/cpp/src/arrow/flight/cookie_internal.cc index 8f41106ebce5c..75a10d148bf47 100644 --- a/cpp/src/arrow/flight/cookie_internal.cc +++ b/cpp/src/arrow/flight/cookie_internal.cc @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -// Interfaces for defining middleware for Flight clients. Currently -// experimental. +// Interfaces for defining middleware for Flight clients. #include "arrow/flight/cookie_internal.h" #include "arrow/flight/client.h" diff --git a/cpp/src/arrow/flight/middleware.h b/cpp/src/arrow/flight/middleware.h index 84448097ff019..d717e396a8b68 100644 --- a/cpp/src/arrow/flight/middleware.h +++ b/cpp/src/arrow/flight/middleware.h @@ -16,7 +16,7 @@ // under the License. // Interfaces for defining middleware for Flight clients and -// servers. Currently experimental. +// servers. #pragma once diff --git a/cpp/src/arrow/flight/server.h b/cpp/src/arrow/flight/server.h index ffcffe12e3c78..8d73353ab16c1 100644 --- a/cpp/src/arrow/flight/server.h +++ b/cpp/src/arrow/flight/server.h @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -// Interfaces to use for defining Flight RPC servers. API should be considered -// experimental for now +// Interfaces to use for defining Flight RPC servers. #pragma once diff --git a/cpp/src/arrow/flight/server_middleware.h b/cpp/src/arrow/flight/server_middleware.h index 030f1a17c2100..3a3e6f8616ed6 100644 --- a/cpp/src/arrow/flight/server_middleware.h +++ b/cpp/src/arrow/flight/server_middleware.h @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -// Interfaces for defining middleware for Flight servers. Currently -// experimental. +// Interfaces for defining middleware for Flight servers. #pragma once diff --git a/cpp/src/arrow/flight/sql/server.cc b/cpp/src/arrow/flight/sql/server.cc index cae3542b4faf8..63d1f5c5225fa 100644 --- a/cpp/src/arrow/flight/sql/server.cc +++ b/cpp/src/arrow/flight/sql/server.cc @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -// Interfaces to use for defining Flight RPC servers. API should be considered -// experimental for now +// Interfaces to use for defining Flight RPC servers. 
// Platform-specific defines #include "arrow/flight/platform.h" diff --git a/cpp/src/arrow/flight/sql/server.h b/cpp/src/arrow/flight/sql/server.h index 7b5d71678f3de..7130e96987b89 100644 --- a/cpp/src/arrow/flight/sql/server.h +++ b/cpp/src/arrow/flight/sql/server.h @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -// Interfaces to use for defining Flight RPC servers. API should be considered -// experimental for now +// Interfaces to use for defining Flight RPC servers. #pragma once diff --git a/cpp/src/arrow/flight/sql/server_session_middleware.h b/cpp/src/arrow/flight/sql/server_session_middleware.h index 021793de3de32..6eb11041a08bd 100644 --- a/cpp/src/arrow/flight/sql/server_session_middleware.h +++ b/cpp/src/arrow/flight/sql/server_session_middleware.h @@ -16,7 +16,6 @@ // under the License. // Middleware for handling Flight SQL Sessions including session cookie handling. -// Currently experimental. #pragma once diff --git a/cpp/src/arrow/flight/transport.h b/cpp/src/arrow/flight/transport.h index 4029aa5223deb..4ce50534023fc 100644 --- a/cpp/src/arrow/flight/transport.h +++ b/cpp/src/arrow/flight/transport.h @@ -19,8 +19,6 @@ /// Internal (but not private) interface for implementing /// alternate network transports in Flight. /// -/// \warning EXPERIMENTAL. Subject to change. -/// /// To implement a transport, implement ServerTransport and /// ClientTransport, and register the desired URI schemes with /// TransportRegistry. Flight takes care of most of the per-RPC @@ -248,8 +246,6 @@ TransportRegistry* GetDefaultTransportRegistry(); /// Transport implementations may subclass this to store their own /// state, and stash an instance in a user-supplied AsyncListener via /// ClientTransport::GetAsyncRpc and ClientTransport::SetAsyncRpc. -/// -/// This API is EXPERIMENTAL. class ARROW_FLIGHT_EXPORT AsyncRpc { public: virtual ~AsyncRpc() = default; diff --git a/cpp/src/arrow/flight/types.h b/cpp/src/arrow/flight/types.h index b3df8377b8ffd..cdf03f21041ee 100644 --- a/cpp/src/arrow/flight/types.h +++ b/cpp/src/arrow/flight/types.h @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -// Data structure for Flight RPC. API should be considered experimental for now +// Data structure for Flight RPC. #pragma once @@ -1115,8 +1115,6 @@ std::string ToString(TransportStatusCode code); /// instead of trying to translate to Arrow Status. /// /// Currently, only attached to the Status passed to AsyncListener::OnFinish. -/// -/// This API is EXPERIMENTAL. class ARROW_FLIGHT_EXPORT TransportStatusDetail : public StatusDetail { public: constexpr static const char* kTypeId = "flight::TransportStatusDetail"; diff --git a/cpp/src/arrow/flight/types_async.h b/cpp/src/arrow/flight/types_async.h index a241e64fb4e49..d5ed48d8a6438 100644 --- a/cpp/src/arrow/flight/types_async.h +++ b/cpp/src/arrow/flight/types_async.h @@ -31,8 +31,6 @@ namespace arrow::flight { /// @{ /// \brief Non-templated state for an async RPC. -/// -/// This API is EXPERIMENTAL. class ARROW_FLIGHT_EXPORT AsyncListenerBase { public: AsyncListenerBase(); @@ -57,8 +55,6 @@ class ARROW_FLIGHT_EXPORT AsyncListenerBase { /// A single listener may not be used for multiple concurrent RPC /// calls. The application MUST hold the listener alive until /// OnFinish() is called and has finished. -/// -/// This API is EXPERIMENTAL. 
template class ARROW_FLIGHT_EXPORT AsyncListener : public AsyncListenerBase { public: diff --git a/docs/source/cpp/api/flightsql.rst b/docs/source/cpp/api/flightsql.rst index 565b605108d9f..0f49a76f20687 100644 --- a/docs/source/cpp/api/flightsql.rst +++ b/docs/source/cpp/api/flightsql.rst @@ -22,8 +22,6 @@ Arrow Flight SQL ================ -.. note:: Flight SQL is currently experimental and APIs are subject to change. - Common Types ============ diff --git a/docs/source/format/FlightSql.rst b/docs/source/format/FlightSql.rst index 181efce286e70..9c3523755f3ae 100644 --- a/docs/source/format/FlightSql.rst +++ b/docs/source/format/FlightSql.rst @@ -32,9 +32,6 @@ with any database that supports the necessary endpoints. Flight SQL clients wrap the underlying Flight client to provide methods for the new RPC methods described here. -.. warning:: Flight SQL is **experimental** and changes to the - protocol may still be made. - RPC Methods =========== diff --git a/docs/source/java/overview.rst b/docs/source/java/overview.rst index 9d9cbad8a26c1..7780ee32ec9bc 100644 --- a/docs/source/java/overview.rst +++ b/docs/source/java/overview.rst @@ -54,10 +54,10 @@ but some modules are JNI bindings to the C++ library. - (Experimental) A library for converting JDBC data to Arrow data. - Native * - flight-core - - (Experimental) An RPC mechanism for transferring ValueVectors. + - An RPC mechanism for transferring ValueVectors. - Native * - flight-sql - - (Experimental) Contains utility classes to expose Flight SQL semantics for clients and servers over Arrow Flight. + - Contains utility classes to expose Flight SQL semantics for clients and servers over Arrow Flight. - Native * - flight-integration-tests - Integration tests for Flight RPC. diff --git a/format/FlightSql.proto b/format/FlightSql.proto index bf3fcb6c3d229..6fca141d692a7 100644 --- a/format/FlightSql.proto +++ b/format/FlightSql.proto @@ -43,7 +43,6 @@ package arrow.flight.protocol.sql; * where there is one row per requested piece of metadata information. */ message CommandGetSqlInfo { - option (experimental) = true; /* * Values are modelled after ODBC's SQLGetInfo() function. This information is intended to provide @@ -1131,7 +1130,6 @@ enum Searchable { * The returned data should be ordered by data_type and then by type_name. */ message CommandGetXdbcTypeInfo { - option (experimental) = true; /* * Specifies the data type to search for the info. @@ -1153,7 +1151,6 @@ message CommandGetXdbcTypeInfo { * The returned data should be ordered by catalog_name. */ message CommandGetCatalogs { - option (experimental) = true; } /* @@ -1171,7 +1168,6 @@ message CommandGetCatalogs { * The returned data should be ordered by catalog_name, then db_schema_name. */ message CommandGetDbSchemas { - option (experimental) = true; /* * Specifies the Catalog to search for the tables. @@ -1219,7 +1215,6 @@ message CommandGetDbSchemas { * The returned data should be ordered by catalog_name, db_schema_name, table_name, then table_type, followed by table_schema if requested. */ message CommandGetTables { - option (experimental) = true; /* * Specifies the Catalog to search for the tables. @@ -1272,7 +1267,6 @@ message CommandGetTables { * The returned data should be ordered by table_type. */ message CommandGetTableTypes { - option (experimental) = true; } /* @@ -1293,7 +1287,6 @@ message CommandGetTableTypes { * The returned data should be ordered by catalog_name, db_schema_name, table_name, key_name, then key_sequence. 
*/ message CommandGetPrimaryKeys { - option (experimental) = true; /* * Specifies the catalog to search for the table. @@ -1348,7 +1341,6 @@ enum UpdateDeleteRules { * update_rule and delete_rule returns a byte that is equivalent to actions declared on UpdateDeleteRules enum. */ message CommandGetExportedKeys { - option (experimental) = true; /* * Specifies the catalog to search for the foreign key table. @@ -1399,7 +1391,6 @@ message CommandGetExportedKeys { * - 4 = SET DEFAULT */ message CommandGetImportedKeys { - option (experimental) = true; /* * Specifies the catalog to search for the primary key table. @@ -1452,7 +1443,6 @@ message CommandGetImportedKeys { * - 4 = SET DEFAULT */ message CommandGetCrossReference { - option (experimental) = true; /** * The catalog name where the parent table is. @@ -1499,7 +1489,6 @@ message CommandGetCrossReference { * Request message for the "CreatePreparedStatement" action on a Flight SQL enabled backend. */ message ActionCreatePreparedStatementRequest { - option (experimental) = true; // The valid SQL string to create a prepared statement for. string query = 1; @@ -1512,7 +1501,6 @@ message ActionCreatePreparedStatementRequest { * An embedded message describing a Substrait plan to execute. */ message SubstraitPlan { - option (experimental) = true; // The serialized substrait.Plan to create a prepared statement for. // XXX(ARROW-16902): this is bytes instead of an embedded message @@ -1529,7 +1517,6 @@ message SubstraitPlan { * Request message for the "CreatePreparedSubstraitPlan" action on a Flight SQL enabled backend. */ message ActionCreatePreparedSubstraitPlanRequest { - option (experimental) = true; // The serialized substrait.Plan to create a prepared statement for. SubstraitPlan plan = 1; @@ -1548,7 +1535,6 @@ message ActionCreatePreparedSubstraitPlanRequest { * The result should be wrapped in a google.protobuf.Any message. */ message ActionCreatePreparedStatementResult { - option (experimental) = true; // Opaque handle for the prepared statement on the server. bytes prepared_statement_handle = 1; @@ -1570,7 +1556,6 @@ message ActionCreatePreparedStatementResult { * Closes server resources associated with the prepared statement handle. */ message ActionClosePreparedStatementRequest { - option (experimental) = true; // Opaque handle for the prepared statement on the server. bytes prepared_statement_handle = 1; @@ -1581,7 +1566,6 @@ message ActionClosePreparedStatementRequest { * Begins a transaction. */ message ActionBeginTransactionRequest { - option (experimental) = true; } /* @@ -1592,7 +1576,6 @@ message ActionBeginTransactionRequest { * FLIGHT_SQL_TRANSACTION_SUPPORT_SAVEPOINT. */ message ActionBeginSavepointRequest { - option (experimental) = true; // The transaction to which a savepoint belongs. bytes transaction_id = 1; @@ -1610,7 +1593,6 @@ message ActionBeginSavepointRequest { * The result should be wrapped in a google.protobuf.Any message. */ message ActionBeginTransactionResult { - option (experimental) = true; // Opaque handle for the transaction on the server. bytes transaction_id = 1; @@ -1626,7 +1608,6 @@ message ActionBeginTransactionResult { * The result should be wrapped in a google.protobuf.Any message. */ message ActionBeginSavepointResult { - option (experimental) = true; // Opaque handle for the savepoint on the server. bytes savepoint_id = 1; @@ -1641,7 +1622,6 @@ message ActionBeginSavepointResult { * invalidated, as are all associated savepoints. 
*/ message ActionEndTransactionRequest { - option (experimental) = true; enum EndTransaction { END_TRANSACTION_UNSPECIFIED = 0; @@ -1667,7 +1647,6 @@ message ActionEndTransactionRequest { * savepoints created after the current savepoint. */ message ActionEndSavepointRequest { - option (experimental) = true; enum EndSavepoint { END_SAVEPOINT_UNSPECIFIED = 0; @@ -1702,7 +1681,6 @@ message ActionEndSavepointRequest { * - GetFlightInfo: execute the query. */ message CommandStatementQuery { - option (experimental) = true; // The SQL syntax. string query = 1; @@ -1729,7 +1707,6 @@ message CommandStatementQuery { * - DoPut: execute the query. */ message CommandStatementSubstraitPlan { - option (experimental) = true; // A serialized substrait.Plan SubstraitPlan plan = 1; @@ -1742,7 +1719,6 @@ message CommandStatementSubstraitPlan { * This should be used only once and treated as an opaque value, that is, clients should not attempt to parse this. */ message TicketStatementQuery { - option (experimental) = true; // Unique identifier for the instance of the statement to execute. bytes statement_handle = 1; @@ -1770,7 +1746,6 @@ message TicketStatementQuery { * - GetFlightInfo: execute the prepared statement instance. */ message CommandPreparedStatementQuery { - option (experimental) = true; // Opaque handle for the prepared statement on the server. bytes prepared_statement_handle = 1; @@ -1781,7 +1756,6 @@ message CommandPreparedStatementQuery { * for the RPC call DoPut to cause the server to execute the included SQL update. */ message CommandStatementUpdate { - option (experimental) = true; // The SQL syntax. string query = 1; @@ -1795,7 +1769,6 @@ message CommandStatementUpdate { * prepared statement handle as an update. */ message CommandPreparedStatementUpdate { - option (experimental) = true; // Opaque handle for the prepared statement on the server. bytes prepared_statement_handle = 1; @@ -1807,7 +1780,6 @@ message CommandPreparedStatementUpdate { * FlightData into the target destination. */ message CommandStatementIngest { - option (experimental) = true; // Options for table definition behavior message TableDefinitionOptions { @@ -1866,7 +1838,6 @@ message CommandStatementIngest { * in the request, containing results from the update. */ message DoPutUpdateResult { - option (experimental) = true; // The number of records updated. A return value of -1 represents // an unknown updated record count. @@ -1880,7 +1851,6 @@ message DoPutUpdateResult { * can continue as though the fields in this message were not provided or set to sensible default values. */ message DoPutPreparedStatementResult { - option (experimental) = true; // Represents a (potentially updated) opaque handle for the prepared statement on the server. // Because the handle could potentially be updated, any previous handles for this prepared @@ -1912,7 +1882,6 @@ message DoPutPreparedStatementResult { */ message ActionCancelQueryRequest { option deprecated = true; - option (experimental) = true; // The result of the GetFlightInfo RPC that initiated the query. // XXX(ARROW-16902): this must be a serialized FlightInfo, but is @@ -1931,7 +1900,6 @@ message ActionCancelQueryRequest { */ message ActionCancelQueryResult { option deprecated = true; - option (experimental) = true; enum CancelResult { // The cancellation status is unknown. 
Servers should avoid using diff --git a/java/flight/flight-core/pom.xml b/java/flight/flight-core/pom.xml index 163b4c24031b1..4c1002ae75f04 100644 --- a/java/flight/flight-core/pom.xml +++ b/java/flight/flight-core/pom.xml @@ -20,7 +20,7 @@ flight-core jar Arrow Flight Core - (Experimental)An RPC mechanism for transferring ValueVectors. + An RPC mechanism for transferring ValueVectors. 1 diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/ServerSessionMiddleware.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/ServerSessionMiddleware.java index 7091caa5e98bc..af22cd8aade22 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/ServerSessionMiddleware.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/ServerSessionMiddleware.java @@ -26,8 +26,6 @@ /** * Middleware for handling Flight SQL Sessions including session cookie handling. - * - * Currently experimental. */ public class ServerSessionMiddleware implements FlightServerMiddleware { Factory factory; diff --git a/java/flight/flight-sql/pom.xml b/java/flight/flight-sql/pom.xml index cf466ab1720cf..f5926d6e68485 100644 --- a/java/flight/flight-sql/pom.xml +++ b/java/flight/flight-sql/pom.xml @@ -20,7 +20,7 @@ flight-sql jar Arrow Flight SQL - (Experimental)Contains utility classes to expose Flight SQL semantics for clients and servers over Arrow Flight + Contains utility classes to expose Flight SQL semantics for clients and servers over Arrow Flight 1 From 1c15c88b4b62b19e7a226cc9e11946af1d9ac343 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Thu, 16 May 2024 06:38:57 +0530 Subject: [PATCH 102/105] GH-40943: [Java] Implement RangeEqualsVisitor for StringView (#41636) ### Rationale for this change Adding `RangeEqualsVisitor` for StringView as discussed in https://github.com/apache/arrow/issues/40943. ### What changes are included in this PR? Including `RangeEqualsVisitor` visitor method and test cases to validate it. ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * GitHub Issue: #40943 Authored-by: Vibhatha Abeykoon Signed-off-by: David Li --- .../vector/BaseVariableWidthViewVector.java | 21 +++-- .../vector/compare/RangeEqualsVisitor.java | 85 ++++++++++++++++++- .../compare/TestRangeEqualsVisitor.java | 71 ++++++++++++++-- 3 files changed, 161 insertions(+), 16 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java index 2f80775a48f58..ec700a0dc2592 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java @@ -46,7 +46,7 @@ */ public abstract class BaseVariableWidthViewVector extends BaseValueVector implements VariableWidthFieldVector { // A single element of a view comprises 16 bytes - protected static final int ELEMENT_SIZE = 16; + public static final int ELEMENT_SIZE = 16; public static final int INITIAL_VIEW_VALUE_ALLOCATION = 4096; private static final int INITIAL_BYTE_COUNT = INITIAL_VIEW_VALUE_ALLOCATION * ELEMENT_SIZE; private static final int MAX_BUFFER_SIZE = (int) Math.min(MAX_ALLOCATION_SIZE, Integer.MAX_VALUE); @@ -70,14 +70,14 @@ public abstract class BaseVariableWidthViewVector extends BaseValueVector implem * * */ // 12 byte unsigned int to track inline views - protected static final int INLINE_SIZE = 12; + public static final int INLINE_SIZE = 12; // The first 4 bytes of view are allocated for length - protected static final int LENGTH_WIDTH = 4; + public static final int LENGTH_WIDTH = 4; // The second 4 bytes of view are allocated for prefix width - protected static final int PREFIX_WIDTH = 4; + public static final int PREFIX_WIDTH = 4; // The third 4 bytes of view are allocated for buffer index - protected static final int BUF_INDEX_WIDTH = 4; - protected static final byte[] EMPTY_BYTE_ARRAY = new byte[]{}; + public static final int BUF_INDEX_WIDTH = 4; + public static final byte[] EMPTY_BYTE_ARRAY = new byte[]{}; protected ArrowBuf validityBuffer; // The view buffer is used to store the variable width view elements protected ArrowBuf viewBuffer; @@ -158,6 +158,15 @@ public ArrowBuf getDataBuffer() { return viewBuffer; } + /** + * Get the buffers that store the data for views in the vector. + * + * @return buffer + */ + public List getDataBuffers() { + return dataBuffers; + } + /** * BaseVariableWidthViewVector doesn't support offset buffer. 
* diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java index 56220d270fa9b..28da2a86a53c8 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java @@ -22,6 +22,7 @@ import java.util.List; import java.util.function.BiFunction; +import org.apache.arrow.memory.ArrowBuf; import org.apache.arrow.memory.util.ByteFunctionHelpers; import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.BaseFixedWidthVector; @@ -165,7 +166,10 @@ public Boolean visit(BaseLargeVariableWidthVector left, Range range) { @Override public Boolean visit(BaseVariableWidthViewVector left, Range range) { - throw new UnsupportedOperationException("View vectors are not supported."); + if (!validate(left)) { + return false; + } + return compareBaseVariableWidthViewVectors(range); } @Override @@ -450,6 +454,85 @@ protected boolean compareBaseLargeVariableWidthVectors(Range range) { return true; } + protected boolean compareBaseVariableWidthViewVectors(Range range) { + BaseVariableWidthViewVector leftVector = (BaseVariableWidthViewVector) left; + BaseVariableWidthViewVector rightVector = (BaseVariableWidthViewVector) right; + + final ArrowBuf leftViewBuffer = leftVector.getDataBuffer(); + final ArrowBuf rightViewBuffer = rightVector.getDataBuffer(); + + final int elementSize = BaseVariableWidthViewVector.ELEMENT_SIZE; + final int lengthWidth = BaseVariableWidthViewVector.LENGTH_WIDTH; + final int prefixWidth = BaseVariableWidthViewVector.PREFIX_WIDTH; + final int bufIndexWidth = BaseVariableWidthViewVector.BUF_INDEX_WIDTH; + + List leftDataBuffers = leftVector.getDataBuffers(); + List rightDataBuffers = rightVector.getDataBuffers(); + + for (int i = 0; i < range.getLength(); i++) { + int leftIndex = range.getLeftStart() + i; + int rightIndex = range.getRightStart() + i; + + boolean isNull = leftVector.isNull(leftIndex); + if (isNull != rightVector.isNull(rightIndex)) { + return false; + } + + if (isNull) { + continue; + } + + int startLeftByteOffset = leftIndex * elementSize; + + int startRightByteOffset = rightIndex * elementSize; + + int leftDataBufferValueLength = leftVector.getValueLength(leftIndex); + int rightDataBufferValueLength = rightVector.getValueLength(rightIndex); + + if (leftDataBufferValueLength != rightDataBufferValueLength) { + return false; + } + + if (leftDataBufferValueLength > BaseVariableWidthViewVector.INLINE_SIZE) { + // if the value is stored in the dataBuffers + int leftDataBufferIndex = leftViewBuffer.getInt(startLeftByteOffset + lengthWidth + prefixWidth); + int rightDataBufferIndex = rightViewBuffer.getInt(startRightByteOffset + lengthWidth + prefixWidth); + + final int leftDataOffset = + leftViewBuffer.getInt(startLeftByteOffset + lengthWidth + prefixWidth + bufIndexWidth); + final int rightDataOffset = + rightViewBuffer.getInt(startRightByteOffset + lengthWidth + prefixWidth + bufIndexWidth); + + ArrowBuf leftDataBuffer = leftDataBuffers.get(leftDataBufferIndex); + ArrowBuf rightDataBuffer = rightDataBuffers.get(rightDataBufferIndex); + + // check equality in the considered string stored in the dataBuffers + int retDataBuf = ByteFunctionHelpers.equal( + leftDataBuffer, leftDataOffset, leftDataOffset + leftDataBufferValueLength, + rightDataBuffer, rightDataOffset, rightDataOffset + rightDataBufferValueLength); + + if (retDataBuf == 0) 
{ + return false; + } + } else { + // if the value is stored in the view + final int leftDataOffset = startLeftByteOffset + lengthWidth; + final int rightDataOffset = startRightByteOffset + lengthWidth; + + // check equality in the considered string stored in the view + int retDataBuf = ByteFunctionHelpers.equal( + leftViewBuffer, leftDataOffset, leftDataOffset + leftDataBufferValueLength, + rightViewBuffer, rightDataOffset, rightDataOffset + rightDataBufferValueLength); + + if (retDataBuf == 0) { + return false; + } + } + + } + return true; + } + protected boolean compareListVectors(Range range) { ListVector leftVector = (ListVector) left; ListVector rightVector = (ListVector) right; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java index ab8c6c634891e..c3e7ef8bf8b08 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java @@ -18,8 +18,8 @@ package org.apache.arrow.vector.compare; import static org.apache.arrow.vector.testing.ValueVectorDataPopulator.setVector; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; import java.nio.charset.Charset; import java.util.Arrays; @@ -33,6 +33,7 @@ import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.LargeVarCharVector; import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.ViewVarCharVector; import org.apache.arrow.vector.ZeroVector; import org.apache.arrow.vector.compare.util.ValueEpsilonEqualizers; import org.apache.arrow.vector.complex.DenseUnionVector; @@ -53,16 +54,16 @@ import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; -import org.junit.After; -import org.junit.Before; -import org.junit.Ignore; -import org.junit.Test; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; public class TestRangeEqualsVisitor { private BufferAllocator allocator; - @Before + @BeforeEach public void init() { allocator = new RootAllocator(Long.MAX_VALUE); } @@ -71,8 +72,11 @@ public void init() { private static final byte[] STR1 = "AAAAA1".getBytes(utf8Charset); private static final byte[] STR2 = "BBBBBBBBB2".getBytes(utf8Charset); private static final byte[] STR3 = "CCCC3".getBytes(utf8Charset); + private static final byte[] STR4 = "12345678901234A".getBytes(utf8Charset); + private static final byte[] STR5 = "A2345678901234ABC".getBytes(utf8Charset); + private static final byte[] STR6 = "AB45678901234ABCD".getBytes(utf8Charset); - @After + @AfterEach public void terminate() throws Exception { allocator.close(); } @@ -132,6 +136,55 @@ public void testBaseVariableVectorRangeEquals() { } } + @Test + public void testBaseVariableViewVectorRangeEquals() { + try (final ViewVarCharVector vector1 = new ViewVarCharVector("varchar", allocator); + final ViewVarCharVector vector2 = new ViewVarCharVector("varchar", allocator)) { + + setVector(vector1, STR1, STR2, STR4, STR3, STR2, STR5, STR1, STR6, STR1, STR2, STR4); + setVector(vector2, STR1, STR2, STR4, STR3, STR2, STR5, STR1, STR6, STR1, STR2, STR4); + + 
RangeEqualsVisitor visitor = new RangeEqualsVisitor(vector1, vector2); + // inclusion of long string in the middle + assertTrue(visitor.rangeEquals(new Range(1, 1, 3))); + assertFalse(visitor.rangeEquals(new Range(0, 1, 4))); + // inclusion of long string at the start + assertTrue(visitor.rangeEquals(new Range(2, 2, 4))); + assertFalse(visitor.rangeEquals(new Range(2, 5, 4))); + // inclusion of long string at the end + assertTrue(visitor.rangeEquals(new Range(4, 4, 4))); + // unequal range + assertTrue(visitor.rangeEquals(new Range(8, 0, 3))); + assertFalse(visitor.rangeEquals(new Range(4, 5, 3))); + + // checking the same ranges when nulls are set + + vector1.setNull(1); + vector2.setNull(1); + + vector1.setNull(3); + vector2.setNull(3); + + vector1.setNull(5); + vector2.setNull(5); + + vector1.setNull(9); + vector2.setNull(9); + + // inclusion of long string in the middle + assertTrue(visitor.rangeEquals(new Range(1, 1, 3))); + assertFalse(visitor.rangeEquals(new Range(0, 1, 4))); + // inclusion of long string at the start + assertTrue(visitor.rangeEquals(new Range(2, 2, 4))); + assertFalse(visitor.rangeEquals(new Range(2, 5, 4))); + // inclusion of long string at the end + assertTrue(visitor.rangeEquals(new Range(4, 4, 4))); + // unequal range + assertTrue(visitor.rangeEquals(new Range(8, 0, 3))); + assertFalse(visitor.rangeEquals(new Range(4, 5, 3))); + } + } + @Test public void testListVectorWithDifferentChild() { try (final ListVector vector1 = ListVector.empty("list", allocator); @@ -476,7 +529,7 @@ public void testDenseUnionVectorEquals() { } } - @Ignore + @Disabled @Test public void testEqualsWithOutTypeCheck() { try (final IntVector intVector = new IntVector("int", allocator); From 0574988e328d483446b2b758bbc8c26bf1c82196 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Thu, 16 May 2024 14:18:31 +0530 Subject: [PATCH 103/105] GH-41287: [Java] ListViewVector Implementation (#41285) ### Rationale for this change Apache Arrow format defines ListView and this has been introduced into other language bindings and the objective of this PR is to provide the initial ListView support to Java by adding `ListViewVector`. ### Non-Goals The following list of issues propose the extended work depending on this PR. They were separated to streamline the implementation process. - [ ] https://github.com/apache/arrow/issues/41272 - [ ] https://github.com/apache/arrow/issues/41286 - [ ] https://github.com/apache/arrow/issues/41290 - [ ] https://github.com/apache/arrow/issues/41288 - [ ] https://github.com/apache/arrow/issues/41289 - [ ] https://github.com/apache/arrow/issues/41269 - [ ] https://github.com/apache/arrow/issues/41291 - [ ] https://github.com/apache/arrow/issues/41292 - [ ] https://github.com/apache/arrow/issues/41270 - [ ] https://github.com/apache/arrow/issues/41293 - [ ] https://github.com/apache/arrow/issues/41294 - [ ] https://github.com/apache/arrow/issues/41569 - [ ] https://github.com/apache/arrow/issues/41570 - [ ] https://github.com/apache/arrow/issues/41584 - [ ] https://github.com/apache/arrow/issues/41585 ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * GitHub Issue: #41287 Lead-authored-by: Vibhatha Abeykoon Co-authored-by: Vibhatha Lakmal Abeykoon Signed-off-by: David Li --- .../binder/ColumnBinderArrowTypeVisitor.java | 5 + .../arrow/c/BufferImportTypeVisitor.java | 6 + .../jdbc/utils/AvaticaParameterBinder.java | 5 + .../arrow/driver/jdbc/utils/ConvertUtils.java | 5 + .../src/main/codegen/data/ArrowTypes.tdd | 5 + .../codegen/templates/UnionListWriter.java | 24 +- .../main/codegen/templates/UnionReader.java | 2 +- .../org/apache/arrow/vector/BufferLayout.java | 13 +- .../org/apache/arrow/vector/TypeLayout.java | 20 +- .../complex/BaseRepeatedValueViewVector.java | 405 ++++ .../arrow/vector/complex/ListViewVector.java | 872 +++++++++ .../vector/complex/impl/PromotableWriter.java | 30 + .../org/apache/arrow/vector/types/Types.java | 21 + .../arrow/vector/TestListViewVector.java | 1651 +++++++++++++++++ 14 files changed, 3059 insertions(+), 5 deletions(-) create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueViewVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/TestListViewVector.java diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/binder/ColumnBinderArrowTypeVisitor.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/binder/ColumnBinderArrowTypeVisitor.java index 7d50676688e0f..7420a8c23dd48 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/binder/ColumnBinderArrowTypeVisitor.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/binder/ColumnBinderArrowTypeVisitor.java @@ -256,4 +256,9 @@ public ColumnBinder visit(ArrowType.Interval type) { public ColumnBinder visit(ArrowType.Duration type) { throw new UnsupportedOperationException("No column binder implemented for type " + type); } + + @Override + public ColumnBinder visit(ArrowType.ListView type) { + throw new UnsupportedOperationException("No column binder implemented for type " + type); + } } diff --git a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java index bc6139cc84c54..99873dadad242 100644 --- a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java +++ b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java @@ -53,6 +53,7 @@ import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.ipc.message.ArrowFieldNode; import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.ArrowType.ListView; import org.apache.arrow.vector.util.DataSizeRoundingUtil; /** @@ -328,4 +329,9 @@ public List visit(ArrowType.Interval type) { public List visit(ArrowType.Duration type) { return Arrays.asList(maybeImportBitmap(type), importFixedBytes(type, 1, DurationVector.TYPE_WIDTH)); } + + @Override + public List visit(ListView type) { + throw new UnsupportedOperationException("Importing buffers for view type: " + type + " not supported"); + } } diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java index fd9127c226910..70a58ff440ed4 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java +++ 
b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java @@ -254,6 +254,11 @@ public Boolean visit(ArrowType.Interval type) { public Boolean visit(ArrowType.Duration type) { return new DurationAvaticaParameterConverter(type).bindParameter(vector, typedValue, index); } + + @Override + public Boolean visit(ArrowType.ListView type) { + throw new UnsupportedOperationException("Binding is not yet supported for type " + type); + } } } diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/ConvertUtils.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/ConvertUtils.java index 93b5faaef32c7..6ec33fafcfa46 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/ConvertUtils.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/ConvertUtils.java @@ -274,6 +274,11 @@ public AvaticaParameter visit(ArrowType.Interval type) { public AvaticaParameter visit(ArrowType.Duration type) { return new DurationAvaticaParameterConverter(type).createParameter(field); } + + @Override + public AvaticaParameter visit(ArrowType.ListView type) { + throw new UnsupportedOperationException("AvaticaParameter not yet supported for type " + type); + } } } diff --git a/java/vector/src/main/codegen/data/ArrowTypes.tdd b/java/vector/src/main/codegen/data/ArrowTypes.tdd index 9fe40f2319bfd..72df4779793f0 100644 --- a/java/vector/src/main/codegen/data/ArrowTypes.tdd +++ b/java/vector/src/main/codegen/data/ArrowTypes.tdd @@ -129,6 +129,11 @@ name: "Duration", fields: [{name: "unit", type: short, valueType: TimeUnit}], complex: false + }, + { + name: "ListView", + fields: [], + complex: true } ] } diff --git a/java/vector/src/main/codegen/templates/UnionListWriter.java b/java/vector/src/main/codegen/templates/UnionListWriter.java index 5c0565ee27175..eeb964c055f71 100644 --- a/java/vector/src/main/codegen/templates/UnionListWriter.java +++ b/java/vector/src/main/codegen/templates/UnionListWriter.java @@ -26,7 +26,7 @@ import java.math.BigDecimal; <@pp.dropOutputFile /> -<#list ["List", "LargeList"] as listName> +<#list ["List", "ListView", "LargeList"] as listName> <@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/Union${listName}Writer.java" /> @@ -59,6 +59,10 @@ public class Union${listName}Writer extends AbstractFieldWriter { private static final int OFFSET_WIDTH = 4; + <#if listName = "ListView"> + private static final long SIZE_WIDTH = 4; + + public Union${listName}Writer(${listName}Vector vector) { this(vector, NullableStructWriterFactory.getNullableStructWriterFactoryInstance()); } @@ -193,6 +197,24 @@ public void endList() { setPosition(idx() + 1); listStarted = false; } + <#elseif listName == "ListView"> + @Override + public void startList() { + vector.startNewValue(idx()); + writer.setPosition(vector.getOffsetBuffer().getInt((idx()) * OFFSET_WIDTH)); + listStarted = true; + } + + @Override + public void endList() { + int sizeUptoIdx = 0; + for (int i = 0; i < idx(); i++) { + sizeUptoIdx += vector.getSizeBuffer().getInt(i * SIZE_WIDTH); + } + vector.getSizeBuffer().setInt(idx() * SIZE_WIDTH, writer.idx() - sizeUptoIdx); + setPosition(idx() + 1); + listStarted = false; + } <#else> @Override public void startList() { diff --git a/java/vector/src/main/codegen/templates/UnionReader.java b/java/vector/src/main/codegen/templates/UnionReader.java index 956bc91e9185c..243bd832255c2 100644 --- 
a/java/vector/src/main/codegen/templates/UnionReader.java +++ b/java/vector/src/main/codegen/templates/UnionReader.java @@ -39,7 +39,7 @@ @SuppressWarnings("unused") public class UnionReader extends AbstractFieldReader { - private static final int NUM_SUPPORTED_TYPES = 48; + private static final int NUM_SUPPORTED_TYPES = 49; private BaseReader[] readers = new BaseReader[NUM_SUPPORTED_TYPES]; public UnionVector data; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java index 9725693348a48..4eeb92a0c9199 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java @@ -28,12 +28,18 @@ public class BufferLayout { /** * Enumeration of the different logical types a buffer can have. + * Data buffer is common to most of the layouts. + * Offset buffer is used for variable width types. + * Validity buffer is used for nullable types. + * Type buffer is used for Union types. + * Size buffer is used for ListView and LargeListView types. */ public enum BufferType { DATA("DATA"), OFFSET("OFFSET"), VALIDITY("VALIDITY"), - TYPE("TYPE_ID"); + TYPE("TYPE_ID"), + SIZE("SIZE"); private final String name; @@ -57,6 +63,7 @@ public String getName() { private static final BufferLayout VALUES_32 = new BufferLayout(BufferType.DATA, 32); private static final BufferLayout VALUES_16 = new BufferLayout(BufferType.DATA, 16); private static final BufferLayout VALUES_8 = new BufferLayout(BufferType.DATA, 8); + private static final BufferLayout SIZE_BUFFER = new BufferLayout(BufferType.SIZE, 32); public static BufferLayout typeBuffer() { return TYPE_BUFFER; @@ -70,6 +77,10 @@ public static BufferLayout largeOffsetBuffer() { return LARGE_OFFSET_BUFFER; } + public static BufferLayout sizeBuffer() { + return SIZE_BUFFER; + } + /** * Returns a databuffer for the given bitwidth. Only supports powers of two between 8 and 128 * inclusive. 
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java index 18032528c86d8..ea92efdc55f61 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java @@ -101,7 +101,7 @@ public TypeLayout visit(Timestamp type) { } @Override - public TypeLayout visit(org.apache.arrow.vector.types.pojo.ArrowType.List type) { + public TypeLayout visit(ArrowType.List type) { List vectors = asList( BufferLayout.validityVector(), BufferLayout.offsetBuffer() @@ -109,6 +109,16 @@ public TypeLayout visit(org.apache.arrow.vector.types.pojo.ArrowType.List type) return new TypeLayout(vectors); } + @Override + public TypeLayout visit(ArrowType.ListView type) { + List vectors = asList( + BufferLayout.validityVector(), + BufferLayout.offsetBuffer(), + BufferLayout.sizeBuffer() + ); + return new TypeLayout(vectors); + } + @Override public TypeLayout visit(ArrowType.LargeList type) { List vectors = asList( @@ -312,11 +322,17 @@ public Integer visit(Timestamp type) { } @Override - public Integer visit(org.apache.arrow.vector.types.pojo.ArrowType.List type) { + public Integer visit(ArrowType.List type) { // validity buffer + offset buffer return 2; } + @Override + public Integer visit(ArrowType.ListView type) { + // validity buffer + offset buffer + size buffer + return 3; + } + @Override public Integer visit(ArrowType.LargeList type) { // validity buffer + offset buffer diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueViewVector.java new file mode 100644 index 0000000000000..73a25738854f3 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueViewVector.java @@ -0,0 +1,405 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.complex; + +import static org.apache.arrow.memory.util.LargeMemoryUtil.capAtMaxInt; + +import java.util.Collections; +import java.util.Iterator; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.AddOrGetResult; +import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseValueVector; +import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.DensityAwareVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.ZeroVector; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.SchemaChangeRuntimeException; + +public abstract class BaseRepeatedValueViewVector extends BaseValueVector + implements RepeatedValueVector, BaseListVector { + + public static final FieldVector DEFAULT_DATA_VECTOR = ZeroVector.INSTANCE; + public static final String DATA_VECTOR_NAME = "$data$"; + + public static final byte OFFSET_WIDTH = 4; + public static final byte SIZE_WIDTH = 4; + protected ArrowBuf offsetBuffer; + protected ArrowBuf sizeBuffer; + protected FieldVector vector; + protected final CallBack repeatedCallBack; + protected int valueCount; + protected long offsetAllocationSizeInBytes = INITIAL_VALUE_ALLOCATION * OFFSET_WIDTH; + protected long sizeAllocationSizeInBytes = INITIAL_VALUE_ALLOCATION * SIZE_WIDTH; + private final String name; + + protected String defaultDataVectorName = DATA_VECTOR_NAME; + + protected BaseRepeatedValueViewVector(String name, BufferAllocator allocator, CallBack callBack) { + this(name, allocator, DEFAULT_DATA_VECTOR, callBack); + } + + protected BaseRepeatedValueViewVector( + String name, BufferAllocator allocator, FieldVector vector, CallBack callBack) { + super(allocator); + this.name = name; + this.offsetBuffer = allocator.getEmpty(); + this.sizeBuffer = allocator.getEmpty(); + this.vector = Preconditions.checkNotNull(vector, "data vector cannot be null"); + this.repeatedCallBack = callBack; + this.valueCount = 0; + } + + @Override + public String getName() { + return name; + } + + @Override + public boolean allocateNewSafe() { + boolean dataAlloc = false; + try { + allocateBuffers(); + dataAlloc = vector.allocateNewSafe(); + } catch (Exception e) { + clear(); + return false; + } finally { + if (!dataAlloc) { + clear(); + } + } + return dataAlloc; + } + + private void allocateBuffers() { + offsetBuffer = allocateBuffers(offsetAllocationSizeInBytes); + sizeBuffer = allocateBuffers(sizeAllocationSizeInBytes); + } + + private ArrowBuf allocateBuffers(final long size) { + final int curSize = (int) size; + ArrowBuf buffer = allocator.buffer(curSize); + buffer.readerIndex(0); + buffer.setZero(0, buffer.capacity()); + return buffer; + } + + @Override + public void reAlloc() { + reallocateBuffers(); + vector.reAlloc(); + } + + protected void reallocateBuffers() { + reallocOffsetBuffer(); + reallocSizeBuffer(); + } + + private void reallocOffsetBuffer() { + final long currentBufferCapacity = offsetBuffer.capacity(); + long newAllocationSize = currentBufferCapacity 
* 2; + if (newAllocationSize == 0) { + if (offsetAllocationSizeInBytes > 0) { + newAllocationSize = offsetAllocationSizeInBytes; + } else { + newAllocationSize = INITIAL_VALUE_ALLOCATION * OFFSET_WIDTH * 2; + } + } + + newAllocationSize = CommonUtil.nextPowerOfTwo(newAllocationSize); + newAllocationSize = Math.min(newAllocationSize, (long) OFFSET_WIDTH * Integer.MAX_VALUE); + assert newAllocationSize >= 1; + + if (newAllocationSize > MAX_ALLOCATION_SIZE || newAllocationSize <= offsetBuffer.capacity()) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + + final ArrowBuf newBuf = allocator.buffer(newAllocationSize); + newBuf.setBytes(0, offsetBuffer, 0, currentBufferCapacity); + newBuf.setZero(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); + offsetBuffer.getReferenceManager().release(1); + offsetBuffer = newBuf; + offsetAllocationSizeInBytes = newAllocationSize; + } + + private void reallocSizeBuffer() { + final long currentBufferCapacity = sizeBuffer.capacity(); + long newAllocationSize = currentBufferCapacity * 2; + if (newAllocationSize == 0) { + if (sizeAllocationSizeInBytes > 0) { + newAllocationSize = sizeAllocationSizeInBytes; + } else { + newAllocationSize = INITIAL_VALUE_ALLOCATION * SIZE_WIDTH * 2; + } + } + + newAllocationSize = CommonUtil.nextPowerOfTwo(newAllocationSize); + newAllocationSize = Math.min(newAllocationSize, (long) SIZE_WIDTH * Integer.MAX_VALUE); + assert newAllocationSize >= 1; + + if (newAllocationSize > MAX_ALLOCATION_SIZE || newAllocationSize <= sizeBuffer.capacity()) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + + final ArrowBuf newBuf = allocator.buffer(newAllocationSize); + newBuf.setBytes(0, sizeBuffer, 0, currentBufferCapacity); + newBuf.setZero(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); + sizeBuffer.getReferenceManager().release(1); + sizeBuffer = newBuf; + sizeAllocationSizeInBytes = newAllocationSize; + } + + @Override + public FieldVector getDataVector() { + return vector; + } + + @Override + public void setInitialCapacity(int numRecords) { + offsetAllocationSizeInBytes = (numRecords) * OFFSET_WIDTH; + sizeAllocationSizeInBytes = (numRecords) * SIZE_WIDTH; + if (vector instanceof BaseFixedWidthVector || vector instanceof BaseVariableWidthVector) { + vector.setInitialCapacity(numRecords * RepeatedValueVector.DEFAULT_REPEAT_PER_RECORD); + } else { + vector.setInitialCapacity(numRecords); + } + } + + @Override + public void setInitialCapacity(int numRecords, double density) { + if ((numRecords * density) >= Integer.MAX_VALUE) { + throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); + } + + offsetAllocationSizeInBytes = numRecords * OFFSET_WIDTH; + sizeAllocationSizeInBytes = numRecords * SIZE_WIDTH; + + int innerValueCapacity = Math.max((int) (numRecords * density), 1); + + if (vector instanceof DensityAwareVector) { + ((DensityAwareVector) vector).setInitialCapacity(innerValueCapacity, density); + } else { + vector.setInitialCapacity(innerValueCapacity); + } + } + + /** + * Specialized version of setInitialTotalCapacity() for ListViewVector. + * This is used by some callers when they want to explicitly control and be + * conservative about memory allocated for inner data vector. + * This is very useful when we are working with memory constraints for a query + * and have a fixed amount of memory reserved for the record batch. 
+ * In such cases, we are likely to face OOM or related problems when + * we reserve memory for a record batch with value count x and + * do setInitialCapacity(x) such that each vector allocates only + * what is necessary and not the default amount, but the multiplier + * forces the memory requirement to go beyond what was needed. + * + * @param numRecords value count + * @param totalNumberOfElements the total number of elements to allow + * for in this vector across all records. + */ + public void setInitialTotalCapacity(int numRecords, int totalNumberOfElements) { + offsetAllocationSizeInBytes = numRecords * OFFSET_WIDTH; + sizeAllocationSizeInBytes = numRecords * SIZE_WIDTH; + vector.setInitialCapacity(totalNumberOfElements); + } + + @Override + public int getValueCapacity() { + throw new UnsupportedOperationException( + "Get value capacity is not supported in RepeatedValueVector"); + } + + protected int getOffsetBufferValueCapacity() { + return capAtMaxInt(offsetBuffer.capacity() / OFFSET_WIDTH); + } + + protected int getSizeBufferValueCapacity() { + return capAtMaxInt(sizeBuffer.capacity() / SIZE_WIDTH); + } + + @Override + public int getBufferSize() { + if (valueCount == 0) { + return 0; + } + return (valueCount * OFFSET_WIDTH) + (valueCount * SIZE_WIDTH) + vector.getBufferSize(); + } + + @Override + public int getBufferSizeFor(int valueCount) { + if (valueCount == 0) { + return 0; + } + + int innerVectorValueCount = 0; + + for (int i = 0; i < valueCount; i++) { + innerVectorValueCount += sizeBuffer.getInt(i * SIZE_WIDTH); + } + + return (valueCount * OFFSET_WIDTH) + (valueCount * SIZE_WIDTH) + + vector.getBufferSizeFor(innerVectorValueCount); + } + + @Override + public Iterator iterator() { + return Collections.singleton(getDataVector()).iterator(); + } + + @Override + public void clear() { + offsetBuffer = releaseBuffer(offsetBuffer); + sizeBuffer = releaseBuffer(sizeBuffer); + vector.clear(); + valueCount = 0; + super.clear(); + } + + @Override + public void reset() { + offsetBuffer.setZero(0, offsetBuffer.capacity()); + sizeBuffer.setZero(0, sizeBuffer.capacity()); + vector.reset(); + valueCount = 0; + } + + @Override + public ArrowBuf[] getBuffers(boolean clear) { + return new ArrowBuf[0]; + } + + @Override + public int getValueCount() { + return valueCount; + } + + @Override + public void setValueCount(int valueCount) { + this.valueCount = valueCount; + while (valueCount > getOffsetBufferValueCapacity()) { + reallocateBuffers(); + } + final int childValueCount = valueCount == 0 ? 
0 : getLengthOfChildVector(); + vector.setValueCount(childValueCount); + } + + protected int getLengthOfChildVector() { + int maxOffsetSizeSum = offsetBuffer.getInt(0) + sizeBuffer.getInt(0); + int minOffset = offsetBuffer.getInt(0); + for (int i = 0; i < valueCount; i++) { + int currentOffset = offsetBuffer.getInt(i * OFFSET_WIDTH); + int currentSize = sizeBuffer.getInt(i * SIZE_WIDTH); + int currentSum = currentOffset + currentSize; + + maxOffsetSizeSum = Math.max(maxOffsetSizeSum, currentSum); + minOffset = Math.min(minOffset, currentOffset); + } + + return maxOffsetSizeSum - minOffset; + } + + protected int getLengthOfChildVectorByIndex(int index) { + int maxOffsetSizeSum = offsetBuffer.getInt(0) + sizeBuffer.getInt(0); + int minOffset = offsetBuffer.getInt(0); + for (int i = 0; i < index; i++) { + int currentOffset = offsetBuffer.getInt(i * OFFSET_WIDTH); + int currentSize = sizeBuffer.getInt(i * SIZE_WIDTH); + int currentSum = currentOffset + currentSize; + + maxOffsetSizeSum = Math.max(maxOffsetSizeSum, currentSum); + minOffset = Math.min(minOffset, currentOffset); + } + + return maxOffsetSizeSum - minOffset; + } + + /** + * Initialize the data vector (and execute callback) if it hasn't already been done, + * returns the data vector. + */ + public AddOrGetResult addOrGetVector(FieldType fieldType) { + boolean created = false; + if (vector instanceof NullVector) { + vector = fieldType.createNewSingleVector(defaultDataVectorName, allocator, repeatedCallBack); + // returned vector must have the same field + created = true; + if (repeatedCallBack != null && + // not a schema change if changing from ZeroVector to ZeroVector + (fieldType.getType().getTypeID() != ArrowType.ArrowTypeID.Null)) { + repeatedCallBack.doWork(); + } + } + + if (vector.getField().getType().getTypeID() != fieldType.getType().getTypeID()) { + final String msg = String.format("Inner vector type mismatch. Requested type: [%s], actual type: [%s]", + fieldType.getType().getTypeID(), vector.getField().getType().getTypeID()); + throw new SchemaChangeRuntimeException(msg); + } + + return new AddOrGetResult<>((T) vector, created); + } + + protected void replaceDataVector(FieldVector v) { + vector.clear(); + vector = v; + } + + public abstract boolean isEmpty(int index); + + /** + * Start a new value at the given index. + * @param index the index to start the new value at + * @return the offset in the data vector where the new value starts + */ + public int startNewValue(int index) { + while (index >= getOffsetBufferValueCapacity()) { + reallocOffsetBuffer(); + } + while (index >= getSizeBufferValueCapacity()) { + reallocSizeBuffer(); + } + + if (index > 0) { + final int prevOffset = getLengthOfChildVectorByIndex(index); + offsetBuffer.setInt(index * OFFSET_WIDTH, prevOffset); + } + + setValueCount(index + 1); + return offsetBuffer.getInt(index * OFFSET_WIDTH); + } + + @Override + @Deprecated + public UInt4Vector getOffsetVector() { + throw new UnsupportedOperationException("There is no inner offset vector"); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java new file mode 100644 index 0000000000000..b19691e7aaab7 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java @@ -0,0 +1,872 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.complex; + +import static java.util.Collections.singletonList; +import static org.apache.arrow.memory.util.LargeMemoryUtil.capAtMaxInt; +import static org.apache.arrow.memory.util.LargeMemoryUtil.checkedCastToInt; +import static org.apache.arrow.util.Preconditions.checkArgument; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.memory.util.ArrowBufPointer; +import org.apache.arrow.memory.util.ByteFunctionHelpers; +import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.vector.AddOrGetResult; +import org.apache.arrow.vector.BitVectorHelper; +import org.apache.arrow.vector.BufferBacked; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.impl.UnionListReader; +import org.apache.arrow.vector.complex.impl.UnionListViewWriter; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.JsonStringArrayList; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.TransferPair; + +/** + * A list view vector contains lists of a specific type of elements. + * Its structure contains four elements. + *

+ * <ol>
+ *   <li>A validity buffer.</li>
+ *   <li>An offset buffer, that denotes where each list starts.</li>
+ *   <li>A size buffer, that denotes where each list ends.</li>
+ *   <li>A child data vector that contains the elements of the lists.</li>
+ * </ol>
+ * The latter three are managed by its superclass. + */ + +/* +* TODO: consider merging the functionality in `BaseRepeatedValueVector` into this class. +*/ +public class ListViewVector extends BaseRepeatedValueViewVector implements PromotableVector { + + protected ArrowBuf validityBuffer; + protected UnionListReader reader; + private CallBack callBack; + protected Field field; + protected int validityAllocationSizeInBytes; + + public static ListViewVector empty(String name, BufferAllocator allocator) { + return new ListViewVector(name, allocator, FieldType.nullable(ArrowType.ListView.INSTANCE), null); + } + + /** + * Constructs a new instance. + * + * @param name The name of the instance. + * @param allocator The allocator to use for allocating/reallocating buffers. + * @param fieldType The type of this list. + * @param callBack A schema change callback. + */ + public ListViewVector(String name, BufferAllocator allocator, FieldType fieldType, CallBack callBack) { + this(new Field(name, fieldType, null), allocator, callBack); + } + + /** + * Constructs a new instance. + * + * @param field The field materialized by this vector. + * @param allocator The allocator to use for allocating/reallocating buffers. + * @param callBack A schema change callback. + */ + public ListViewVector(Field field, BufferAllocator allocator, CallBack callBack) { + super(field.getName(), allocator, callBack); + this.validityBuffer = allocator.getEmpty(); + this.field = field; + this.callBack = callBack; + this.validityAllocationSizeInBytes = getValidityBufferSizeFromCount(INITIAL_VALUE_ALLOCATION); + } + + @Override + public void initializeChildrenFromFields(List children) { + checkArgument(children.size() == 1, + "ListViews have one child Field. Found: %s", children.isEmpty() ? "none" : children); + + Field field = children.get(0); + AddOrGetResult addOrGetVector = addOrGetVector(field.getFieldType()); + checkArgument(addOrGetVector.isCreated(), "Child vector already existed: %s", addOrGetVector.getVector()); + + addOrGetVector.getVector().initializeChildrenFromFields(field.getChildren()); + this.field = new Field(this.field.getName(), this.field.getFieldType(), children); + } + + @Override + public void setInitialCapacity(int numRecords) { + validityAllocationSizeInBytes = getValidityBufferSizeFromCount(numRecords); + super.setInitialCapacity(numRecords); + } + + /** + * Specialized version of setInitialCapacity() for ListViewVector. + * This is used by some callers when they want to explicitly control and be + * conservative about memory allocated for inner data vector. + * This is very useful when we are working with memory constraints for a query + * and have a fixed amount of memory reserved for the record batch. + * In such cases, we are likely to face OOM or related problems when + * we reserve memory for a record batch with value count x and + * do setInitialCapacity(x) such that each vector allocates only + * what is necessary and not the default amount, but the multiplier + * forces the memory requirement to go beyond what was needed. + * + * @param numRecords value count + * @param density density of ListViewVector. + * Density is the average size of a list per position in the ListViewVector. + * For example, a + * density value of 10 implies each position in the list + * vector has a list of 10 values. + * A density value of 0.1 implies out of 10 positions in + * the list vector, 1 position has a list of size 1, and + * the remaining positions are null (no lists) or empty lists. 
+ * This helps in tightly controlling the memory we provision + * for inner data vector. + */ + @Override + public void setInitialCapacity(int numRecords, double density) { + validityAllocationSizeInBytes = getValidityBufferSizeFromCount(numRecords); + super.setInitialCapacity(numRecords, density); + } + + /** + * Specialized version of setInitialTotalCapacity() for ListViewVector. + * This is used by some callers when they want to explicitly control and be + * conservative about memory allocated for inner data vector. + * This is very useful when we are working with memory constraints for a query + * and have a fixed amount of memory reserved for the record batch. + * In such cases, we are likely to face OOM or related problems when + * we reserve memory for a record batch with value count x and + * do setInitialCapacity(x) such that each vector allocates only + * what is necessary and not the default amount, but the multiplier + * forces the memory requirement to go beyond what was needed. + * + * @param numRecords value count + * @param totalNumberOfElements the total number of elements to allow + * for in this vector across all records. + */ + @Override + public void setInitialTotalCapacity(int numRecords, int totalNumberOfElements) { + validityAllocationSizeInBytes = getValidityBufferSizeFromCount(numRecords); + super.setInitialTotalCapacity(numRecords, totalNumberOfElements); + } + + @Override + public List getChildrenFromFields() { + return singletonList(getDataVector()); + } + + /** + * Load the buffers associated with this Field. + * @param fieldNode the fieldNode + * @param ownBuffers the buffers for this Field (own buffers only, children not included) + */ + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + if (ownBuffers.size() != 3) { + throw new IllegalArgumentException("Illegal buffer count, expected " + + 3 + ", got: " + ownBuffers.size()); + } + + ArrowBuf bitBuffer = ownBuffers.get(0); + ArrowBuf offBuffer = ownBuffers.get(1); + ArrowBuf szBuffer = ownBuffers.get(2); + + validityBuffer.getReferenceManager().release(); + validityBuffer = BitVectorHelper.loadValidityBuffer(fieldNode, bitBuffer, allocator); + offsetBuffer.getReferenceManager().release(); + offsetBuffer = offBuffer.getReferenceManager().retain(offBuffer, allocator); + sizeBuffer.getReferenceManager().release(); + sizeBuffer = szBuffer.getReferenceManager().retain(szBuffer, allocator); + + validityAllocationSizeInBytes = checkedCastToInt(validityBuffer.capacity()); + offsetAllocationSizeInBytes = offsetBuffer.capacity(); + sizeAllocationSizeInBytes = sizeBuffer.capacity(); + + valueCount = fieldNode.getLength(); + } + + /** + * Set the reader and writer indexes for the inner buffers. + */ + private void setReaderAndWriterIndex() { + validityBuffer.readerIndex(0); + offsetBuffer.readerIndex(0); + sizeBuffer.readerIndex(0); + if (valueCount == 0) { + validityBuffer.writerIndex(0); + offsetBuffer.writerIndex(0); + sizeBuffer.writerIndex(0); + } else { + validityBuffer.writerIndex(getValidityBufferSizeFromCount(valueCount)); + offsetBuffer.writerIndex(valueCount * OFFSET_WIDTH); + sizeBuffer.writerIndex(valueCount * SIZE_WIDTH); + } + } + + @Override + public List getFieldBuffers() { + List result = new ArrayList<>(2); + setReaderAndWriterIndex(); + result.add(validityBuffer); + result.add(offsetBuffer); + result.add(sizeBuffer); + + return result; + } + + /** + * Export the buffers of the fields for C Data Interface. 
+ * This method traverses the buffers and export buffer and buffer's memory address into a list of + * buffers and a pointer to the list of buffers. + */ + @Override + public void exportCDataBuffers(List buffers, ArrowBuf buffersPtr, long nullValue) { + throw new UnsupportedOperationException("exportCDataBuffers Not implemented yet"); + } + + @Override + public void allocateNew() throws OutOfMemoryException { + if (!allocateNewSafe()) { + throw new OutOfMemoryException("Failure while allocating memory"); + } + } + + @Override + public boolean allocateNewSafe() { + boolean success = false; + try { + /* release the current buffers, hence this is a new allocation + * Note that, the `clear` method call below is releasing validityBuffer + * calling the superclass clear method which is releasing the associated buffers + * (sizeBuffer and offsetBuffer). + */ + clear(); + /* allocate validity buffer */ + allocateValidityBuffer(validityAllocationSizeInBytes); + /* allocate offset, data and sizes buffer */ + success = super.allocateNewSafe(); + } finally { + if (!success) { + clear(); + } + } + return success; + } + + protected void allocateValidityBuffer(final long size) { + final int curSize = (int) size; + validityBuffer = allocator.buffer(curSize); + validityBuffer.readerIndex(0); + validityAllocationSizeInBytes = curSize; + validityBuffer.setZero(0, validityBuffer.capacity()); + } + + @Override + public void reAlloc() { + /* reallocate the validity buffer */ + reallocValidityBuffer(); + /* reallocate the offset, size, and data */ + super.reAlloc(); + } + + protected void reallocValidityAndSizeAndOffsetBuffers() { + reallocateBuffers(); + reallocValidityBuffer(); + } + + private void reallocValidityBuffer() { + final int currentBufferCapacity = checkedCastToInt(validityBuffer.capacity()); + long newAllocationSize = getNewAllocationSize(currentBufferCapacity); + + final ArrowBuf newBuf = allocator.buffer(newAllocationSize); + newBuf.setBytes(0, validityBuffer, 0, currentBufferCapacity); + newBuf.setZero(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); + validityBuffer.getReferenceManager().release(1); + validityBuffer = newBuf; + validityAllocationSizeInBytes = (int) newAllocationSize; + } + + private long getNewAllocationSize(int currentBufferCapacity) { + long newAllocationSize = currentBufferCapacity * 2L; + if (newAllocationSize == 0) { + if (validityAllocationSizeInBytes > 0) { + newAllocationSize = validityAllocationSizeInBytes; + } else { + newAllocationSize = getValidityBufferSizeFromCount(INITIAL_VALUE_ALLOCATION) * 2L; + } + } + newAllocationSize = CommonUtil.nextPowerOfTwo(newAllocationSize); + assert newAllocationSize >= 1; + + if (newAllocationSize > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + return newAllocationSize; + } + + @Override + public void copyFromSafe(int inIndex, int outIndex, ValueVector from) { + // TODO: https://github.com/apache/arrow/issues/41270 + throw new UnsupportedOperationException( + "ListViewVector does not support copyFromSafe operation yet."); + } + + @Override + public void copyFrom(int inIndex, int outIndex, ValueVector from) { + // TODO: https://github.com/apache/arrow/issues/41270 + throw new UnsupportedOperationException( + "ListViewVector does not support copyFrom operation yet."); + } + + @Override + public FieldVector getDataVector() { + return vector; + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return 
getTransferPair(ref, allocator, null); + } + + @Override + public TransferPair getTransferPair(Field field, BufferAllocator allocator) { + return getTransferPair(field, allocator, null); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { + // TODO: https://github.com/apache/arrow/issues/41269 + throw new UnsupportedOperationException( + "ListVector does not support getTransferPair(String, BufferAllocator, CallBack) yet"); + } + + @Override + public TransferPair getTransferPair(Field field, BufferAllocator allocator, CallBack callBack) { + // TODO: https://github.com/apache/arrow/issues/41269 + throw new UnsupportedOperationException( + "ListVector does not support getTransferPair(Field, BufferAllocator, CallBack) yet"); + } + + @Override + public TransferPair makeTransferPair(ValueVector target) { + // TODO: https://github.com/apache/arrow/issues/41269 + throw new UnsupportedOperationException( + "ListVector does not support makeTransferPair(ValueVector) yet"); + } + + @Override + public long getValidityBufferAddress() { + return validityBuffer.memoryAddress(); + } + + @Override + public long getDataBufferAddress() { + throw new UnsupportedOperationException(); + } + + @Override + public long getOffsetBufferAddress() { + return offsetBuffer.memoryAddress(); + } + + @Override + public ArrowBuf getValidityBuffer() { + return validityBuffer; + } + + @Override + public ArrowBuf getDataBuffer() { + throw new UnsupportedOperationException(); + } + + @Override + public ArrowBuf getOffsetBuffer() { + return offsetBuffer; + } + + public ArrowBuf getSizeBuffer() { + return sizeBuffer; + } + + public long getSizeBufferAddress() { + return sizeBuffer.memoryAddress(); + } + + /** + * Get the hash code for the element at the given index. + * @param index position of the element + * @return hash code for the element at the given index + */ + @Override + public int hashCode(int index) { + return hashCode(index, null); + } + + /** + * Get the hash code for the element at the given index. + * @param index position of the element + * @param hasher hasher to use + * @return hash code for the element at the given index + */ + @Override + public int hashCode(int index, ArrowBufHasher hasher) { + if (isSet(index) == 0) { + return ArrowBufPointer.NULL_HASH_CODE; + } + int hash = 0; + final int start = offsetBuffer.getInt(index * OFFSET_WIDTH); + final int end = sizeBuffer.getInt(index * OFFSET_WIDTH); + for (int i = start; i < end; i++) { + hash = ByteFunctionHelpers.combineHash(hash, vector.hashCode(i, hasher)); + } + return hash; + } + + @Override + public OUT accept(VectorVisitor visitor, IN value) { + throw new UnsupportedOperationException(); + } + + @Override + protected FieldReader getReaderImpl() { + // TODO: https://github.com/apache/arrow/issues/41569 + throw new UnsupportedOperationException( + "ListViewVector does not support getReaderImpl operation yet."); + } + + @Override + public UnionListReader getReader() { + // TODO: https://github.com/apache/arrow/issues/41569 + throw new UnsupportedOperationException( + "ListViewVector does not support getReader operation yet."); + } + + /** + * Get the size (number of bytes) of underlying buffers used by this + * vector. + * @return size of underlying buffers. 
+ */ + @Override + public int getBufferSize() { + if (valueCount == 0) { + return 0; + } + final int offsetBufferSize = valueCount * OFFSET_WIDTH; + final int sizeBufferSize = valueCount * SIZE_WIDTH; + final int validityBufferSize = getValidityBufferSizeFromCount(valueCount); + return offsetBufferSize + sizeBufferSize + validityBufferSize + vector.getBufferSize(); + } + + /** + * Get the size (number of bytes) of underlying buffers used by this. + * @param valueCount the number of values to assume this vector contains + * @return size of underlying buffers. + */ + @Override + public int getBufferSizeFor(int valueCount) { + if (valueCount == 0) { + return 0; + } + final int validityBufferSize = getValidityBufferSizeFromCount(valueCount); + + return super.getBufferSizeFor(valueCount) + validityBufferSize; + } + + /** + * Get the field associated with the list view vector. + * @return the field + */ + @Override + public Field getField() { + if (field.getChildren().contains(getDataVector().getField())) { + return field; + } + field = new Field(field.getName(), field.getFieldType(), Collections.singletonList(getDataVector().getField())); + return field; + } + + /** + * Get the minor type for the vector. + * @return the minor type + */ + @Override + public MinorType getMinorType() { + return MinorType.LISTVIEW; + } + + /** + * Clear the vector data. + */ + @Override + public void clear() { + // calling superclass clear method which is releasing the sizeBufer and offsetBuffer + super.clear(); + validityBuffer = releaseBuffer(validityBuffer); + } + + /** + * Release the buffers associated with this vector. + */ + @Override + public void reset() { + super.reset(); + validityBuffer.setZero(0, validityBuffer.capacity()); + } + + /** + * Return the underlying buffers associated with this vector. Note that this doesn't + * impact the reference counts for this buffer, so it only should be used for in-context + * access. Also note that this buffer changes regularly, thus + * external classes shouldn't hold a reference to it (unless they change it). + * + * @param clear Whether to clear vector before returning, the buffers will still be refcounted + * but the returned array will be the only reference to them + * @return The underlying {@link ArrowBuf buffers} that is used by this + * vector instance. + */ + @Override + public ArrowBuf[] getBuffers(boolean clear) { + setReaderAndWriterIndex(); + final ArrowBuf[] buffers; + if (getBufferSize() == 0) { + buffers = new ArrowBuf[0]; + } else { + List list = new ArrayList<>(); + // the order must be validity, offset and size buffers + list.add(validityBuffer); + list.add(offsetBuffer); + list.add(sizeBuffer); + list.addAll(Arrays.asList(vector.getBuffers(false))); + buffers = list.toArray(new ArrowBuf[list.size()]); + } + if (clear) { + for (ArrowBuf buffer : buffers) { + buffer.getReferenceManager().retain(); + } + clear(); + } + return buffers; + } + + /** + * Get the element in the list view vector at a particular index. + * @param index position of the element + * @return Object at given position + */ + @Override + public List getObject(int index) { + if (isSet(index) == 0) { + return null; + } + final List vals = new JsonStringArrayList<>(); + final int start = offsetBuffer.getInt(index * OFFSET_WIDTH); + final int end = start + sizeBuffer.getInt((index) * SIZE_WIDTH); + final ValueVector vv = getDataVector(); + for (int i = start; i < end; i++) { + vals.add(vv.getObject(i)); + } + + return vals; + } + + /** + * Check if an element at given index is null. 
+ * + * @param index position of an element + * @return true if an element at given index is null, false otherwise + */ + @Override + public boolean isNull(int index) { + return (isSet(index) == 0); + } + + /** + * Check if an element at given index is an empty list. + * @param index position of an element + * @return true if an element at given index is an empty list or NULL, false otherwise + */ + @Override + public boolean isEmpty(int index) { + if (isNull(index)) { + return true; + } else { + return sizeBuffer.getInt(index * SIZE_WIDTH) == 0; + } + } + + /** + * Same as {@link #isNull(int)}. + * + * @param index position of the element + * @return 1 if element at given index is not null, 0 otherwise + */ + public int isSet(int index) { + final int byteIndex = index >> 3; + final byte b = validityBuffer.getByte(byteIndex); + final int bitIndex = index & 7; + return (b >> bitIndex) & 0x01; + } + + /** + * Get the number of elements that are null in the vector. + * + * @return the number of null elements. + */ + @Override + public int getNullCount() { + return BitVectorHelper.getNullCount(validityBuffer, valueCount); + } + + /** + * Get the value capacity by considering validity and offset capacity. + * Note that the size buffer capacity is not considered here since it has + * the same capacity as the offset buffer. + * + * @return the value capacity + */ + @Override + public int getValueCapacity() { + return getValidityAndOffsetValueCapacity(); + } + + private int getValidityAndSizeValueCapacity() { + final int offsetValueCapacity = Math.max(getOffsetBufferValueCapacity(), 0); + final int sizeValueCapacity = Math.max(getSizeBufferValueCapacity(), 0); + return Math.min(offsetValueCapacity, sizeValueCapacity); + } + + private int getValidityAndOffsetValueCapacity() { + final int offsetValueCapacity = Math.max(getOffsetBufferValueCapacity(), 0); + return Math.min(offsetValueCapacity, getValidityBufferValueCapacity()); + } + + private int getValidityBufferValueCapacity() { + return capAtMaxInt(validityBuffer.capacity() * 8); + } + + /** + * Set the element at the given index to null. + * @param index the value to change + */ + @Override + public void setNull(int index) { + while (index >= getValidityAndSizeValueCapacity()) { + reallocValidityAndSizeAndOffsetBuffers(); + } + + offsetBuffer.setInt(index * OFFSET_WIDTH, 0); + sizeBuffer.setInt(index * SIZE_WIDTH, 0); + BitVectorHelper.unsetBit(validityBuffer, index); + } + + /** + * Start new value in the ListView vector. + * + * @param index index of the value to start + * @return offset of the new value + */ + @Override + public int startNewValue(int index) { + while (index >= getValidityAndSizeValueCapacity()) { + reallocValidityAndSizeAndOffsetBuffers(); + } + + if (index > 0) { + final int prevOffset = getLengthOfChildVectorByIndex(index); + offsetBuffer.setInt(index * OFFSET_WIDTH, prevOffset); + } + + BitVectorHelper.setBit(validityBuffer, index); + return offsetBuffer.getInt(index * OFFSET_WIDTH); + } + + /** + * Validate the invariants of the offset and size buffers. 
+ * 0 <= offsets[i] <= length of the child array + * 0 <= offsets[i] + size[i] <= length of the child array + * @param offset the offset at a given index + * @param size the size at a given index + */ + private void validateInvariants(int offset, int size) { + if (offset < 0) { + throw new IllegalArgumentException("Offset cannot be negative"); + } + + if (size < 0) { + throw new IllegalArgumentException("Size cannot be negative"); + } + + // 0 <= offsets[i] <= length of the child array + if (offset > this.vector.getValueCount()) { + throw new IllegalArgumentException("Offset is out of bounds."); + } + + // 0 <= offsets[i] + size[i] <= length of the child array + if (offset + size > this.vector.getValueCount()) { + throw new IllegalArgumentException("Offset + size <= length of the child array."); + } + } + + /** + * Set the offset at the given index. + * Make sure to use this function after updating `field` vector and using `setValidity` + * @param index index of the value to set + * @param value value to set + */ + public void setOffset(int index, int value) { + validateInvariants(value, sizeBuffer.getInt(index * SIZE_WIDTH)); + + offsetBuffer.setInt(index * OFFSET_WIDTH, value); + } + + /** + * Set the size at the given index. + * Make sure to use this function after using `setOffset`. + * @param index index of the value to set + * @param value value to set + */ + public void setSize(int index, int value) { + validateInvariants(offsetBuffer.getInt(index * SIZE_WIDTH), value); + + sizeBuffer.setInt(index * SIZE_WIDTH, value); + } + + /** + * Set the validity at the given index. + * @param index index of the value to set + * @param value value to set (0 for unset and 1 for a set) + */ + public void setValidity(int index, int value) { + if (value == 0) { + BitVectorHelper.unsetBit(validityBuffer, index); + } else { + BitVectorHelper.setBit(validityBuffer, index); + } + } + + @Override + public void setValueCount(int valueCount) { + this.valueCount = valueCount; + if (valueCount > 0) { + while (valueCount > getValidityAndSizeValueCapacity()) { + /* check if validity and offset buffers need to be re-allocated */ + reallocValidityAndSizeAndOffsetBuffers(); + } + } + /* valueCount for the data vector is the current end offset */ + final int childValueCount = (valueCount == 0) ? 0 : getLengthOfChildVector(); + /* set the value count of data vector and this will take care of + * checking whether data buffer needs to be reallocated. + */ + vector.setValueCount(childValueCount); + } + + @Override + public int getElementStartIndex(int index) { + return offsetBuffer.getInt(index * OFFSET_WIDTH); + } + + @Override + public int getElementEndIndex(int index) { + return sizeBuffer.getInt(index * OFFSET_WIDTH); + } + + @Override + public AddOrGetResult addOrGetVector(FieldType fieldType) { + AddOrGetResult result = super.addOrGetVector(fieldType); + invalidateReader(); + return result; + } + + @Override + public UnionVector promoteToUnion() { + UnionVector vector = new UnionVector("$data$", allocator, /* field type*/ null, callBack); + replaceDataVector(vector); + invalidateReader(); + if (callBack != null) { + callBack.doWork(); + } + return vector; + } + + private void invalidateReader() { + reader = null; + } + + @Deprecated + @Override + public List getFieldInnerVectors() { + throw new UnsupportedOperationException("There are no inner vectors. 
Use getFieldBuffers"); + } + + public UnionListViewWriter getWriter() { + return new UnionListViewWriter(this); + } + + @Override + public int getValueCount() { + return valueCount; + } + + /** + * Get the density of this ListVector. + * @return density + */ + public double getDensity() { + if (valueCount == 0) { + return 0.0D; + } + final double totalListSize = getLengthOfChildVector(); + return totalListSize / valueCount; + } + + /** + * Validating ListViewVector creation based on the specification guideline. + */ + @Override + public void validate() { + for (int i = 0; i < valueCount; i++) { + final int offset = offsetBuffer.getInt(i * OFFSET_WIDTH); + final int size = sizeBuffer.getInt(i * SIZE_WIDTH); + validateInvariants(offset, size); + } + } + + /** + * End the current value. + * + * @param index index of the value to end + * @param size number of elements in the list that was written + */ + public void endValue(int index, int size) { + sizeBuffer.setInt(index * SIZE_WIDTH, size); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java index 7f724829ef1eb..c59b997286d2d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java @@ -29,6 +29,7 @@ import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.LargeListVector; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.complex.UnionVector; @@ -54,6 +55,7 @@ public class PromotableWriter extends AbstractPromotableFieldWriter { private final AbstractStructVector parentContainer; private final ListVector listVector; + private final ListViewVector listViewVector; private final FixedSizeListVector fixedListVector; private final LargeListVector largeListVector; private final NullableStructWriterFactory nullableStructWriterFactory; @@ -94,6 +96,7 @@ public PromotableWriter( NullableStructWriterFactory nullableStructWriterFactory) { this.parentContainer = parentContainer; this.listVector = null; + this.listViewVector = null; this.fixedListVector = null; this.largeListVector = null; this.nullableStructWriterFactory = nullableStructWriterFactory; @@ -142,6 +145,27 @@ public PromotableWriter( ListVector listVector, NullableStructWriterFactory nullableStructWriterFactory) { this.listVector = listVector; + this.listViewVector = null; + this.parentContainer = null; + this.fixedListVector = null; + this.largeListVector = null; + this.nullableStructWriterFactory = nullableStructWriterFactory; + init(v); + } + + /** + * Constructs a new instance. + * + * @param v The vector to initialize the writer with. + * @param listViewVector The vector that serves as a parent of v. + * @param nullableStructWriterFactory The factory to create the delegate writer. 
+ */ + public PromotableWriter( + ValueVector v, + ListViewVector listViewVector, + NullableStructWriterFactory nullableStructWriterFactory) { + this.listViewVector = listViewVector; + this.listVector = null; this.parentContainer = null; this.fixedListVector = null; this.largeListVector = null; @@ -163,6 +187,7 @@ public PromotableWriter( this.fixedListVector = fixedListVector; this.parentContainer = null; this.listVector = null; + this.listViewVector = null; this.largeListVector = null; this.nullableStructWriterFactory = nullableStructWriterFactory; init(v); @@ -183,6 +208,7 @@ public PromotableWriter( this.fixedListVector = null; this.parentContainer = null; this.listVector = null; + this.listViewVector = null; this.nullableStructWriterFactory = nullableStructWriterFactory; init(v); } @@ -280,6 +306,8 @@ protected FieldWriter getWriter(MinorType type, ArrowType arrowType) { v = listVector.addOrGetVector(fieldType).getVector(); } else if (fixedListVector != null) { v = fixedListVector.addOrGetVector(fieldType).getVector(); + } else if (listViewVector != null) { + v = listViewVector.addOrGetVector(fieldType).getVector(); } else { v = largeListVector.addOrGetVector(fieldType).getVector(); } @@ -322,6 +350,8 @@ private FieldWriter promoteToUnion() { unionVector = fixedListVector.promoteToUnion(); } else if (largeListVector != null) { unionVector = largeListVector.promoteToUnion(); + } else if (listViewVector != null) { + unionVector = listViewVector.promoteToUnion(); } unionVector.addVector((FieldVector) tp.getTo()); writer = new UnionWriter(unionVector, nullableStructWriterFactory); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index 89d8441d42aa9..e10a65e3b2c53 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -71,6 +71,7 @@ import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.LargeListVector; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.complex.UnionVector; @@ -136,6 +137,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType.LargeBinary; import org.apache.arrow.vector.types.pojo.ArrowType.LargeUtf8; import org.apache.arrow.vector.types.pojo.ArrowType.List; +import org.apache.arrow.vector.types.pojo.ArrowType.ListView; import org.apache.arrow.vector.types.pojo.ArrowType.Map; import org.apache.arrow.vector.types.pojo.ArrowType.Null; import org.apache.arrow.vector.types.pojo.ArrowType.Struct; @@ -692,6 +694,20 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { return new UnionListWriter((ListVector) vector); } }, + LISTVIEW(ListView.INSTANCE) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new ListViewVector(field.getName(), allocator, field.getFieldType(), schemaChangeCallback); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new UnionListWriter((ListVector) vector); + } + }, LARGELIST(ArrowType.LargeList.INSTANCE) { @Override public FieldVector getNewVector(Field field, BufferAllocator allocator, CallBack schemaChangeCallback) { @@ -1064,6 +1080,11 @@ public MinorType visit(Duration type) { return 
MinorType.DURATION; } + @Override + public MinorType visit(ListView type) { + return MinorType.LISTVIEW; + } + @Override public MinorType visit(ExtensionType type) { return MinorType.EXTENSIONTYPE; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestListViewVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestListViewVector.java new file mode 100644 index 0000000000000..e64ed77b1eb9f --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListViewVector.java @@ -0,0 +1,1651 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.BaseRepeatedValueVector; +import org.apache.arrow.vector.complex.BaseRepeatedValueViewVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.ListViewVector; +import org.apache.arrow.vector.complex.impl.UnionListViewWriter; +import org.apache.arrow.vector.complex.impl.UnionListWriter; +import org.apache.arrow.vector.holders.DurationHolder; +import org.apache.arrow.vector.holders.TimeStampMilliTZHolder; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +public class TestListViewVector { + + private BufferAllocator allocator; + + @BeforeEach + public void init() { + allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100); + } + + @AfterEach + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testBasicListViewVector() { + try (ListViewVector listViewVector = ListViewVector.empty("sourceVector", allocator)) { + UnionListViewWriter listViewWriter = listViewVector.getWriter(); + + /* allocate memory */ + listViewWriter.allocate(); + + /* write the first list at index 0 */ + listViewWriter.setPosition(0); + listViewWriter.startList(); + + listViewWriter.bigInt().writeBigInt(12); + listViewWriter.bigInt().writeBigInt(-7); + listViewWriter.bigInt().writeBigInt(25); + listViewWriter.endList(); + + /* the second list at index 1 is null (we are not 
setting any)*/ + + /* write the third list at index 2 */ + listViewWriter.setPosition(2); + listViewWriter.startList(); + + listViewWriter.bigInt().writeBigInt(0); + listViewWriter.bigInt().writeBigInt(-127); + listViewWriter.bigInt().writeBigInt(127); + listViewWriter.bigInt().writeBigInt(50); + listViewWriter.endList(); + + /* write the fourth list at index 3 (empty list) */ + listViewWriter.setPosition(3); + listViewWriter.startList(); + listViewWriter.endList(); + + /* write the fifth list at index 4 */ + listViewWriter.setPosition(4); + listViewWriter.startList(); + listViewWriter.bigInt().writeBigInt(1); + listViewWriter.bigInt().writeBigInt(2); + listViewWriter.bigInt().writeBigInt(3); + listViewWriter.bigInt().writeBigInt(4); + listViewWriter.endList(); + + listViewVector.setValueCount(5); + // check value count + assertEquals(5, listViewVector.getValueCount()); + + /* get vector at index 0 -- the value is a BigIntVector*/ + final ArrowBuf offSetBuffer = listViewVector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = listViewVector.getSizeBuffer(); + final FieldVector dataVec = listViewVector.getDataVector(); + + // check offset buffer + assertEquals(0, offSetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offSetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offSetBuffer.getInt(2 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(7, offSetBuffer.getInt(3 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(7, offSetBuffer.getInt(4 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + // check size buffer + assertEquals(3, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(4, sizeBuffer.getInt(2 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(3 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(4, sizeBuffer.getInt(4 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + // check data vector + assertEquals(12, ((BigIntVector) dataVec).get(0)); + assertEquals(-7, ((BigIntVector) dataVec).get(1)); + assertEquals(25, ((BigIntVector) dataVec).get(2)); + assertEquals(0, ((BigIntVector) dataVec).get(3)); + assertEquals(-127, ((BigIntVector) dataVec).get(4)); + assertEquals(127, ((BigIntVector) dataVec).get(5)); + assertEquals(50, ((BigIntVector) dataVec).get(6)); + assertEquals(1, ((BigIntVector) dataVec).get(7)); + assertEquals(2, ((BigIntVector) dataVec).get(8)); + assertEquals(3, ((BigIntVector) dataVec).get(9)); + assertEquals(4, ((BigIntVector) dataVec).get(10)); + + listViewVector.validate(); + } + } + + @Test + public void testImplicitNullVectors() { + try (ListViewVector listViewVector = ListViewVector.empty("sourceVector", allocator)) { + UnionListViewWriter listViewWriter = listViewVector.getWriter(); + /* allocate memory */ + listViewWriter.allocate(); + + final ArrowBuf offSetBuffer = listViewVector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = listViewVector.getSizeBuffer(); + + /* write the first list at index 0 */ + listViewWriter.setPosition(0); + listViewWriter.startList(); + + listViewWriter.bigInt().writeBigInt(12); + listViewWriter.bigInt().writeBigInt(-7); + listViewWriter.bigInt().writeBigInt(25); + listViewWriter.endList(); + + int offSet0 = offSetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH); + int size0 = sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH); + + // after the first list is written, + // the initial offset 
must be 0, + // the size must be 3 (as there are 3 elements in the array), + // the lastSet must be 0 since, the first list is written at index 0. + + assertEquals(0, offSet0); + assertEquals(3, size0); + + listViewWriter.setPosition(5); + listViewWriter.startList(); + + // writing the 6th list at index 5, + // and the list items from index 1 through 4 are not populated. + // but since there is a gap between the 0th and 5th list, in terms + // of buffer allocation, the offset and size buffers must be updated + // to reflect the implicit null vectors. + + for (int i = 1; i < 5; i++) { + int offSet = offSetBuffer.getInt(i * BaseRepeatedValueViewVector.OFFSET_WIDTH); + int size = sizeBuffer.getInt(i * BaseRepeatedValueViewVector.SIZE_WIDTH); + // Since the list is not written, the offset and size must equal to child vector's size + // i.e., 3, and size should be 0 as the list is not written. + // And the last set value is the value currently being written, which is 5. + assertEquals(0, offSet); + assertEquals(0, size); + } + + listViewWriter.bigInt().writeBigInt(12); + listViewWriter.bigInt().writeBigInt(25); + listViewWriter.endList(); + + int offSet5 = offSetBuffer.getInt(5 * BaseRepeatedValueViewVector.OFFSET_WIDTH); + int size5 = sizeBuffer.getInt(5 * BaseRepeatedValueViewVector.SIZE_WIDTH); + + assertEquals(3, offSet5); + assertEquals(2, size5); + + listViewWriter.setPosition(10); + listViewWriter.startList(); + + // writing the 11th list at index 10, + // and the list items from index 6 through 10 are not populated. + // but since there is a gap between the 5th and 11th list, in terms + // of buffer allocation, the offset and size buffers must be updated + // to reflect the implicit null vectors. + for (int i = 6; i < 10; i++) { + int offSet = offSetBuffer.getInt(i * BaseRepeatedValueViewVector.OFFSET_WIDTH); + int size = sizeBuffer.getInt(i * BaseRepeatedValueViewVector.SIZE_WIDTH); + // Since the list is not written, the offset and size must equal to 0 + // and size should be 0 as the list is not written. + // And the last set value is the value currently being written, which is 10. + assertEquals(0, offSet); + assertEquals(0, size); + } + + listViewWriter.bigInt().writeBigInt(12); + listViewWriter.endList(); + + int offSet11 = offSetBuffer.getInt(10 * BaseRepeatedValueViewVector.OFFSET_WIDTH); + int size11 = sizeBuffer.getInt(10 * BaseRepeatedValueViewVector.SIZE_WIDTH); + + assertEquals(5, offSet11); + assertEquals(1, size11); + + listViewVector.setValueCount(11); + + listViewVector.validate(); + } + } + + @Test + public void testNestedListViewVector() { + try (ListViewVector listViewVector = ListViewVector.empty("sourceVector", allocator)) { + UnionListViewWriter listViewWriter = listViewVector.getWriter(); + + /* allocate memory */ + listViewWriter.allocate(); + + /* the dataVector that backs a listVector will also be a + * listVector for this test. 
+ */ + + /* write one or more inner lists at index 0 */ + listViewWriter.setPosition(0); + listViewWriter.startList(); + + listViewWriter.list().startList(); + listViewWriter.list().bigInt().writeBigInt(50); + listViewWriter.list().bigInt().writeBigInt(100); + listViewWriter.list().bigInt().writeBigInt(200); + listViewWriter.list().endList(); + + listViewWriter.list().startList(); + listViewWriter.list().bigInt().writeBigInt(75); + listViewWriter.list().bigInt().writeBigInt(125); + listViewWriter.list().bigInt().writeBigInt(150); + listViewWriter.list().bigInt().writeBigInt(175); + listViewWriter.list().endList(); + + listViewWriter.endList(); + + /* write one or more inner lists at index 1 */ + listViewWriter.setPosition(1); + listViewWriter.startList(); + + listViewWriter.list().startList(); + listViewWriter.list().bigInt().writeBigInt(10); + listViewWriter.list().endList(); + + listViewWriter.list().startList(); + listViewWriter.list().bigInt().writeBigInt(15); + listViewWriter.list().bigInt().writeBigInt(20); + listViewWriter.list().endList(); + + listViewWriter.list().startList(); + listViewWriter.list().bigInt().writeBigInt(25); + listViewWriter.list().bigInt().writeBigInt(30); + listViewWriter.list().bigInt().writeBigInt(35); + listViewWriter.list().endList(); + + listViewWriter.endList(); + + listViewVector.setValueCount(2); + + // [[[50,100,200],[75,125,150,175]], [[10],[15,20],[25,30,35]]] + + assertEquals(2, listViewVector.getValueCount()); + + /* get listViewVector value at index 0 -- the value itself is a listViewVector */ + Object result = listViewVector.getObject(0); + ArrayList> resultSet = (ArrayList>) result; + ArrayList list; + + assertEquals(2, resultSet.size()); /* 2 inner lists at index 0 */ + assertEquals(3, resultSet.get(0).size()); /* size of the first inner list */ + assertEquals(4, resultSet.get(1).size()); /* size of the second inner list */ + + list = resultSet.get(0); + assertEquals(Long.valueOf(50), list.get(0)); + assertEquals(Long.valueOf(100), list.get(1)); + assertEquals(Long.valueOf(200), list.get(2)); + + list = resultSet.get(1); + assertEquals(Long.valueOf(75), list.get(0)); + assertEquals(Long.valueOf(125), list.get(1)); + assertEquals(Long.valueOf(150), list.get(2)); + assertEquals(Long.valueOf(175), list.get(3)); + + /* get listViewVector value at index 1 -- the value itself is a listViewVector */ + result = listViewVector.getObject(1); + resultSet = (ArrayList>) result; + + assertEquals(3, resultSet.size()); /* 3 inner lists at index 1 */ + assertEquals(1, resultSet.get(0).size()); /* size of the first inner list */ + assertEquals(2, resultSet.get(1).size()); /* size of the second inner list */ + assertEquals(3, resultSet.get(2).size()); /* size of the third inner list */ + + list = resultSet.get(0); + assertEquals(Long.valueOf(10), list.get(0)); + + list = resultSet.get(1); + assertEquals(Long.valueOf(15), list.get(0)); + assertEquals(Long.valueOf(20), list.get(1)); + + list = resultSet.get(2); + assertEquals(Long.valueOf(25), list.get(0)); + assertEquals(Long.valueOf(30), list.get(1)); + assertEquals(Long.valueOf(35), list.get(2)); + + /* check underlying bitVector */ + assertFalse(listViewVector.isNull(0)); + assertFalse(listViewVector.isNull(1)); + + final ArrowBuf offSetBuffer = listViewVector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = listViewVector.getSizeBuffer(); + + // check offset buffer + assertEquals(0, offSetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(2, offSetBuffer.getInt(1 * 
BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + // check size buffer + assertEquals(2, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(3, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + listViewVector.validate(); + } + } + + @Test + public void testNestedListVector() throws Exception { + try (ListViewVector listViewVector = ListViewVector.empty("sourceVector", allocator)) { + + MinorType listType = MinorType.LISTVIEW; + MinorType scalarType = MinorType.BIGINT; + + listViewVector.addOrGetVector(FieldType.nullable(listType.getType())); + + ListViewVector innerList1 = (ListViewVector) listViewVector.getDataVector(); + innerList1.addOrGetVector(FieldType.nullable(listType.getType())); + + ListViewVector innerList2 = (ListViewVector) innerList1.getDataVector(); + innerList2.addOrGetVector(FieldType.nullable(listType.getType())); + + ListViewVector innerList3 = (ListViewVector) innerList2.getDataVector(); + innerList3.addOrGetVector(FieldType.nullable(listType.getType())); + + ListViewVector innerList4 = (ListViewVector) innerList3.getDataVector(); + innerList4.addOrGetVector(FieldType.nullable(listType.getType())); + + ListViewVector innerList5 = (ListViewVector) innerList4.getDataVector(); + innerList5.addOrGetVector(FieldType.nullable(listType.getType())); + + ListViewVector innerList6 = (ListViewVector) innerList5.getDataVector(); + innerList6.addOrGetVector(FieldType.nullable(scalarType.getType())); + + listViewVector.setInitialCapacity(128); + + listViewVector.validate(); + } + } + + private void setValuesInBuffer(int[] bufValues, ArrowBuf buffer, long bufWidth) { + for (int i = 0; i < bufValues.length; i++) { + buffer.setInt(i * bufWidth, bufValues[i]); + } + } + + /* + * Setting up the buffers directly needs to be validated with the base method used in + * the ListVector class where we use the approach of startList(), + * write to the child vector and endList(). + *
+ * To support this, we have to consider the following scenarios; + *
+ * 1. Only using directly buffer-based inserts. + * 2. Default list insertion followed by buffer-based inserts. + * 3. Buffer-based inserts followed by default list insertion. + */ + + /* Setting up buffers directly would require the following steps to be taken + * 0. Allocate buffers in listViewVector by calling `allocateNew` method. + * 1. Initialize the child vector using `initializeChildrenFromFields` method. + * 2. Set values in the child vector. + * 3. Set validity, offset and size buffers using `setValidity`, + * `setOffset` and `setSize` methods. + * 4. Set value count using `setValueCount` method. + */ + @Test + public void testBasicListViewSet() { + + try (ListViewVector listViewVector = ListViewVector.empty("sourceVector", allocator)) { + // Allocate buffers in listViewVector by calling `allocateNew` method. + listViewVector.allocateNew(); + + // Initialize the child vector using `initializeChildrenFromFields` method. + FieldType fieldType = new FieldType(true, new ArrowType.Int(64, true), + null, null); + Field field = new Field("child-vector", fieldType, null); + listViewVector.initializeChildrenFromFields(Collections.singletonList(field)); + + // Set values in the child vector. + FieldVector fieldVector = listViewVector.getDataVector(); + fieldVector.clear(); + + BigIntVector childVector = (BigIntVector) fieldVector; + childVector.allocateNew(7); + + childVector.set(0, 12); + childVector.set(1, -7); + childVector.set(2, 25); + childVector.set(3, 0); + childVector.set(4, -127); + childVector.set(5, 127); + childVector.set(6, 50); + + childVector.setValueCount(7); + + // Set validity, offset and size buffers using `setValidity`, + // `setOffset` and `setSize` methods. + listViewVector.setOffset(0, 0); + listViewVector.setOffset(1, 3); + listViewVector.setOffset(2, 3); + listViewVector.setOffset(3, 7); + + listViewVector.setSize(0, 3); + listViewVector.setSize(1, 0); + listViewVector.setSize(2, 4); + listViewVector.setSize(3, 0); + + listViewVector.setValidity(0, 1); + listViewVector.setValidity(1, 0); + listViewVector.setValidity(2, 1); + listViewVector.setValidity(3, 1); + + // Set value count using `setValueCount` method. 
+ listViewVector.setValueCount(4); + + final ArrowBuf offSetBuffer = listViewVector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = listViewVector.getSizeBuffer(); + + // check offset buffer + assertEquals(0, offSetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offSetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offSetBuffer.getInt(2 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(7, offSetBuffer.getInt(3 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + // check size buffer + assertEquals(3, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(4, sizeBuffer.getInt(2 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(3 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + // check values + assertEquals(12, ((BigIntVector) listViewVector.getDataVector()).get(0)); + assertEquals(-7, ((BigIntVector) listViewVector.getDataVector()).get(1)); + assertEquals(25, ((BigIntVector) listViewVector.getDataVector()).get(2)); + assertEquals(0, ((BigIntVector) listViewVector.getDataVector()).get(3)); + assertEquals(-127, ((BigIntVector) listViewVector.getDataVector()).get(4)); + assertEquals(127, ((BigIntVector) listViewVector.getDataVector()).get(5)); + assertEquals(50, ((BigIntVector) listViewVector.getDataVector()).get(6)); + + listViewVector.validate(); + } + } + + @Test + public void testBasicListViewSetNested() { + // Expected listview + // [[[50,100,200],[75,125,150,175]],[[10],[15,20],[25,30,35]]] + + // Setting child vector + // [[50,100,200],[75,125,150,175],[10],[15,20],[25,30,35]] + try (ListViewVector listViewVector = ListViewVector.empty("sourceVector", allocator)) { + // Allocate buffers in listViewVector by calling `allocateNew` method. + listViewVector.allocateNew(); + + // Initialize the child vector using `initializeChildrenFromFields` method. + FieldType fieldType = new FieldType(true, new ArrowType.List(), + null, null); + FieldType childFieldType = new FieldType(true, new ArrowType.Int(64, true), + null, null); + Field childField = new Field("child-vector", childFieldType, null); + List children = new ArrayList<>(); + children.add(childField); + Field field = new Field("child-vector", fieldType, children); + listViewVector.initializeChildrenFromFields(Collections.singletonList(field)); + + // Set values in the child vector. 
+ FieldVector fieldVector = listViewVector.getDataVector(); + fieldVector.clear(); + + ListVector childVector = (ListVector) fieldVector; + UnionListWriter listWriter = childVector.getWriter(); + listWriter.allocate(); + + listWriter.setPosition(0); + listWriter.startList(); + + listWriter.bigInt().writeBigInt(50); + listWriter.bigInt().writeBigInt(100); + listWriter.bigInt().writeBigInt(200); + + listWriter.endList(); + + listWriter.setPosition(1); + listWriter.startList(); + + listWriter.bigInt().writeBigInt(75); + listWriter.bigInt().writeBigInt(125); + listWriter.bigInt().writeBigInt(150); + listWriter.bigInt().writeBigInt(175); + + listWriter.endList(); + + listWriter.setPosition(2); + listWriter.startList(); + + listWriter.bigInt().writeBigInt(10); + + listWriter.endList(); + + listWriter.startList(); + listWriter.setPosition(3); + + listWriter.bigInt().writeBigInt(15); + listWriter.bigInt().writeBigInt(20); + + listWriter.endList(); + + listWriter.startList(); + listWriter.setPosition(4); + + listWriter.bigInt().writeBigInt(25); + listWriter.bigInt().writeBigInt(30); + listWriter.bigInt().writeBigInt(35); + + listWriter.endList(); + + childVector.setValueCount(5); + + // Set validity, offset and size buffers using `setValidity`, + // `setOffset` and `setSize` methods. + + listViewVector.setValidity(0, 1); + listViewVector.setValidity(1, 1); + + listViewVector.setOffset(0, 0); + listViewVector.setOffset(1, 2); + + listViewVector.setSize(0, 2); + listViewVector.setSize(1, 3); + + // Set value count using `setValueCount` method. + listViewVector.setValueCount(2); + + assertEquals(2, listViewVector.getValueCount()); + + /* get listViewVector value at index 0 -- the value itself is a listViewVector */ + Object result = listViewVector.getObject(0); + ArrayList> resultSet = (ArrayList>) result; + ArrayList list; + + assertEquals(2, resultSet.size()); /* 2 inner lists at index 0 */ + assertEquals(3, resultSet.get(0).size()); /* size of the first inner list */ + assertEquals(4, resultSet.get(1).size()); /* size of the second inner list */ + + list = resultSet.get(0); + assertEquals(Long.valueOf(50), list.get(0)); + assertEquals(Long.valueOf(100), list.get(1)); + assertEquals(Long.valueOf(200), list.get(2)); + + list = resultSet.get(1); + assertEquals(Long.valueOf(75), list.get(0)); + assertEquals(Long.valueOf(125), list.get(1)); + assertEquals(Long.valueOf(150), list.get(2)); + assertEquals(Long.valueOf(175), list.get(3)); + + /* get listViewVector value at index 1 -- the value itself is a listViewVector */ + result = listViewVector.getObject(1); + resultSet = (ArrayList>) result; + + assertEquals(3, resultSet.size()); /* 3 inner lists at index 1 */ + assertEquals(1, resultSet.get(0).size()); /* size of the first inner list */ + assertEquals(2, resultSet.get(1).size()); /* size of the second inner list */ + assertEquals(3, resultSet.get(2).size()); /* size of the third inner list */ + + list = resultSet.get(0); + assertEquals(Long.valueOf(10), list.get(0)); + + list = resultSet.get(1); + assertEquals(Long.valueOf(15), list.get(0)); + assertEquals(Long.valueOf(20), list.get(1)); + + list = resultSet.get(2); + assertEquals(Long.valueOf(25), list.get(0)); + assertEquals(Long.valueOf(30), list.get(1)); + assertEquals(Long.valueOf(35), list.get(2)); + + /* check underlying bitVector */ + assertFalse(listViewVector.isNull(0)); + assertFalse(listViewVector.isNull(1)); + + final ArrowBuf offSetBuffer = listViewVector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = listViewVector.getSizeBuffer(); + 
+ // check offset buffer + assertEquals(0, offSetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(2, offSetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + // check size buffer + assertEquals(2, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(3, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + listViewVector.validate(); + } + } + + @Test + public void testBasicListViewSetWithListViewWriter() { + try (ListViewVector listViewVector = ListViewVector.empty("sourceVector", allocator)) { + // Allocate buffers in listViewVector by calling `allocateNew` method. + listViewVector.allocateNew(); + + // Initialize the child vector using `initializeChildrenFromFields` method. + FieldType fieldType = new FieldType(true, new ArrowType.Int(64, true), + null, null); + Field field = new Field("child-vector", fieldType, null); + listViewVector.initializeChildrenFromFields(Collections.singletonList(field)); + + // Set values in the child vector. + FieldVector fieldVector = listViewVector.getDataVector(); + fieldVector.clear(); + + BigIntVector childVector = (BigIntVector) fieldVector; + childVector.allocateNew(7); + + childVector.set(0, 12); + childVector.set(1, -7); + childVector.set(2, 25); + childVector.set(3, 0); + childVector.set(4, -127); + childVector.set(5, 127); + childVector.set(6, 50); + + childVector.setValueCount(7); + + // Set validity, offset and size buffers using `setValidity`, + // `setOffset` and `setSize` methods. + + listViewVector.setValidity(0, 1); + listViewVector.setValidity(1, 0); + listViewVector.setValidity(2, 1); + listViewVector.setValidity(3, 1); + + listViewVector.setOffset(0, 0); + listViewVector.setOffset(1, 3); + listViewVector.setOffset(2, 3); + listViewVector.setOffset(3, 7); + + listViewVector.setSize(0, 3); + listViewVector.setSize(1, 0); + listViewVector.setSize(2, 4); + listViewVector.setSize(3, 0); + + // Set value count using `setValueCount` method. 
+ listViewVector.setValueCount(4); + + final ArrowBuf offSetBuffer = listViewVector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = listViewVector.getSizeBuffer(); + + // check offset buffer + assertEquals(0, offSetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offSetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offSetBuffer.getInt(2 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(7, offSetBuffer.getInt(3 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + // check size buffer + assertEquals(3, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(4, sizeBuffer.getInt(2 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(3 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + // check values + assertEquals(12, ((BigIntVector) listViewVector.getDataVector()).get(0)); + assertEquals(-7, ((BigIntVector) listViewVector.getDataVector()).get(1)); + assertEquals(25, ((BigIntVector) listViewVector.getDataVector()).get(2)); + assertEquals(0, ((BigIntVector) listViewVector.getDataVector()).get(3)); + assertEquals(-127, ((BigIntVector) listViewVector.getDataVector()).get(4)); + assertEquals(127, ((BigIntVector) listViewVector.getDataVector()).get(5)); + assertEquals(50, ((BigIntVector) listViewVector.getDataVector()).get(6)); + + UnionListViewWriter listViewWriter = listViewVector.getWriter(); + + listViewWriter.setPosition(4); + listViewWriter.startList(); + + listViewWriter.bigInt().writeBigInt(121); + listViewWriter.bigInt().writeBigInt(-71); + listViewWriter.bigInt().writeBigInt(251); + listViewWriter.endList(); + + listViewVector.setValueCount(5); + + // check offset buffer + assertEquals(0, offSetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offSetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offSetBuffer.getInt(2 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(7, offSetBuffer.getInt(3 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(7, offSetBuffer.getInt(4 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + // check size buffer + assertEquals(3, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(4, sizeBuffer.getInt(2 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(3 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(3, sizeBuffer.getInt(4 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + // check values + assertEquals(12, ((BigIntVector) listViewVector.getDataVector()).get(0)); + assertEquals(-7, ((BigIntVector) listViewVector.getDataVector()).get(1)); + assertEquals(25, ((BigIntVector) listViewVector.getDataVector()).get(2)); + assertEquals(0, ((BigIntVector) listViewVector.getDataVector()).get(3)); + assertEquals(-127, ((BigIntVector) listViewVector.getDataVector()).get(4)); + assertEquals(127, ((BigIntVector) listViewVector.getDataVector()).get(5)); + assertEquals(50, ((BigIntVector) listViewVector.getDataVector()).get(6)); + assertEquals(121, ((BigIntVector) listViewVector.getDataVector()).get(7)); + assertEquals(-71, ((BigIntVector) listViewVector.getDataVector()).get(8)); + assertEquals(251, ((BigIntVector) listViewVector.getDataVector()).get(9)); + + listViewVector.validate(); + } + } + + @Test + public void testGetBufferAddress() throws 
Exception { + try (ListViewVector listViewVector = ListViewVector.empty("vector", allocator)) { + + UnionListViewWriter listViewWriter = listViewVector.getWriter(); + boolean error = false; + + listViewWriter.allocate(); + + listViewWriter.setPosition(0); + listViewWriter.startList(); + listViewWriter.bigInt().writeBigInt(50); + listViewWriter.bigInt().writeBigInt(100); + listViewWriter.bigInt().writeBigInt(200); + listViewWriter.endList(); + + listViewWriter.setPosition(1); + listViewWriter.startList(); + listViewWriter.bigInt().writeBigInt(250); + listViewWriter.bigInt().writeBigInt(300); + listViewWriter.endList(); + + listViewVector.setValueCount(2); + + /* check listVector contents */ + Object result = listViewVector.getObject(0); + ArrayList resultSet = (ArrayList) result; + assertEquals(3, resultSet.size()); + assertEquals(Long.valueOf(50), resultSet.get(0)); + assertEquals(Long.valueOf(100), resultSet.get(1)); + assertEquals(Long.valueOf(200), resultSet.get(2)); + + result = listViewVector.getObject(1); + resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(250), resultSet.get(0)); + assertEquals(Long.valueOf(300), resultSet.get(1)); + + List buffers = listViewVector.getFieldBuffers(); + + long bitAddress = listViewVector.getValidityBufferAddress(); + long offsetAddress = listViewVector.getOffsetBufferAddress(); + long sizeAddress = listViewVector.getSizeBufferAddress(); + + try { + listViewVector.getDataBufferAddress(); + } catch (UnsupportedOperationException ue) { + error = true; + } finally { + assertTrue(error); + } + + assertEquals(3, buffers.size()); + assertEquals(bitAddress, buffers.get(0).memoryAddress()); + assertEquals(offsetAddress, buffers.get(1).memoryAddress()); + assertEquals(sizeAddress, buffers.get(2).memoryAddress()); + + /* (3+2)/2 */ + assertEquals(2.5, listViewVector.getDensity(), 0); + listViewVector.validate(); + } + } + + @Test + public void testConsistentChildName() throws Exception { + try (ListViewVector listViewVector = ListViewVector.empty("sourceVector", allocator)) { + String emptyListStr = listViewVector.getField().toString(); + assertTrue(emptyListStr.contains(ListVector.DATA_VECTOR_NAME)); + + listViewVector.addOrGetVector(FieldType.nullable(MinorType.INT.getType())); + String emptyVectorStr = listViewVector.getField().toString(); + assertTrue(emptyVectorStr.contains(ListVector.DATA_VECTOR_NAME)); + } + } + + @Test + public void testSetInitialCapacity() { + try (final ListViewVector vector = ListViewVector.empty("", allocator)) { + vector.addOrGetVector(FieldType.nullable(MinorType.INT.getType())); + + vector.setInitialCapacity(512); + vector.allocateNew(); + assertEquals(512, vector.getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 512); + + vector.setInitialCapacity(512, 4); + vector.allocateNew(); + assertEquals(512, vector.getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 512 * 4); + + vector.setInitialCapacity(512, 0.1); + vector.allocateNew(); + assertEquals(512, vector.getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 51); + + vector.setInitialCapacity(512, 0.01); + vector.allocateNew(); + assertEquals(512, vector.getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 5); + + vector.setInitialCapacity(5, 0.1); + vector.allocateNew(); + assertEquals(8, vector.getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 1); + + vector.validate(); + } + } + + @Test + 
public void testClearAndReuse() { + try (final ListViewVector vector = ListViewVector.empty("listview", allocator)) { + BigIntVector bigIntVector = + (BigIntVector) vector.addOrGetVector(FieldType.nullable(MinorType.BIGINT.getType())).getVector(); + vector.setInitialCapacity(10); + vector.allocateNew(); + + vector.startNewValue(0); + bigIntVector.setSafe(0, 7); + vector.endValue(0, 1); + vector.startNewValue(1); + bigIntVector.setSafe(1, 8); + vector.endValue(1, 1); + vector.setValueCount(2); + + Object result = vector.getObject(0); + ArrayList resultSet = (ArrayList) result; + assertEquals(Long.valueOf(7), resultSet.get(0)); + + result = vector.getObject(1); + resultSet = (ArrayList) result; + assertEquals(Long.valueOf(8), resultSet.get(0)); + + // Clear and release the buffers to trigger a realloc when adding next value + vector.clear(); + + // The list vector should reuse a buffer when reallocating the offset buffer + vector.startNewValue(0); + bigIntVector.setSafe(0, 7); + vector.endValue(0, 1); + vector.startNewValue(1); + bigIntVector.setSafe(1, 8); + vector.endValue(1, 1); + vector.setValueCount(2); + + result = vector.getObject(0); + resultSet = (ArrayList) result; + assertEquals(Long.valueOf(7), resultSet.get(0)); + + result = vector.getObject(1); + resultSet = (ArrayList) result; + assertEquals(Long.valueOf(8), resultSet.get(0)); + + vector.validate(); + } + } + + @Test + public void testWriterGetField() { + // adopted from ListVector test cases + try (final ListViewVector vector = ListViewVector.empty("listview", allocator)) { + + UnionListViewWriter writer = vector.getWriter(); + writer.allocate(); + + //set some values + writer.startList(); + writer.integer().writeInt(1); + writer.integer().writeInt(2); + writer.endList(); + vector.setValueCount(2); + + Field expectedDataField = new Field(BaseRepeatedValueVector.DATA_VECTOR_NAME, + FieldType.nullable(new ArrowType.Int(32, true)), null); + Field expectedField = new Field(vector.getName(), FieldType.nullable(ArrowType.ListView.INSTANCE), + Arrays.asList(expectedDataField)); + + assertEquals(expectedField, writer.getField()); + + vector.validate(); + } + } + + @Test + public void testWriterUsingHolderGetTimestampMilliTZField() { + // adopted from ListVector test cases + try (final ListViewVector vector = ListViewVector.empty("listview", allocator)) { + org.apache.arrow.vector.complex.writer.FieldWriter writer = vector.getWriter(); + writer.allocate(); + + TimeStampMilliTZHolder holder = new TimeStampMilliTZHolder(); + holder.timezone = "SomeFakeTimeZone"; + writer.startList(); + holder.value = 12341234L; + writer.timeStampMilliTZ().write(holder); + holder.value = 55555L; + writer.timeStampMilliTZ().write(holder); + + // Writing with a different timezone should throw + holder.timezone = "AsdfTimeZone"; + holder.value = 77777; + IllegalArgumentException ex = assertThrows(IllegalArgumentException.class, + () -> writer.timeStampMilliTZ().write(holder)); + assertEquals( + "holder.timezone: AsdfTimeZone not equal to vector timezone: SomeFakeTimeZone", + ex.getMessage()); + + writer.endList(); + vector.setValueCount(1); + + Field expectedDataField = new Field(BaseRepeatedValueVector.DATA_VECTOR_NAME, + FieldType.nullable(new ArrowType.Timestamp(TimeUnit.MILLISECOND, "SomeFakeTimeZone")), null); + Field expectedField = new Field(vector.getName(), FieldType.nullable(ArrowType.ListView.INSTANCE), + Arrays.asList(expectedDataField)); + + assertEquals(expectedField, writer.getField()); + + vector.validate(); + } + } + + @Test + public void 
testWriterGetDurationField() { + // adopted from ListVector test cases + try (final ListViewVector vector = ListViewVector.empty("listview", allocator)) { + org.apache.arrow.vector.complex.writer.FieldWriter writer = vector.getWriter(); + writer.allocate(); + + DurationHolder durationHolder = new DurationHolder(); + durationHolder.unit = TimeUnit.MILLISECOND; + + writer.startList(); + durationHolder.value = 812374L; + writer.duration().write(durationHolder); + durationHolder.value = 143451L; + writer.duration().write(durationHolder); + + // Writing with a different unit should throw + durationHolder.unit = TimeUnit.SECOND; + durationHolder.value = 8888888; + IllegalArgumentException ex = assertThrows(IllegalArgumentException.class, + () -> writer.duration().write(durationHolder)); + assertEquals( + "holder.unit: SECOND not equal to vector unit: MILLISECOND", ex.getMessage()); + + writer.endList(); + vector.setValueCount(1); + + Field expectedDataField = new Field(BaseRepeatedValueVector.DATA_VECTOR_NAME, + FieldType.nullable(new ArrowType.Duration(TimeUnit.MILLISECOND)), null); + Field expectedField = new Field(vector.getName(), + FieldType.nullable(ArrowType.ListView.INSTANCE), + Arrays.asList(expectedDataField)); + + assertEquals(expectedField, writer.getField()); + + vector.validate(); + } + } + + @Test + public void testClose() throws Exception { + try (final ListViewVector vector = ListViewVector.empty("listview", allocator)) { + + UnionListViewWriter writer = vector.getWriter(); + writer.allocate(); + + //set some values + writer.startList(); + writer.integer().writeInt(1); + writer.integer().writeInt(2); + writer.endList(); + vector.setValueCount(2); + + assertTrue(vector.getBufferSize() > 0); + assertTrue(vector.getDataVector().getBufferSize() > 0); + + writer.close(); + assertEquals(0, vector.getBufferSize()); + assertEquals(0, vector.getDataVector().getBufferSize()); + + vector.validate(); + } + } + + @Test + public void testGetBufferSizeFor() { + try (final ListViewVector vector = ListViewVector.empty("listview", allocator)) { + + UnionListViewWriter writer = vector.getWriter(); + writer.allocate(); + + //set some values + writeIntValues(writer, new int[] {1, 2}); + writeIntValues(writer, new int[] {3, 4}); + writeIntValues(writer, new int[] {5, 6}); + writeIntValues(writer, new int[] {7, 8, 9, 10}); + writeIntValues(writer, new int[] {11, 12, 13, 14}); + writer.setValueCount(5); + + IntVector dataVector = (IntVector) vector.getDataVector(); + int[] indices = new int[] {0, 2, 4, 6, 10, 14}; + + for (int valueCount = 1; valueCount <= 5; valueCount++) { + int validityBufferSize = BitVectorHelper.getValidityBufferSize(valueCount); + int offsetBufferSize = valueCount * BaseRepeatedValueViewVector.OFFSET_WIDTH; + int sizeBufferSize = valueCount * BaseRepeatedValueViewVector.SIZE_WIDTH; + + int expectedSize = validityBufferSize + offsetBufferSize + sizeBufferSize + + dataVector.getBufferSizeFor(indices[valueCount]); + assertEquals(expectedSize, vector.getBufferSizeFor(valueCount)); + } + vector.validate(); + } + } + + @Test + public void testIsEmpty() { + try (final ListViewVector vector = ListViewVector.empty("listview", allocator)) { + UnionListViewWriter writer = vector.getWriter(); + writer.allocate(); + + // set values [1,2], null, [], [5,6] + writeIntValues(writer, new int[] {1, 2}); + writer.setPosition(2); + writeIntValues(writer, new int[] {}); + writeIntValues(writer, new int[] {5, 6}); + writer.setValueCount(4); + + assertFalse(vector.isEmpty(0)); + 
assertTrue(vector.isNull(1)); + assertTrue(vector.isEmpty(1)); + assertFalse(vector.isNull(2)); + assertTrue(vector.isEmpty(2)); + assertFalse(vector.isEmpty(3)); + + vector.validate(); + } + } + + @Test + public void testTotalCapacity() { + // adopted from ListVector test cases + final FieldType type = FieldType.nullable(MinorType.INT.getType()); + try (final ListViewVector vector = new ListViewVector("listview", allocator, type, null)) { + // Force the child vector to be allocated based on the type + // (this is a bad API: we have to track and repeat the type twice) + vector.addOrGetVector(type); + + // Specify the allocation size but do not allocate + vector.setInitialTotalCapacity(10, 100); + + // Finally, actually do the allocation + vector.allocateNewSafe(); + + // Note: allocator rounds up and can be greater than the requested allocation. + assertTrue(vector.getValueCapacity() >= 10); + assertTrue(vector.getDataVector().getValueCapacity() >= 100); + + vector.validate(); + } + } + + @Test + public void testSetNull1() { + try (ListViewVector vector = ListViewVector.empty("listview", allocator)) { + UnionListViewWriter writer = vector.getWriter(); + writer.allocate(); + + writer.setPosition(0); + writer.startList(); + writer.bigInt().writeBigInt(10); + writer.bigInt().writeBigInt(20); + writer.endList(); + + vector.setNull(1); + + writer.setPosition(2); + writer.startList(); + writer.bigInt().writeBigInt(30); + writer.bigInt().writeBigInt(40); + writer.endList(); + + vector.setNull(3); + vector.setNull(4); + + writer.setPosition(5); + writer.startList(); + writer.bigInt().writeBigInt(50); + writer.bigInt().writeBigInt(60); + writer.endList(); + + vector.setValueCount(6); + + assertFalse(vector.isNull(0)); + assertTrue(vector.isNull(1)); + assertFalse(vector.isNull(2)); + assertTrue(vector.isNull(3)); + assertTrue(vector.isNull(4)); + assertFalse(vector.isNull(5)); + + // validate buffers + + final ArrowBuf validityBuffer = vector.getValidityBuffer(); + final ArrowBuf offsetBuffer = vector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = vector.getSizeBuffer(); + + assertEquals(1, BitVectorHelper.get(validityBuffer, 0)); + assertEquals(0, BitVectorHelper.get(validityBuffer, 1)); + assertEquals(1, BitVectorHelper.get(validityBuffer, 2)); + assertEquals(0, BitVectorHelper.get(validityBuffer, 3)); + assertEquals(0, BitVectorHelper.get(validityBuffer, 4)); + assertEquals(1, BitVectorHelper.get(validityBuffer, 5)); + + assertEquals(0, offsetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offsetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(2, offsetBuffer.getInt(2 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offsetBuffer.getInt(3 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offsetBuffer.getInt(4 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(4, offsetBuffer.getInt(5 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + assertEquals(2, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(2, sizeBuffer.getInt(2 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(3 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(4 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(2, sizeBuffer.getInt(5 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + // validate values + + Object result = vector.getObject(0); + ArrayList resultSet 
= (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(10), resultSet.get(0)); + assertEquals(Long.valueOf(20), resultSet.get(1)); + + result = vector.getObject(2); + resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(30), resultSet.get(0)); + assertEquals(Long.valueOf(40), resultSet.get(1)); + + result = vector.getObject(5); + resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(50), resultSet.get(0)); + assertEquals(Long.valueOf(60), resultSet.get(1)); + + vector.validate(); + } + } + + @Test + public void testSetNull2() { + try (ListViewVector vector = ListViewVector.empty("listview", allocator)) { + // validate setting nulls first and then writing values + UnionListViewWriter writer = vector.getWriter(); + writer.allocate(); + + vector.setNull(0); + vector.setNull(2); + vector.setNull(4); + + writer.setPosition(1); + writer.startList(); + writer.bigInt().writeBigInt(10); + writer.bigInt().writeBigInt(20); + writer.bigInt().writeBigInt(30); + writer.endList(); + + writer.setPosition(3); + writer.startList(); + writer.bigInt().writeBigInt(40); + writer.bigInt().writeBigInt(50); + writer.endList(); + + writer.setPosition(5); + writer.startList(); + writer.bigInt().writeBigInt(60); + writer.bigInt().writeBigInt(70); + writer.bigInt().writeBigInt(80); + writer.endList(); + + vector.setValueCount(6); + + assertTrue(vector.isNull(0)); + assertFalse(vector.isNull(1)); + assertTrue(vector.isNull(2)); + assertFalse(vector.isNull(3)); + assertTrue(vector.isNull(4)); + assertFalse(vector.isNull(5)); + + // validate buffers + + final ArrowBuf validityBuffer = vector.getValidityBuffer(); + final ArrowBuf offsetBuffer = vector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = vector.getSizeBuffer(); + + assertEquals(0, BitVectorHelper.get(validityBuffer, 0)); + assertEquals(1, BitVectorHelper.get(validityBuffer, 1)); + assertEquals(0, BitVectorHelper.get(validityBuffer, 2)); + assertEquals(1, BitVectorHelper.get(validityBuffer, 3)); + assertEquals(0, BitVectorHelper.get(validityBuffer, 4)); + assertEquals(1, BitVectorHelper.get(validityBuffer, 5)); + + assertEquals(0, offsetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offsetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offsetBuffer.getInt(2 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offsetBuffer.getInt(3 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offsetBuffer.getInt(4 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(5, offsetBuffer.getInt(5 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + assertEquals(0, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(3, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(2 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(2, sizeBuffer.getInt(3 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(4 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(3, sizeBuffer.getInt(5 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + // validate values + + Object result = vector.getObject(1); + ArrayList resultSet = (ArrayList) result; + assertEquals(3, resultSet.size()); + assertEquals(Long.valueOf(10), resultSet.get(0)); + assertEquals(Long.valueOf(20), resultSet.get(1)); + assertEquals(Long.valueOf(30), resultSet.get(2)); + + result = vector.getObject(3); + resultSet 
= (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(40), resultSet.get(0)); + assertEquals(Long.valueOf(50), resultSet.get(1)); + + result = vector.getObject(5); + resultSet = (ArrayList) result; + assertEquals(3, resultSet.size()); + assertEquals(Long.valueOf(60), resultSet.get(0)); + assertEquals(Long.valueOf(70), resultSet.get(1)); + assertEquals(Long.valueOf(80), resultSet.get(2)); + + vector.validate(); + } + } + + @Test + public void testSetNull3() { + try (ListViewVector vector = ListViewVector.empty("listview", allocator)) { + // validate setting values first and then writing nulls + UnionListViewWriter writer = vector.getWriter(); + writer.allocate(); + + writer.setPosition(1); + writer.startList(); + writer.bigInt().writeBigInt(10); + writer.bigInt().writeBigInt(20); + writer.bigInt().writeBigInt(30); + writer.endList(); + + writer.setPosition(3); + writer.startList(); + writer.bigInt().writeBigInt(40); + writer.bigInt().writeBigInt(50); + writer.endList(); + + writer.setPosition(5); + writer.startList(); + writer.bigInt().writeBigInt(60); + writer.bigInt().writeBigInt(70); + writer.bigInt().writeBigInt(80); + writer.endList(); + + vector.setNull(0); + vector.setNull(2); + vector.setNull(4); + + vector.setValueCount(6); + + assertTrue(vector.isNull(0)); + assertFalse(vector.isNull(1)); + assertTrue(vector.isNull(2)); + assertFalse(vector.isNull(3)); + assertTrue(vector.isNull(4)); + assertFalse(vector.isNull(5)); + + // validate buffers + + final ArrowBuf validityBuffer = vector.getValidityBuffer(); + final ArrowBuf offsetBuffer = vector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = vector.getSizeBuffer(); + + assertEquals(0, BitVectorHelper.get(validityBuffer, 0)); + assertEquals(1, BitVectorHelper.get(validityBuffer, 1)); + assertEquals(0, BitVectorHelper.get(validityBuffer, 2)); + assertEquals(1, BitVectorHelper.get(validityBuffer, 3)); + assertEquals(0, BitVectorHelper.get(validityBuffer, 4)); + assertEquals(1, BitVectorHelper.get(validityBuffer, 5)); + + assertEquals(0, offsetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offsetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offsetBuffer.getInt(2 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offsetBuffer.getInt(3 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offsetBuffer.getInt(4 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(5, offsetBuffer.getInt(5 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + assertEquals(0, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(3, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(2 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(2, sizeBuffer.getInt(3 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(4 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(3, sizeBuffer.getInt(5 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + // validate values + + Object result = vector.getObject(1); + ArrayList resultSet = (ArrayList) result; + assertEquals(3, resultSet.size()); + assertEquals(Long.valueOf(10), resultSet.get(0)); + assertEquals(Long.valueOf(20), resultSet.get(1)); + assertEquals(Long.valueOf(30), resultSet.get(2)); + + result = vector.getObject(3); + resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(40), resultSet.get(0)); + assertEquals(Long.valueOf(50), 
resultSet.get(1)); + + result = vector.getObject(5); + resultSet = (ArrayList) result; + assertEquals(3, resultSet.size()); + assertEquals(Long.valueOf(60), resultSet.get(0)); + assertEquals(Long.valueOf(70), resultSet.get(1)); + assertEquals(Long.valueOf(80), resultSet.get(2)); + + vector.validate(); + } + } + + @Test + public void testOverWrite1() { + try (ListViewVector vector = ListViewVector.empty("listview", allocator)) { + UnionListViewWriter writer = vector.getWriter(); + writer.allocate(); + + writer.setPosition(0); + writer.startList(); + writer.bigInt().writeBigInt(10); + writer.bigInt().writeBigInt(20); + writer.bigInt().writeBigInt(30); + writer.endList(); + + writer.setPosition(1); + writer.startList(); + writer.bigInt().writeBigInt(40); + writer.bigInt().writeBigInt(50); + writer.endList(); + + vector.setValueCount(2); + + writer.setPosition(0); + writer.startList(); + writer.bigInt().writeBigInt(60); + writer.bigInt().writeBigInt(70); + writer.endList(); + + writer.setPosition(1); + writer.startList(); + writer.bigInt().writeBigInt(80); + writer.bigInt().writeBigInt(90); + writer.endList(); + + vector.setValueCount(2); + + Object result = vector.getObject(0); + ArrayList resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(60), resultSet.get(0)); + assertEquals(Long.valueOf(70), resultSet.get(1)); + + result = vector.getObject(1); + resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(80), resultSet.get(0)); + assertEquals(Long.valueOf(90), resultSet.get(1)); + + vector.validate(); + } + } + + @Test + public void testOverwriteWithNull() { + try (ListViewVector vector = ListViewVector.empty("listview", allocator)) { + UnionListViewWriter writer = vector.getWriter(); + writer.allocate(); + + ArrowBuf offsetBuffer = vector.getOffsetBuffer(); + ArrowBuf sizeBuffer = vector.getSizeBuffer(); + + writer.setPosition(0); + writer.startList(); + writer.bigInt().writeBigInt(10); + writer.bigInt().writeBigInt(20); + writer.bigInt().writeBigInt(30); + writer.endList(); + + writer.setPosition(1); + writer.startList(); + writer.bigInt().writeBigInt(40); + writer.bigInt().writeBigInt(50); + writer.endList(); + + vector.setValueCount(2); + + assertEquals(0, offsetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offsetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + assertEquals(3, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(2, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + vector.setNull(0); + + assertEquals(0, offsetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + vector.setNull(1); + + assertEquals(0, offsetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + assertTrue(vector.isNull(0)); + assertTrue(vector.isNull(1)); + + writer.setPosition(0); + writer.startList(); + writer.bigInt().writeBigInt(60); + writer.bigInt().writeBigInt(70); + writer.endList(); + + assertEquals(0, offsetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(2, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + writer.setPosition(1); + writer.startList(); + writer.bigInt().writeBigInt(80); + writer.bigInt().writeBigInt(90); + writer.endList(); + + assertEquals(2, offsetBuffer.getInt(1 * 
BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(2, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + vector.setValueCount(2); + + assertFalse(vector.isNull(0)); + assertFalse(vector.isNull(1)); + + Object result = vector.getObject(0); + ArrayList resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(60), resultSet.get(0)); + assertEquals(Long.valueOf(70), resultSet.get(1)); + + result = vector.getObject(1); + resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(80), resultSet.get(0)); + assertEquals(Long.valueOf(90), resultSet.get(1)); + + vector.validate(); + } + } + + @Test + public void testOutOfOrderOffset1() { + // [[12, -7, 25], null, [0, -127, 127, 50], [], [50, 12]] + try (ListViewVector listViewVector = ListViewVector.empty("listview", allocator)) { + // Allocate buffers in listViewVector by calling `allocateNew` method. + listViewVector.allocateNew(); + + // Initialize the child vector using `initializeChildrenFromFields` method. + + FieldType fieldType = new FieldType(true, new ArrowType.Int(16, true), + null, null); + Field field = new Field("child-vector", fieldType, null); + listViewVector.initializeChildrenFromFields(Collections.singletonList(field)); + + // Set values in the child vector. + FieldVector fieldVector = listViewVector.getDataVector(); + fieldVector.clear(); + + SmallIntVector childVector = (SmallIntVector) fieldVector; + + childVector.allocateNew(7); + + childVector.set(0, 0); + childVector.set(1, -127); + childVector.set(2, 127); + childVector.set(3, 50); + childVector.set(4, 12); + childVector.set(5, -7); + childVector.set(6, 25); + + childVector.setValueCount(7); + + // Set validity, offset and size buffers using `setValidity`, + // `setOffset` and `setSize` methods. + listViewVector.setValidity(0, 1); + listViewVector.setValidity(1, 0); + listViewVector.setValidity(2, 1); + listViewVector.setValidity(3, 1); + listViewVector.setValidity(4, 1); + + listViewVector.setOffset(0, 4); + listViewVector.setOffset(1, 7); + listViewVector.setOffset(2, 0); + listViewVector.setOffset(3, 0); + listViewVector.setOffset(4, 3); + + listViewVector.setSize(0, 3); + listViewVector.setSize(1, 0); + listViewVector.setSize(2, 4); + listViewVector.setSize(3, 0); + listViewVector.setSize(4, 2); + + // Set value count using `setValueCount` method. 
+ listViewVector.setValueCount(5); + + final ArrowBuf offSetBuffer = listViewVector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = listViewVector.getSizeBuffer(); + + // check offset buffer + assertEquals(4, offSetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(7, offSetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offSetBuffer.getInt(2 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offSetBuffer.getInt(3 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offSetBuffer.getInt(4 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + // check size buffer + assertEquals(3, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(4, sizeBuffer.getInt(2 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(3 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(2, sizeBuffer.getInt(4 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + // check child vector + assertEquals(0, ((SmallIntVector) listViewVector.getDataVector()).get(0)); + assertEquals(-127, ((SmallIntVector) listViewVector.getDataVector()).get(1)); + assertEquals(127, ((SmallIntVector) listViewVector.getDataVector()).get(2)); + assertEquals(50, ((SmallIntVector) listViewVector.getDataVector()).get(3)); + assertEquals(12, ((SmallIntVector) listViewVector.getDataVector()).get(4)); + assertEquals(-7, ((SmallIntVector) listViewVector.getDataVector()).get(5)); + assertEquals(25, ((SmallIntVector) listViewVector.getDataVector()).get(6)); + + // check values + Object result = listViewVector.getObject(0); + ArrayList resultSet = (ArrayList) result; + assertEquals(3, resultSet.size()); + assertEquals(Short.valueOf("12"), resultSet.get(0)); + assertEquals(Short.valueOf("-7"), resultSet.get(1)); + assertEquals(Short.valueOf("25"), resultSet.get(2)); + + assertTrue(listViewVector.isNull(1)); + + result = listViewVector.getObject(2); + resultSet = (ArrayList) result; + assertEquals(4, resultSet.size()); + assertEquals(Short.valueOf("0"), resultSet.get(0)); + assertEquals(Short.valueOf("-127"), resultSet.get(1)); + assertEquals(Short.valueOf("127"), resultSet.get(2)); + assertEquals(Short.valueOf("50"), resultSet.get(3)); + + assertTrue(listViewVector.isEmpty(3)); + + result = listViewVector.getObject(4); + resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Short.valueOf("50"), resultSet.get(0)); + assertEquals(Short.valueOf("12"), resultSet.get(1)); + + listViewVector.validate(); + } + } + + private void writeIntValues(UnionListViewWriter writer, int[] values) { + writer.startList(); + for (int v: values) { + writer.integer().writeInt(v); + } + writer.endList(); + } + +} From 07a30d9a5784852187d100660325b8c12b4ff6c8 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Thu, 16 May 2024 03:30:14 -0800 Subject: [PATCH 104/105] GH-41611: [Docs][CI] Enable most sphinx-lint rules for documentation (#41612) ### Rationale for this change https://github.com/apache/arrow/issues/41611 ### What changes are included in this PR? - Update to pre-commit config to enable all checks except `dangling-hyphen`, `line-too-long` by default - Associated fix docs ### Are these changes tested? Yes, by building and looking at the docs locally. ### Are there any user-facing changes? Just docs. 
* GitHub Issue: #41611 Authored-by: Bryce Mecum Signed-off-by: AlenkaF --- .pre-commit-config.yaml | 10 +++++-- docs/source/conf.py | 2 +- docs/source/cpp/acero/developer_guide.rst | 10 +++---- docs/source/cpp/acero/overview.rst | 26 +++++++++---------- docs/source/cpp/acero/user_guide.rst | 8 +++--- docs/source/cpp/build_system.rst | 2 +- docs/source/cpp/compute.rst | 18 ++++++------- docs/source/developers/cpp/building.rst | 2 +- docs/source/developers/documentation.rst | 2 +- .../guide/step_by_step/arrow_codebase.rst | 4 +-- .../developers/guide/step_by_step/set_up.rst | 8 +++--- docs/source/developers/java/development.rst | 2 +- docs/source/developers/release.rst | 4 +-- docs/source/format/CanonicalExtensions.rst | 4 +-- docs/source/format/Columnar.rst | 6 ++--- docs/source/format/FlightSql.rst | 2 +- docs/source/format/Integration.rst | 2 +- docs/source/java/algorithm.rst | 2 +- docs/source/java/flight_sql_jdbc_driver.rst | 2 +- docs/source/java/install.rst | 2 +- docs/source/java/ipc.rst | 2 +- docs/source/java/quickstartguide.rst | 16 ++++++------ docs/source/java/substrait.rst | 20 +++++++------- docs/source/java/table.rst | 16 ++++++------ docs/source/python/api/compute.rst | 2 +- docs/source/python/data.rst | 4 +-- docs/source/python/extending_types.rst | 2 +- docs/source/python/filesystems.rst | 4 +-- docs/source/python/install.rst | 2 +- docs/source/python/integration/extending.rst | 2 +- docs/source/python/memory.rst | 2 +- docs/source/python/timestamps.rst | 2 +- 32 files changed, 99 insertions(+), 93 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bf5ca08d53c32..7dcc1c9816d12 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -136,5 +136,11 @@ repos: rev: v0.9.1 hooks: - id: sphinx-lint - files: ^docs/ - args: ['--disable', 'all', '--enable', 'trailing-whitespace,missing-final-newline', 'docs'] + files: ^docs/source + exclude: ^docs/source/python/generated + args: [ + '--enable', + 'all', + '--disable', + 'dangling-hyphen,line-too-long', + ] diff --git a/docs/source/conf.py b/docs/source/conf.py index b487200555a09..1e6c113e33188 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -535,7 +535,7 @@ # # latex_appendices = [] -# It false, will not define \strong, \code, itleref, \crossref ... but only +# It false, will not define \strong, \code, \titleref, \crossref ... but only # \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added # packages. # diff --git a/docs/source/cpp/acero/developer_guide.rst b/docs/source/cpp/acero/developer_guide.rst index 80ca68556fc40..7dd08fe3ce2ce 100644 --- a/docs/source/cpp/acero/developer_guide.rst +++ b/docs/source/cpp/acero/developer_guide.rst @@ -327,8 +327,8 @@ An engine could choose to create a thread task for every execution of a node. H this leads to problems with cache locality. For example, let's assume we have a basic plan consisting of three exec nodes, scan, project, and then filter (this is a very common use case). Now let's assume there are 100 batches. In a task-per-operator model we would have tasks like "Scan Batch 5", "Project Batch 5", and "Filter Batch 5". Each -of those tasks is potentially going to access the same data. For example, maybe the `project` and `filter` nodes need -to read the same column. A column which is intially created in a decode phase of the `scan` node. To maximize cache +of those tasks is potentially going to access the same data. For example, maybe the ``project`` and ``filter`` nodes need +to read the same column. 
A column which is intially created in a decode phase of the ``scan`` node. To maximize cache utilization we would need to carefully schedule our tasks to ensure that all three of those tasks are run consecutively and assigned to the same CPU core. @@ -412,7 +412,7 @@ Ordered Execution ================= Some nodes either establish an ordering to their outgoing batches or they need to be able to process batches in order. -Acero handles ordering using the `batch_index` property on an ExecBatch. If a node has a deterministic output order +Acero handles ordering using the ``batch_index`` property on an ExecBatch. If a node has a deterministic output order then it should apply a batch index on batches that it emits. For example, the OrderByNode applies a new ordering to batches (regardless of the incoming ordering). The scan node is able to attach an implicit ordering to batches which reflects the order of the rows in the files being scanned. @@ -461,8 +461,8 @@ Acero's tracing is currently half-implemented and there are major gaps in profil effort at tracing with open telemetry and most of the necessary pieces are in place. The main thing currently lacking is some kind of effective visualization of the tracing results. -In order to use the tracing that is present today you will need to build with Arrow with `ARROW_WITH_OPENTELEMETRY=ON`. -Then you will need to set the environment variable `ARROW_TRACING_BACKEND=otlp_http`. This will configure open telemetry +In order to use the tracing that is present today you will need to build with Arrow with ``ARROW_WITH_OPENTELEMETRY=ON``. +Then you will need to set the environment variable ``ARROW_TRACING_BACKEND=otlp_http``. This will configure open telemetry to export trace results (as OTLP) to the HTTP endpoint http://localhost:4318/v1/traces. You will need to configure an open telemetry collector to collect results on that endpoint and you will need to configure a trace viewer of some kind such as Jaeger: https://www.jaegertracing.io/docs/1.21/opentelemetry/ diff --git a/docs/source/cpp/acero/overview.rst b/docs/source/cpp/acero/overview.rst index 8be4cbc1b1772..34e0b143bc2ce 100644 --- a/docs/source/cpp/acero/overview.rst +++ b/docs/source/cpp/acero/overview.rst @@ -209,16 +209,16 @@ must have the same length. There are a few key differences from ExecBatch: Both the record batch and the exec batch have strong ownership of the arrays & buffers -* An `ExecBatch` does not have a schema. This is because an `ExecBatch` is assumed to be +* An ``ExecBatch`` does not have a schema. This is because an ``ExecBatch`` is assumed to be part of a stream of batches and the stream is assumed to have a consistent schema. So - the schema for an `ExecBatch` is typically stored in the ExecNode. -* Columns in an `ExecBatch` are either an `Array` or a `Scalar`. When a column is a `Scalar` - this means that the column has a single value for every row in the batch. An `ExecBatch` + the schema for an ``ExecBatch`` is typically stored in the ExecNode. +* Columns in an ``ExecBatch`` are either an ``Array`` or a ``Scalar``. When a column is a ``Scalar`` + this means that the column has a single value for every row in the batch. An ``ExecBatch`` also has a length property which describes how many rows are in a batch. So another way to - view a `Scalar` is a constant array with `length` elements. -* An `ExecBatch` contains additional information used by the exec plan. For example, an - `index` can be used to describe a batch's position in an ordered stream. 
We expect - that `ExecBatch` will also evolve to contain additional fields such as a selection vector. + view a ``Scalar`` is a constant array with ``length`` elements. +* An ``ExecBatch`` contains additional information used by the exec plan. For example, an + ``index`` can be used to describe a batch's position in an ordered stream. We expect + that ``ExecBatch`` will also evolve to contain additional fields such as a selection vector. .. figure:: scalar_vs_array.svg @@ -231,8 +231,8 @@ only zero copy if there are no scalars in the exec batch. .. note:: Both Acero and the compute module have "lightweight" versions of batches and arrays. - In the compute module these are called `BatchSpan`, `ArraySpan`, and `BufferSpan`. In - Acero the concept is called `KeyColumnArray`. These types were developed concurrently + In the compute module these are called ``BatchSpan``, ``ArraySpan``, and ``BufferSpan``. In + Acero the concept is called ``KeyColumnArray``. These types were developed concurrently and serve the same purpose. They aim to provide an array container that can be completely stack allocated (provided the data type is non-nested) in order to avoid heap allocation overhead. Ideally these two concepts will be merged someday. @@ -247,9 +247,9 @@ execution of the nodes. Both ExecPlan and ExecNode are tied to the lifecycle of They have state and are not expected to be restartable. .. warning:: - The structures within Acero, including `ExecBatch`, are still experimental. The `ExecBatch` - class should not be used outside of Acero. Instead, an `ExecBatch` should be converted to - a more standard structure such as a `RecordBatch`. + The structures within Acero, including ``ExecBatch``, are still experimental. The ``ExecBatch`` + class should not be used outside of Acero. Instead, an ``ExecBatch`` should be converted to + a more standard structure such as a ``RecordBatch``. Similarly, an ExecPlan is an internal concept. Users creating plans should be using Declaration objects. APIs for consuming and executing plans should abstract away the details of the underlying diff --git a/docs/source/cpp/acero/user_guide.rst b/docs/source/cpp/acero/user_guide.rst index adcc17216e5ae..0271be2180e99 100644 --- a/docs/source/cpp/acero/user_guide.rst +++ b/docs/source/cpp/acero/user_guide.rst @@ -455,8 +455,8 @@ can be selected from :ref:`this list of aggregation functions will be added which should alleviate this constraint. The aggregation can provide results as a group or scalar. For instances, -an operation like `hash_count` provides the counts per each unique record -as a grouped result while an operation like `sum` provides a single record. +an operation like ``hash_count`` provides the counts per each unique record +as a grouped result while an operation like ``sum`` provides a single record. Scalar Aggregation example: @@ -490,7 +490,7 @@ caller will repeatedly call this function until the generator function is exhaus will accumulate in memory. An execution plan should only have one "terminal" node (one sink node). An :class:`ExecPlan` can terminate early due to cancellation or an error, before the output is fully consumed. However, the plan can be safely destroyed independently -of the sink, which will hold the unconsumed batches by `exec_plan->finished()`. +of the sink, which will hold the unconsumed batches by ``exec_plan->finished()``. 
As a part of the Source Example, the Sink operation is also included; @@ -515,7 +515,7 @@ The consuming function may be called before a previous invocation has completed. function does not run quickly enough then many concurrent executions could pile up, blocking the CPU thread pool. The execution plan will not be marked finished until all consuming function callbacks have been completed. -Once all batches have been delivered the execution plan will wait for the `finish` future to complete +Once all batches have been delivered the execution plan will wait for the ``finish`` future to complete before marking the execution plan finished. This allows for workflows where the consumption function converts batches into async tasks (this is currently done internally for the dataset write node). diff --git a/docs/source/cpp/build_system.rst b/docs/source/cpp/build_system.rst index 0c94d7e5ce5dc..e80bca4c949dc 100644 --- a/docs/source/cpp/build_system.rst +++ b/docs/source/cpp/build_system.rst @@ -167,7 +167,7 @@ file into an executable linked with the Arrow C++ shared library: .. code-block:: makefile my_example: my_example.cc - $(CXX) -o $@ $(CXXFLAGS) $< $$(pkg-config --cflags --libs arrow) + $(CXX) -o $@ $(CXXFLAGS) $< $$(pkg-config --cflags --libs arrow) Many build systems support pkg-config. For example: diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 546b6e5716df7..701c7d573ac0e 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -514,8 +514,8 @@ Mixed time resolution temporal inputs will be cast to finest input resolution. +------------+---------------------------------------------+ It's compatible with Redshift's decimal promotion rules. All decimal digits - are preserved for `add`, `subtract` and `multiply` operations. The result - precision of `divide` is at least the sum of precisions of both operands with + are preserved for ``add``, ``subtract`` and ``multiply`` operations. The result + precision of ``divide`` is at least the sum of precisions of both operands with enough scale kept. Error is returned if the result precision is beyond the decimal value range. @@ -1029,7 +1029,7 @@ These functions trim off characters on both sides (trim), or the left (ltrim) or +--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+ * \(1) Only characters specified in :member:`TrimOptions::characters` will be - trimmed off. Both the input string and the `characters` argument are + trimmed off. Both the input string and the ``characters`` argument are interpreted as ASCII characters. * \(2) Only trim off ASCII whitespace characters (``'\t'``, ``'\n'``, ``'\v'``, @@ -1570,7 +1570,7 @@ is the same, even though the UTC years would be different. Timezone handling ~~~~~~~~~~~~~~~~~ -`assume_timezone` function is meant to be used when an external system produces +``assume_timezone`` function is meant to be used when an external system produces "timezone-naive" timestamps which need to be converted to "timezone-aware" timestamps (see for example the `definition `__ @@ -1581,11 +1581,11 @@ Input timestamps are assumed to be relative to the timezone given in UTC-relative timestamps with the timezone metadata set to the above value. An error is returned if the timestamps already have the timezone metadata set. 
-`local_timestamp` function converts UTC-relative timestamps to local "timezone-naive" +``local_timestamp`` function converts UTC-relative timestamps to local "timezone-naive" timestamps. The timezone is taken from the timezone metadata of the input -timestamps. This function is the inverse of `assume_timezone`. Please note: +timestamps. This function is the inverse of ``assume_timezone``. Please note: **all temporal functions already operate on timestamps as if they were in local -time of the metadata provided timezone**. Using `local_timestamp` is only meant to be +time of the metadata provided timezone**. Using ``local_timestamp`` is only meant to be used when an external system expects local timestamps. +-----------------+-------+-------------+---------------+---------------------------------+-------+ @@ -1649,8 +1649,8 @@ overflow is detected. * \(1) CumulativeOptions has two optional parameters. The first parameter :member:`CumulativeOptions::start` is a starting value for the running - accumulation. It has a default value of 0 for `sum`, 1 for `prod`, min of - input type for `max`, and max of input type for `min`. Specified values of + accumulation. It has a default value of 0 for ``sum``, 1 for ``prod``, min of + input type for ``max``, and max of input type for ``min``. Specified values of ``start`` must be castable to the input type. The second parameter :member:`CumulativeOptions::skip_nulls` is a boolean. When set to false (the default), the first encountered null is propagated. When set to diff --git a/docs/source/developers/cpp/building.rst b/docs/source/developers/cpp/building.rst index 7b80d2138c33e..b052b856c9bd5 100644 --- a/docs/source/developers/cpp/building.rst +++ b/docs/source/developers/cpp/building.rst @@ -312,7 +312,7 @@ depends on ``python`` being available). On some Linux distributions, running the test suite might require setting an explicit locale. If you see any locale-related errors, try setting the -environment variable (which requires the `locales` package or equivalent): +environment variable (which requires the ``locales`` package or equivalent): .. code-block:: diff --git a/docs/source/developers/documentation.rst b/docs/source/developers/documentation.rst index 8b1ea28c0f54b..a479065f6297e 100644 --- a/docs/source/developers/documentation.rst +++ b/docs/source/developers/documentation.rst @@ -259,7 +259,7 @@ Build the docs in the target directory: sphinx-build ./source/developers ./source/developers/_build -c ./source -D master_doc=temp_index This builds everything in the target directory to a folder inside of it -called ``_build`` using the config file in the `source` directory. +called ``_build`` using the config file in the ``source`` directory. Once you have verified the HTML documents, you can remove temporary index file: diff --git a/docs/source/developers/guide/step_by_step/arrow_codebase.rst b/docs/source/developers/guide/step_by_step/arrow_codebase.rst index 0beece991b197..0c194ab3a3f70 100644 --- a/docs/source/developers/guide/step_by_step/arrow_codebase.rst +++ b/docs/source/developers/guide/step_by_step/arrow_codebase.rst @@ -99,8 +99,8 @@ can be called from a function in another language. After a function is defined C++ we must create the binding manually to use it in that implementation. .. note:: - There is much you can learn by checking **Pull Requests** - and **unit tests** for similar issues. + There is much you can learn by checking **Pull Requests** + and **unit tests** for similar issues. .. 
tab-set:: diff --git a/docs/source/developers/guide/step_by_step/set_up.rst b/docs/source/developers/guide/step_by_step/set_up.rst index 9a2177568d6f5..9c808ceee7be6 100644 --- a/docs/source/developers/guide/step_by_step/set_up.rst +++ b/docs/source/developers/guide/step_by_step/set_up.rst @@ -118,10 +118,10 @@ Should give you a result similar to this: .. code:: console - origin https://github.com//arrow.git (fetch) - origin https://github.com//arrow.git (push) - upstream https://github.com/apache/arrow (fetch) - upstream https://github.com/apache/arrow (push) + origin https://github.com//arrow.git (fetch) + origin https://github.com//arrow.git (push) + upstream https://github.com/apache/arrow (fetch) + upstream https://github.com/apache/arrow (push) If you did everything correctly, you should now have a copy of the code in the ``arrow`` directory and two remotes that refer to your own GitHub diff --git a/docs/source/developers/java/development.rst b/docs/source/developers/java/development.rst index 17d47c324ce12..3f0ff6cdd0103 100644 --- a/docs/source/developers/java/development.rst +++ b/docs/source/developers/java/development.rst @@ -118,7 +118,7 @@ This checks the code style of all source code under the current directory or fro $ mvn checkstyle:check -Maven `pom.xml` style is enforced with Spotless using `Apache Maven pom.xml guidelines`_ +Maven ``pom.xml`` style is enforced with Spotless using `Apache Maven pom.xml guidelines`_ You can also just check the style without building the project. This checks the style of all pom.xml files under the current directory or from within an individual module. diff --git a/docs/source/developers/release.rst b/docs/source/developers/release.rst index 0b3a83dc5aabe..d903cc71bd5c4 100644 --- a/docs/source/developers/release.rst +++ b/docs/source/developers/release.rst @@ -106,7 +106,7 @@ If there is consensus and there is a Release Manager willing to take the effort the release a patch release can be created. Committers can tag issues that should be included on the next patch release using the -`backport-candidate` label. Is the responsability of the author or the committer to add the +``backport-candidate`` label. Is the responsability of the author or the committer to add the label to the issue to help the Release Manager identify the issues that should be backported. If a specific issue is identified as the reason to create a patch release the Release Manager @@ -117,7 +117,7 @@ Be sure to go through on the following checklist: #. Create milestone #. Create maintenance branch #. Include issue that was requested as requiring new patch release -#. Add new milestone to issues with `backport-candidate` label +#. Add new milestone to issues with ``backport-candidate`` label #. cherry-pick issues into maintenance branch Creating a Release Candidate diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index c60f095dd354d..c258f889dc6ac 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -77,7 +77,7 @@ Official List Fixed shape tensor ================== -* Extension name: `arrow.fixed_shape_tensor`. +* Extension name: ``arrow.fixed_shape_tensor``. * The storage type of the extension: ``FixedSizeList`` where: @@ -153,7 +153,7 @@ Fixed shape tensor Variable shape tensor ===================== -* Extension name: `arrow.variable_shape_tensor`. +* Extension name: ``arrow.variable_shape_tensor``. 
* The storage type of the extension is: ``StructArray`` where struct is composed of **data** and **shape** fields describing a single diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index ec6a7fa5e334a..7c853de7829be 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -312,7 +312,7 @@ Each value in this layout consists of 0 or more bytes. While primitive arrays have a single values buffer, variable-size binary have an **offsets** buffer and **data** buffer. -The offsets buffer contains `length + 1` signed integers (either +The offsets buffer contains ``length + 1`` signed integers (either 32-bit or 64-bit, depending on the logical type), which encode the start position of each slot in the data buffer. The length of the value in each slot is computed using the difference between the offset @@ -374,7 +374,7 @@ locations are indicated using a **views** buffer, which may point to one of potentially several **data** buffers or may contain the characters inline. -The views buffer contains `length` view structures with the following layout: +The views buffer contains ``length`` view structures with the following layout: :: @@ -394,7 +394,7 @@ should be interpreted. In the short string case the string's bytes are inlined — stored inside the view itself, in the twelve bytes which follow the length. Any remaining bytes -after the string itself are padded with `0`. +after the string itself are padded with ``0``. In the long string case, a buffer index indicates which data buffer stores the data bytes and an offset indicates where in that buffer the diff --git a/docs/source/format/FlightSql.rst b/docs/source/format/FlightSql.rst index 9c3523755f3ae..b4b85e77a2e5f 100644 --- a/docs/source/format/FlightSql.rst +++ b/docs/source/format/FlightSql.rst @@ -193,7 +193,7 @@ in the ``app_metadata`` field of the Flight RPC ``PutResult`` returned. When used with DoPut: load the stream of Arrow record batches into the specified target table and return the number of rows ingested - via a `DoPutUpdateResult` message. + via a ``DoPutUpdateResult`` message. Flight Server Session Management -------------------------------- diff --git a/docs/source/format/Integration.rst b/docs/source/format/Integration.rst index c800255687796..436747989acf3 100644 --- a/docs/source/format/Integration.rst +++ b/docs/source/format/Integration.rst @@ -501,7 +501,7 @@ integration testing actually tests. There are two types of integration test cases: the ones populated on the fly by the data generator in the Archery utility, and *gold* files that exist -in the `arrow-testing ` +in the `arrow-testing `_ repository. Data Generator Tests diff --git a/docs/source/java/algorithm.rst b/docs/source/java/algorithm.rst index 06ed32bd48cf7..d4838967d614f 100644 --- a/docs/source/java/algorithm.rst +++ b/docs/source/java/algorithm.rst @@ -82,7 +82,7 @@ for fixed width and variable width vectors, respectively. Both algorithms run in 3. **Index sorter**: this sorter does not actually sort the vector. Instead, it returns an integer vector, which correspond to indices of vector elements in sorted order. With the index vector, one can -easily construct a sorted vector. In addition, some other tasks can be easily achieved, like finding the ``k``th +easily construct a sorted vector. In addition, some other tasks can be easily achieved, like finding the ``k`` th smallest value in the vector. 
Index sorting is supported by ``org.apache.arrow.algorithm.sort.IndexSorter``, which runs in ``O(nlog(n))`` time. It is applicable to vectors of any type. diff --git a/docs/source/java/flight_sql_jdbc_driver.rst b/docs/source/java/flight_sql_jdbc_driver.rst index cc8822247b007..f95c2ac755d97 100644 --- a/docs/source/java/flight_sql_jdbc_driver.rst +++ b/docs/source/java/flight_sql_jdbc_driver.rst @@ -162,7 +162,7 @@ the Flight SQL service as gRPC headers. For example, the following URI :: This will connect without authentication or encryption, to a Flight SQL service running on ``localhost`` on port 12345. Each request will -also include a `database=mydb` gRPC header. +also include a ``database=mydb`` gRPC header. Connection parameters may also be supplied using the Properties object when using the JDBC Driver Manager to connect. When supplying using diff --git a/docs/source/java/install.rst b/docs/source/java/install.rst index a551edc36c477..dc6a55c87fcd6 100644 --- a/docs/source/java/install.rst +++ b/docs/source/java/install.rst @@ -63,7 +63,7 @@ Modifying the command above for Flight: Otherwise, you may see errors like ``java.lang.IllegalAccessError: superclass access check failed: class org.apache.arrow.flight.ArrowMessage$ArrowBufRetainingCompositeByteBuf (in module org.apache.arrow.flight.core) cannot access class io.netty.buffer.CompositeByteBuf (in unnamed module ...) because module -org.apache.arrow.flight.core does not read unnamed module ... +org.apache.arrow.flight.core does not read unnamed module ...`` Finally, if you are using arrow-dataset, you'll also need to report that JDK internals need to be exposed. Modifying the command above for arrow-memory: diff --git a/docs/source/java/ipc.rst b/docs/source/java/ipc.rst index 01341ff2cc391..f5939179177d5 100644 --- a/docs/source/java/ipc.rst +++ b/docs/source/java/ipc.rst @@ -81,7 +81,7 @@ Here we used an in-memory stream, but this could have been a socket or some othe writer.end(); Note that, since the :class:`VectorSchemaRoot` in the writer is a container that can hold batches, batches flow through -:class:`VectorSchemaRoot` as part of a pipeline, so we need to populate data before `writeBatch`, so that later batches +:class:`VectorSchemaRoot` as part of a pipeline, so we need to populate data before ``writeBatch``, so that later batches could overwrite previous ones. Now the :class:`ByteArrayOutputStream` contains the complete stream which contains 5 record batches. diff --git a/docs/source/java/quickstartguide.rst b/docs/source/java/quickstartguide.rst index a71ddc5b5e55f..1f3ec861d3f46 100644 --- a/docs/source/java/quickstartguide.rst +++ b/docs/source/java/quickstartguide.rst @@ -195,10 +195,10 @@ Example: Create a dataset of names (strings) and ages (32-bit signed integers). .. code-block:: shell VectorSchemaRoot created: - age name - 10 Dave - 20 Peter - 30 Mary + age name + 10 Dave + 20 Peter + 30 Mary Interprocess Communication (IPC) @@ -306,10 +306,10 @@ Example: Read the dataset from the previous example from an Arrow IPC file (rand Record batches in file: 1 VectorSchemaRoot read: - age name - 10 Dave - 20 Peter - 30 Mary + age name + 10 Dave + 20 Peter + 30 Mary More examples available at `Arrow Java Cookbook`_. diff --git a/docs/source/java/substrait.rst b/docs/source/java/substrait.rst index c5857dcc23f75..fa20dbd61dbfb 100644 --- a/docs/source/java/substrait.rst +++ b/docs/source/java/substrait.rst @@ -100,9 +100,9 @@ Here is an example of a Java program that queries a Parquet file using Java Subs .. 
code-block:: text // Results example: - FieldPath(0) FieldPath(1) FieldPath(2) FieldPath(3) - 0 ALGERIA 0 haggle. carefully final deposits detect slyly agai - 1 ARGENTINA 1 al foxes promise slyly according to the regular accounts. bold requests alon + FieldPath(0) FieldPath(1) FieldPath(2) FieldPath(3) + 0 ALGERIA 0 haggle. carefully final deposits detect slyly agai + 1 ARGENTINA 1 al foxes promise slyly according to the regular accounts. bold requests alon Executing Projections and Filters Using Extended Expressions ============================================================ @@ -189,13 +189,13 @@ This Java program: .. code-block:: text - column-1 column-2 - 13 ROMANIA - ular asymptotes are about the furious multipliers. express dependencies nag above the ironically ironic account - 14 SAUDI ARABIA - ts. silent requests haggle. closely express packages sleep across the blithely - 12 VIETNAM - hely enticingly express accounts. even, final - 13 RUSSIA - requests against the platelets use never according to the quickly regular pint - 13 UNITED KINGDOM - eans boost carefully special requests. accounts are. carefull - 11 UNITED STATES - y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be + column-1 column-2 + 13 ROMANIA - ular asymptotes are about the furious multipliers. express dependencies nag above the ironically ironic account + 14 SAUDI ARABIA - ts. silent requests haggle. closely express packages sleep across the blithely + 12 VIETNAM - hely enticingly express accounts. even, final + 13 RUSSIA - requests against the platelets use never according to the quickly regular pint + 13 UNITED KINGDOM - eans boost carefully special requests. accounts are. carefull + 11 UNITED STATES - y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be .. _`Substrait`: https://substrait.io/ .. _`Substrait Java`: https://github.com/substrait-io/substrait-java diff --git a/docs/source/java/table.rst b/docs/source/java/table.rst index 603910f51694f..5aa95e153cea0 100644 --- a/docs/source/java/table.rst +++ b/docs/source/java/table.rst @@ -75,7 +75,7 @@ Tables are created from a ``VectorSchemaRoot`` as shown below. The memory buffer Table t = new Table(someVectorSchemaRoot); -If you now update the vectors held by the ``VectorSchemaRoot`` (using some version of `ValueVector#setSafe()`), it would reflect those changes, but the values in table *t* are unchanged. +If you now update the vectors held by the ``VectorSchemaRoot`` (using some version of ``ValueVector#setSafe()``), it would reflect those changes, but the values in table *t* are unchanged. Creating a Table from FieldVectors ********************************** @@ -243,7 +243,7 @@ It is important to recognize that rows are NOT reified as objects, but rather op Getting a row ************* -Calling `immutableRow()` on any table instance returns a new ``Row`` instance. +Calling ``immutableRow()`` on any table instance returns a new ``Row`` instance. .. code-block:: Java @@ -262,7 +262,7 @@ Since rows are iterable, you can traverse a table using a standard while loop: // do something useful here } -``Table`` implements `Iterable` so you can access rows directly from a table in an enhanced *for* loop: +``Table`` implements ``Iterable`` so you can access rows directly from a table in an enhanced *for* loop: .. code-block:: Java @@ -272,7 +272,7 @@ Since rows are iterable, you can traverse a table using a standard while loop: ... 
} -Finally, while rows are usually iterated in the order of the underlying data vectors, but they are also positionable using the `Row#setPosition()` method, so you can skip to a specific row. Row numbers are 0-based. +Finally, while rows are usually iterated in the order of the underlying data vectors, but they are also positionable using the ``Row#setPosition()`` method, so you can skip to a specific row. Row numbers are 0-based. .. code-block:: Java @@ -281,7 +281,7 @@ Finally, while rows are usually iterated in the order of the underlying data vec Any changes to position are applied to all the columns in the table. -Note that you must call `next()`, or `setPosition()` before accessing values via a row. Failure to do so results in a runtime exception. +Note that you must call ``next()``, or ``setPosition()`` before accessing values via a row. Failure to do so results in a runtime exception. Read operations using rows ************************** @@ -304,7 +304,7 @@ You can also get value using a nullable ``ValueHolder``. For example: This can be used to retrieve values without creating a new Object for each. -In addition to getting values, you can check if a value is null using `isNull()`. This is important if the vector contains any nulls, as asking for a value from a vector can cause NullPointerExceptions in some cases. +In addition to getting values, you can check if a value is null using ``isNull()``. This is important if the vector contains any nulls, as asking for a value from a vector can cause NullPointerExceptions in some cases. .. code-block:: Java @@ -352,13 +352,13 @@ Working with the C-Data interface The ability to work with native code is required for many Arrow features. This section describes how tables can be be exported for use with native code -Exporting works by converting the data to a ``VectorSchemaRoot`` instance and using the existing facilities to transfer the data. You could do it yourself, but that isn't ideal because conversion to a vector schema root breaks the immutability guarantees. Using the `exportTable()` methods in the `Data`_ class avoids this concern. +Exporting works by converting the data to a ``VectorSchemaRoot`` instance and using the existing facilities to transfer the data. You could do it yourself, but that isn't ideal because conversion to a vector schema root breaks the immutability guarantees. Using the ``exportTable()`` methods in the `Data`_ class avoids this concern. .. code-block:: Java Data.exportTable(bufferAllocator, table, dictionaryProvider, outArrowArray); -If the table contains dictionary-encoded vectors and was constructed with a ``DictionaryProvider``, the provider argument to `exportTable()` can be omitted and the table's provider attribute will be used: +If the table contains dictionary-encoded vectors and was constructed with a ``DictionaryProvider``, the provider argument to ``exportTable()`` can be omitted and the table's provider attribute will be used: .. code-block:: Java diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index f2ac6bd1e1226..5423eebfbab40 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -173,7 +173,7 @@ variants which detect domain errors where appropriate. Comparisons ----------- -These functions expect two inputs of the same type. If one of the inputs is `null` +These functions expect two inputs of the same type. If one of the inputs is ``null`` they return ``null``. .. 
autosummary:: diff --git a/docs/source/python/data.rst b/docs/source/python/data.rst index 9156157fcd0c2..f17475138c9a4 100644 --- a/docs/source/python/data.rst +++ b/docs/source/python/data.rst @@ -76,7 +76,7 @@ We use the name **logical type** because the **physical** storage may be the same for one or more types. For example, ``int64``, ``float64``, and ``timestamp[ms]`` all occupy 64 bits per value. -These objects are `metadata`; they are used for describing the data in arrays, +These objects are ``metadata``; they are used for describing the data in arrays, schemas, and record batches. In Python, they can be used in functions where the input data (e.g. Python objects) may be coerced to more than one Arrow type. @@ -99,7 +99,7 @@ types' children. For example, we can define a list of int32 values with: t6 = pa.list_(t1) t6 -A `struct` is a collection of named fields: +A ``struct`` is a collection of named fields: .. ipython:: python diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index 8df0ef0b1fe99..83fce84f47c08 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -101,7 +101,7 @@ define the ``__arrow_array__`` method to return an Arrow array:: import pyarrow return pyarrow.array(..., type=type) -The ``__arrow_array__`` method takes an optional `type` keyword which is passed +The ``__arrow_array__`` method takes an optional ``type`` keyword which is passed through from :func:`pyarrow.array`. The method is allowed to return either a :class:`~pyarrow.Array` or a :class:`~pyarrow.ChunkedArray`. diff --git a/docs/source/python/filesystems.rst b/docs/source/python/filesystems.rst index 22f983a60c349..23d10aaaad720 100644 --- a/docs/source/python/filesystems.rst +++ b/docs/source/python/filesystems.rst @@ -182,7 +182,7 @@ Example how you can read contents from a S3 bucket:: Note that it is important to configure :class:`S3FileSystem` with the correct -region for the bucket being used. If `region` is not set, the AWS SDK will +region for the bucket being used. If ``region`` is not set, the AWS SDK will choose a value, defaulting to 'us-east-1' if the SDK version is <1.8. Otherwise it will try to use a variety of heuristics (environment variables, configuration profile, EC2 metadata server) to resolve the region. @@ -277,7 +277,7 @@ load time, since the library may not be in your LD_LIBRARY_PATH), and relies on some environment variables. * ``HADOOP_HOME``: the root of your installed Hadoop distribution. Often has - `lib/native/libhdfs.so`. + ``lib/native/libhdfs.so``. * ``JAVA_HOME``: the location of your Java SDK installation. diff --git a/docs/source/python/install.rst b/docs/source/python/install.rst index 4b966e6d2653d..12555c93067f9 100644 --- a/docs/source/python/install.rst +++ b/docs/source/python/install.rst @@ -83,7 +83,7 @@ While Arrow uses the OS-provided timezone database on Linux and macOS, it requir user-provided database on Windows. To download and extract the text version of the IANA timezone database follow the instructions in the C++ :ref:`download-timezone-database` or use pyarrow utility function -`pyarrow.util.download_tzdata_on_windows()` that does the same. +``pyarrow.util.download_tzdata_on_windows()`` that does the same. By default, the timezone database will be detected at ``%USERPROFILE%\Downloads\tzdata``. 
If the database has been downloaded in a different location, you will need to set diff --git a/docs/source/python/integration/extending.rst b/docs/source/python/integration/extending.rst index b380fea7e902c..d4d099bcf43c8 100644 --- a/docs/source/python/integration/extending.rst +++ b/docs/source/python/integration/extending.rst @@ -474,7 +474,7 @@ Toolchain Compatibility (Linux) The Python wheels for Linux are built using the `PyPA manylinux images `_ which use -the CentOS `devtoolset-9`. In addition to the other notes +the CentOS ``devtoolset-9``. In addition to the other notes above, if you are compiling C++ using these shared libraries, you will need to make sure you use a compatible toolchain as well or you might see a segfault during runtime. diff --git a/docs/source/python/memory.rst b/docs/source/python/memory.rst index 23474b923718d..7b49d48ab20fa 100644 --- a/docs/source/python/memory.rst +++ b/docs/source/python/memory.rst @@ -46,7 +46,7 @@ parent-child relationships. There are many implementations of ``arrow::Buffer``, but they all provide a standard interface: a data pointer and length. This is similar to Python's -built-in `buffer protocol` and ``memoryview`` objects. +built-in ``buffer protocol`` and ``memoryview`` objects. A :class:`Buffer` can be created from any Python object implementing the buffer protocol by calling the :func:`py_buffer` function. Let's consider diff --git a/docs/source/python/timestamps.rst b/docs/source/python/timestamps.rst index cecbd5b595bc7..80a1b7280cbfa 100644 --- a/docs/source/python/timestamps.rst +++ b/docs/source/python/timestamps.rst @@ -24,7 +24,7 @@ Arrow/Pandas Timestamps Arrow timestamps are stored as a 64-bit integer with column metadata to associate a time unit (e.g. milliseconds, microseconds, or nanoseconds), and an -optional time zone. Pandas (`Timestamp`) uses a 64-bit integer representing +optional time zone. Pandas (``Timestamp``) uses a 64-bit integer representing nanoseconds and an optional time zone. Python/Pandas timestamp types without a associated time zone are referred to as "Time Zone Naive". Python/Pandas timestamp types with an associated time zone are From 1c546fb3c130fc6a4f3e06ad31dc49d923785104 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 16 May 2024 14:15:57 +0200 Subject: [PATCH 105/105] GH-41480: [Python] Building PyArrow: enable/disable python components by default based on availability in Arrow C++ (#41494) ### Rationale for this change Currently, when building pyarrow from source, one needs to manually enable the optional components through setting `PYARROW_WITH_...` environment variables. However, we could also make a default choice of components based on which ones were enabled in the Arrow C++ build. ### What changes are included in this PR? Set defaults for the various `PYARROW_BUILD_` based on the `ARROW_` setting. Keep the current `PYARROW_WITH_` environment variables working to allow overriding this default. ### Are there any user-facing changes? 
No * GitHub Issue: #41480 Lead-authored-by: Joris Van den Bossche Co-authored-by: Sutou Kouhei Signed-off-by: Joris Van den Bossche --- ci/appveyor-cpp-build.bat | 1 - python/CMakeLists.txt | 115 +++++++++++++++++++++++--------- python/setup.py | 134 +++++++++++--------------------------- 3 files changed, 123 insertions(+), 127 deletions(-) diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index 8cfa67c437264..f688fbb63a9ad 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -129,7 +129,6 @@ set PYARROW_WITH_ORC=%ARROW_ORC% set PYARROW_WITH_PARQUET=ON set PYARROW_WITH_PARQUET_ENCRYPTION=ON set PYARROW_WITH_S3=%ARROW_S3% -set PYARROW_WITH_STATIC_BOOST=ON set PYARROW_WITH_SUBSTRAIT=ON set ARROW_HOME=%CONDA_PREFIX%\Library diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 212862357ace2..07acb9e31a731 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -108,25 +108,6 @@ if(UNIX) endif() endif() -# Top level cmake dir -if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") - option(PYARROW_BUILD_ACERO "Build the PyArrow Acero integration" OFF) - option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" OFF) - option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" OFF) - option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" OFF) - option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" OFF) - option(PYARROW_BUILD_ORC "Build the PyArrow ORC integration" OFF) - option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF) - option(PYARROW_BUILD_PARQUET_ENCRYPTION - "Build the PyArrow Parquet encryption integration" OFF) - option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF) - option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF) - option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF) - set(PYARROW_CXXFLAGS - "" - CACHE STRING "Compiler flags to append when compiling Arrow") -endif() - find_program(CCACHE_FOUND ccache) if(CCACHE_FOUND AND NOT CMAKE_C_COMPILER_LAUNCHER @@ -265,11 +246,70 @@ message(STATUS "NumPy include dir: ${NUMPY_INCLUDE_DIRS}") include(UseCython) -# PyArrow C++ +# Arrow C++ and set default PyArrow build options include(GNUInstallDirs) - find_package(Arrow REQUIRED) +macro(define_option name description arrow_option) + set("PYARROW_${name}" + "AUTO" + CACHE STRING ${description}) + + if("${PYARROW_${name}}" STREQUAL "AUTO") + # by default, first check if env variable exists, otherwise use Arrow C++ config + set(env_variable "PYARROW_WITH_${name}") + if(DEFINED ENV{${env_variable}}) + if($ENV{${env_variable}}) + set("PYARROW_BUILD_${name}" ON) + else() + set("PYARROW_BUILD_${name}" OFF) + endif() + else() + if(${arrow_option}) + set("PYARROW_BUILD_${name}" ON) + else() + set("PYARROW_BUILD_${name}" OFF) + endif() + endif() + else() + if("${PYARROW_${name}}") + set("PYARROW_BUILD_${name}" ON) + else() + set("PYARROW_BUILD_${name}" OFF) + endif() + endif() +endmacro() + +define_option(ACERO "Build the PyArrow Acero integration" ARROW_ACERO) +define_option(CUDA "Build the PyArrow CUDA support" ARROW_CUDA) +define_option(DATASET "Build the PyArrow Dataset integration" ARROW_DATASET) +define_option(FLIGHT "Build the PyArrow Flight integration" ARROW_FLIGHT) +define_option(GANDIVA "Build the PyArrow Gandiva integration" ARROW_GANDIVA) +define_option(ORC "Build the PyArrow ORC integration" ARROW_ORC) +define_option(PARQUET "Build the PyArrow Parquet integration" ARROW_PARQUET) 
+define_option(PARQUET_ENCRYPTION "Build the PyArrow Parquet encryption integration" + PARQUET_REQUIRE_ENCRYPTION) +define_option(SUBSTRAIT "Build the PyArrow Substrait integration" ARROW_SUBSTRAIT) +define_option(AZURE "Build the PyArrow Azure integration" ARROW_AZURE) +define_option(GCS "Build the PyArrow GCS integration" ARROW_GCS) +define_option(S3 "Build the PyArrow S3 integration" ARROW_S3) +define_option(HDFS "Build the PyArrow HDFS integration" ARROW_HDFS) +option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF) +option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF) +option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF) +set(PYARROW_CXXFLAGS + "" + CACHE STRING "Compiler flags to append when compiling PyArrow C++") + +# enforce module dependencies +if(PYARROW_BUILD_SUBSTRAIT) + set(PYARROW_BUILD_DATASET ON) +endif() +if(PYARROW_BUILD_DATASET) + set(PYARROW_BUILD_ACERO ON) +endif() + +# PyArrow C++ set(PYARROW_CPP_ROOT_DIR pyarrow/src) set(PYARROW_CPP_SOURCE_DIR ${PYARROW_CPP_ROOT_DIR}/arrow/python) set(PYARROW_CPP_SRCS @@ -305,6 +345,7 @@ set(PYARROW_CPP_LINK_LIBS "") # Check all the options from Arrow and PyArrow C++ to be in line if(PYARROW_BUILD_DATASET) + message(STATUS "Building PyArrow with Dataset") if(NOT ARROW_DATASET) message(FATAL_ERROR "You must build Arrow C++ with ARROW_DATASET=ON") endif() @@ -317,6 +358,7 @@ if(PYARROW_BUILD_DATASET) endif() if(PYARROW_BUILD_ACERO) + message(STATUS "Building PyArrow with Acero") if(NOT ARROW_ACERO) message(FATAL_ERROR "You must build Arrow C++ with ARROW_ACERO=ON") endif() @@ -329,18 +371,13 @@ if(PYARROW_BUILD_ACERO) endif() if(PYARROW_BUILD_PARQUET OR PYARROW_BUILD_PARQUET_ENCRYPTION) + message(STATUS "Building PyArrow with Parquet") if(NOT ARROW_PARQUET) message(FATAL_ERROR "You must build Arrow C++ with ARROW_PARQUET=ON") endif() find_package(Parquet REQUIRED) endif() -if(PYARROW_BUILD_HDFS) - if(NOT ARROW_HDFS) - message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON") - endif() -endif() - # Check for only Arrow C++ options if(ARROW_CSV) list(APPEND PYARROW_CPP_SRCS ${PYARROW_CPP_SOURCE_DIR}/csv.cc) @@ -400,6 +437,7 @@ endif() set(PYARROW_CPP_FLIGHT_SRCS ${PYARROW_CPP_SOURCE_DIR}/flight.cc) if(PYARROW_BUILD_FLIGHT) + message(STATUS "Building PyArrow with Flight") if(NOT ARROW_FLIGHT) message(FATAL_ERROR "You must build Arrow C++ with ARROW_FLIGHT=ON") endif() @@ -555,23 +593,39 @@ set_source_files_properties(pyarrow/lib.pyx PROPERTIES CYTHON_API TRUE) set(LINK_LIBS arrow_python) if(PYARROW_BUILD_AZURE) + message(STATUS "Building PyArrow with Azure") + if(NOT ARROW_AZURE) + message(FATAL_ERROR "You must build Arrow C++ with ARROW_AZURE=ON") + endif() list(APPEND CYTHON_EXTENSIONS _azurefs) endif() if(PYARROW_BUILD_GCS) + message(STATUS "Building PyArrow with GCS") + if(NOT ARROW_GCS) + message(FATAL_ERROR "You must build Arrow C++ with ARROW_GCS=ON") + endif() list(APPEND CYTHON_EXTENSIONS _gcsfs) endif() if(PYARROW_BUILD_S3) + message(STATUS "Building PyArrow with S3") + if(NOT ARROW_S3) + message(FATAL_ERROR "You must build Arrow C++ with ARROW_S3=ON") + endif() list(APPEND CYTHON_EXTENSIONS _s3fs) endif() if(PYARROW_BUILD_HDFS) + message(STATUS "Building PyArrow with HDFS") + if(NOT ARROW_HDFS) + message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON") + endif() list(APPEND CYTHON_EXTENSIONS _hdfs) endif() if(PYARROW_BUILD_CUDA) - # Arrow CUDA + message(STATUS "Building PyArrow with CUDA") if(NOT ARROW_CUDA) message(FATAL_ERROR "You must 
build Arrow C++ with ARROW_CUDA=ON") endif() @@ -646,8 +700,9 @@ if(PYARROW_BUILD_PARQUET) endif() endif() +# ORC if(PYARROW_BUILD_ORC) - # ORC + message(STATUS "Building PyArrow with ORC") if(NOT ARROW_ORC) message(FATAL_ERROR "You must build Arrow C++ with ARROW_ORC=ON") endif() @@ -679,6 +734,7 @@ endif() # Substrait if(PYARROW_BUILD_SUBSTRAIT) + message(STATUS "Building PyArrow with Substrait") if(NOT ARROW_SUBSTRAIT) message(FATAL_ERROR "You must build Arrow C++ with ARROW_SUBSTRAIT=ON") endif() @@ -696,6 +752,7 @@ endif() # Gandiva if(PYARROW_BUILD_GANDIVA) + message(STATUS "Building PyArrow with Gandiva") if(NOT ARROW_GANDIVA) message(FATAL_ERROR "You must build Arrow C++ with ARROW_GANDIVA=ON") endif() diff --git a/python/setup.py b/python/setup.py index 6f3dddb29d248..ed2b7961e5fbb 100755 --- a/python/setup.py +++ b/python/setup.py @@ -152,32 +152,20 @@ def initialize_options(self): if not hasattr(sys, 'gettotalrefcount'): self.build_type = 'release' - self.with_azure = strtobool( - os.environ.get('PYARROW_WITH_AZURE', '0')) - self.with_gcs = strtobool( - os.environ.get('PYARROW_WITH_GCS', '0')) - self.with_s3 = strtobool( - os.environ.get('PYARROW_WITH_S3', '0')) - self.with_hdfs = strtobool( - os.environ.get('PYARROW_WITH_HDFS', '0')) - self.with_cuda = strtobool( - os.environ.get('PYARROW_WITH_CUDA', '0')) - self.with_substrait = strtobool( - os.environ.get('PYARROW_WITH_SUBSTRAIT', '0')) - self.with_flight = strtobool( - os.environ.get('PYARROW_WITH_FLIGHT', '0')) - self.with_acero = strtobool( - os.environ.get('PYARROW_WITH_ACERO', '0')) - self.with_dataset = strtobool( - os.environ.get('PYARROW_WITH_DATASET', '0')) - self.with_parquet = strtobool( - os.environ.get('PYARROW_WITH_PARQUET', '0')) - self.with_parquet_encryption = strtobool( - os.environ.get('PYARROW_WITH_PARQUET_ENCRYPTION', '0')) - self.with_orc = strtobool( - os.environ.get('PYARROW_WITH_ORC', '0')) - self.with_gandiva = strtobool( - os.environ.get('PYARROW_WITH_GANDIVA', '0')) + self.with_azure = None + self.with_gcs = None + self.with_s3 = None + self.with_hdfs = None + self.with_cuda = None + self.with_substrait = None + self.with_flight = None + self.with_acero = None + self.with_dataset = None + self.with_parquet = None + self.with_parquet_encryption = None + self.with_orc = None + self.with_gandiva = None + self.generate_coverage = strtobool( os.environ.get('PYARROW_GENERATE_COVERAGE', '0')) self.bundle_arrow_cpp = strtobool( @@ -185,15 +173,6 @@ def initialize_options(self): self.bundle_cython_cpp = strtobool( os.environ.get('PYARROW_BUNDLE_CYTHON_CPP', '0')) - self.with_parquet_encryption = (self.with_parquet_encryption and - self.with_parquet) - - # enforce module dependencies - if self.with_substrait: - self.with_dataset = True - if self.with_dataset: - self.with_acero = True - CYTHON_MODULE_NAMES = [ 'lib', '_fs', @@ -270,23 +249,30 @@ def append_cmake_bool(value, varname): cmake_options.append('-D{0}={1}'.format( varname, 'on' if value else 'off')) + def append_cmake_component(flag, varname): + # only pass this to cmake is the user pass the --with-component + # flag to setup.py build_ext + if flag is not None: + append_cmake_bool(flag, varname) + if self.cmake_generator: cmake_options += ['-G', self.cmake_generator] - append_cmake_bool(self.with_cuda, 'PYARROW_BUILD_CUDA') - append_cmake_bool(self.with_substrait, 'PYARROW_BUILD_SUBSTRAIT') - append_cmake_bool(self.with_flight, 'PYARROW_BUILD_FLIGHT') - append_cmake_bool(self.with_gandiva, 'PYARROW_BUILD_GANDIVA') - 
append_cmake_bool(self.with_acero, 'PYARROW_BUILD_ACERO') - append_cmake_bool(self.with_dataset, 'PYARROW_BUILD_DATASET') - append_cmake_bool(self.with_orc, 'PYARROW_BUILD_ORC') - append_cmake_bool(self.with_parquet, 'PYARROW_BUILD_PARQUET') - append_cmake_bool(self.with_parquet_encryption, - 'PYARROW_BUILD_PARQUET_ENCRYPTION') - append_cmake_bool(self.with_azure, 'PYARROW_BUILD_AZURE') - append_cmake_bool(self.with_gcs, 'PYARROW_BUILD_GCS') - append_cmake_bool(self.with_s3, 'PYARROW_BUILD_S3') - append_cmake_bool(self.with_hdfs, 'PYARROW_BUILD_HDFS') + append_cmake_component(self.with_cuda, 'PYARROW_CUDA') + append_cmake_component(self.with_substrait, 'PYARROW_SUBSTRAIT') + append_cmake_component(self.with_flight, 'PYARROW_FLIGHT') + append_cmake_component(self.with_gandiva, 'PYARROW_GANDIVA') + append_cmake_component(self.with_acero, 'PYARROW_ACERO') + append_cmake_component(self.with_dataset, 'PYARROW_DATASET') + append_cmake_component(self.with_orc, 'PYARROW_ORC') + append_cmake_component(self.with_parquet, 'PYARROW_PARQUET') + append_cmake_component(self.with_parquet_encryption, + 'PYARROW_PARQUET_ENCRYPTION') + append_cmake_component(self.with_azure, 'PYARROW_AZURE') + append_cmake_component(self.with_gcs, 'PYARROW_GCS') + append_cmake_component(self.with_s3, 'PYARROW_S3') + append_cmake_component(self.with_hdfs, 'PYARROW_HDFS') + append_cmake_bool(self.bundle_arrow_cpp, 'PYARROW_BUNDLE_ARROW_CPP') append_cmake_bool(self.bundle_cython_cpp, @@ -329,54 +315,8 @@ def append_cmake_bool(value, varname): self._found_names = [] for name in self.CYTHON_MODULE_NAMES: built_path = pjoin(install_prefix, name + ext_suffix) - if not os.path.exists(built_path): - print(f'Did not find {built_path}') - if self._failure_permitted(name): - print(f'Cython module {name} failure permitted') - continue - raise RuntimeError('PyArrow C-extension failed to build:', - os.path.abspath(built_path)) - - self._found_names.append(name) - - def _failure_permitted(self, name): - if name == '_parquet' and not self.with_parquet: - return True - if name == '_parquet_encryption' and not self.with_parquet_encryption: - return True - if name == '_orc' and not self.with_orc: - return True - if name == '_flight' and not self.with_flight: - return True - if name == '_substrait' and not self.with_substrait: - return True - if name == '_azurefs' and not self.with_azure: - return True - if name == '_gcsfs' and not self.with_gcs: - return True - if name == '_s3fs' and not self.with_s3: - return True - if name == '_hdfs' and not self.with_hdfs: - return True - if name == '_dataset' and not self.with_dataset: - return True - if name == '_acero' and not self.with_acero: - return True - if name == '_exec_plan' and not self.with_acero: - return True - if name == '_dataset_orc' and not ( - self.with_orc and self.with_dataset - ): - return True - if name == '_dataset_parquet' and not ( - self.with_parquet and self.with_dataset - ): - return True - if name == '_cuda' and not self.with_cuda: - return True - if name == 'gandiva' and not self.with_gandiva: - return True - return False + if os.path.exists(built_path): + self._found_names.append(name) def _get_build_dir(self): # Get the package directory from build_py