From 178d168e29334d4d35563827cfb14cc6fa0cc076 Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Fri, 24 Feb 2023 14:19:23 -0500 Subject: [PATCH] Implement `relationship` and refine join match warning (#6753) * Implement `relationship` to refine join checks * NEWS bullet * Update to `-` rather than `_` * Simplify `relationship` options By removing `"none"` and `"warn-many-to-many"`, which are covered by `NULL` * NEWS bullet tweak --- NEWS.md | 35 +++++ R/join-rows.R | 210 ++++++++++++++++++++++++----- R/join.R | 132 ++++++++++++++---- man/mutate-joins.Rd | 122 +++++++++++++---- tests/testthat/_snaps/join-rows.md | 189 ++++++++++++++++++++++---- tests/testthat/_snaps/join.md | 17 +-- tests/testthat/test-group-by.R | 4 +- tests/testthat/test-join-rows.R | 110 +++++++++++++-- tests/testthat/test-join.R | 47 +++---- 9 files changed, 705 insertions(+), 161 deletions(-) diff --git a/NEWS.md b/NEWS.md index 149a8360f2..c6c6bff5ec 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,40 @@ # dplyr (development version) +* Mutating joins now warn about multiple matches much less often. At a high + level, a warning was previously being thrown when a one-to-many or + many-to-many relationship was detected between the keys of `x` and `y`, but is + now only thrown for a many-to-many relationship, which is much rarer and much + more dangerous than one-to-many because it can result in a Cartesian explosion + in the number of rows returned from the join (#6731, #6717). + + We've accomplished this in two steps: + + * `multiple` now defaults to `"all"`, and the options of `"error"` and + `"warning"` are now deprecated in favor of using `relationship` (see below). + We are using an accelerated deprecation process for these two options + because they've only been available for a few weeks, and `relationship` is + a clearly superior alternative. + + * The mutating joins gain a new `relationship` argument, allowing you to + optionally enforce one of the following relationship constraints between the + keys of `x` and `y`: `"one-to-one"`, `"one-to-many"`, `"many-to-one"`, or + `"many-to-many"`. + + For example, `"many-to-one"` enforces that each row in `x` can match at + most 1 row in `y`. If a row in `x` matches >1 rows in `y`, an error is + thrown. This option serves as the replacement for `multiple = "error"`. + + The default behavior of `relationship` doesn't assume that there is any + relationship between `x` and `y`. However, for equality joins it will check + for the presence of a many-to-many relationship, and will warn if it detects + one. + + This change unfortunately does mean that if you have set `multiple = "all"` to + avoid a warning and you happened to be doing a many-to-many style join, then + you will need to replace `multiple = "all"` with + `relationship = "many-to-many"` to silence the new warning, but we believe + this should be rare since many-to-many relationships are fairly uncommon. + * `pick()` now returns a 1 row, 0 column tibble when `...` evaluates to an empty selection. This makes it more compatible with [tidyverse recycling rules](https://vctrs.r-lib.org/reference/vector_recycling_rules.html) in some diff --git a/R/join-rows.R b/R/join-rows.R index d8ea6c869d..5596bdfa5a 100644 --- a/R/join-rows.R +++ b/R/join-rows.R @@ -6,8 +6,9 @@ join_rows <- function(x_key, condition = "==", filter = "none", cross = FALSE, - multiple = NULL, + multiple = "all", unmatched = "drop", + relationship = NULL, error_call = caller_env(), user_env = caller_env()) { check_dots_empty0(...) @@ -22,6 +23,16 @@ join_rows <- function(x_key, x_unmatched <- unmatched$x y_unmatched <- unmatched$y + # TODO: Remove this when `multiple = NULL / "error" / "warning"` is defunct + if (is_null(multiple)) { + warn_join_multiple_null(user_env = user_env) + multiple <- "all" + } else if (is_string(multiple, "error")) { + warn_join_multiple("error", user_env = user_env) + } else if (is_string(multiple, "warning")) { + warn_join_multiple("warning", user_env = user_env) + } + if (cross) { # TODO: Remove this section when `by = character()` is defunct @@ -33,17 +44,17 @@ join_rows <- function(x_key, condition <- "==" filter <- "none" + } - if (is_null(multiple)) { - # If user supplied `multiple`, that wins. Otherwise expect multiple. - multiple <- "all" - } + if (is_null(relationship)) { + relationship <- compute_join_relationship(type, condition, cross, user_env = user_env) + } else { + relationship <- check_join_relationship(relationship, error_call = error_call) } incomplete <- standardise_join_incomplete(type, na_matches, x_unmatched) no_match <- standardise_join_no_match(type, x_unmatched) remaining <- standardise_join_remaining(type, y_unmatched) - multiple <- standardise_multiple(multiple, condition, filter, user_env) matches <- dplyr_locate_matches( needles = x_key, @@ -54,6 +65,7 @@ join_rows <- function(x_key, no_match = no_match, remaining = remaining, multiple = multiple, + relationship = relationship, needles_arg = "x", haystack_arg = "y", error_call = error_call @@ -71,6 +83,7 @@ dplyr_locate_matches <- function(needles, no_match = NA_integer_, remaining = "drop", multiple = "all", + relationship = "none", needles_arg = "", haystack_arg = "", error_call = caller_env()) { @@ -86,6 +99,7 @@ dplyr_locate_matches <- function(needles, no_match = no_match, remaining = remaining, multiple = multiple, + relationship = relationship, needles_arg = needles_arg, haystack_arg = haystack_arg, nan_distinct = TRUE @@ -102,6 +116,18 @@ dplyr_locate_matches <- function(needles, vctrs_error_matches_remaining = function(cnd) { rethrow_error_join_matches_remaining(cnd, error_call) }, + vctrs_error_matches_relationship_one_to_one = function(cnd) { + rethrow_error_join_relationship_one_to_one(cnd, error_call) + }, + vctrs_error_matches_relationship_one_to_many = function(cnd) { + rethrow_error_join_relationship_one_to_many(cnd, error_call) + }, + vctrs_error_matches_relationship_many_to_one = function(cnd) { + rethrow_error_join_relationship_many_to_one(cnd, error_call) + }, + vctrs_warning_matches_relationship_many_to_many = function(cnd) { + rethrow_warning_join_relationship_many_to_many(cnd, error_call) + }, vctrs_error_matches_multiple = function(cnd) { rethrow_error_join_matches_multiple(cnd, error_call) }, @@ -144,14 +170,74 @@ rethrow_error_join_matches_remaining <- function(cnd, call) { ) } -rethrow_error_join_matches_multiple <- function(cnd, call) { +rethrow_error_join_relationship_one_to_one <- function(cnd, call) { i <- cnd$i + which <- cnd$which - stop_join( + if (which == "needles") { + x_name <- "x" + y_name <- "y" + } else { + x_name <- "y" + y_name <- "x" + } + + stop_join_matches_multiple( + i = i, + x_name = x_name, + y_name = y_name, + class = "dplyr_error_join_relationship_one_to_one", + call = call + ) +} + +rethrow_error_join_relationship_one_to_many <- function(cnd, call) { + stop_join_matches_multiple( + i = cnd$i, + x_name = "y", + y_name = "x", + class = "dplyr_error_join_relationship_one_to_many", + call = call + ) +} + +rethrow_error_join_relationship_many_to_one <- function(cnd, call) { + stop_join_matches_multiple( + i = cnd$i, + x_name = "x", + y_name = "y", + class = "dplyr_error_join_relationship_many_to_one", + call = call + ) +} + +rethrow_warning_join_relationship_many_to_many <- function(cnd, call) { + i <- cnd$i + j <- cnd$j + + warn_join( message = c( - glue("Each row in `x` must match at most 1 row in `y`."), - i = glue("Row {i} of `x` matches multiple rows.") + "Detected an unexpected many-to-many relationship between `x` and `y`.", + i = glue("Row {i} of `x` matches multiple rows in `y`."), + i = glue("Row {j} of `y` matches multiple rows in `x`."), + i = paste0( + "If a many-to-many relationship is expected, ", + "set `relationship = \"many-to-many\"` to silence this warning." + ) ), + class = "dplyr_warning_join_relationship_many_to_many", + call = call + ) + + # Cancel `cnd` + maybe_restart("muffleWarning") +} + +rethrow_error_join_matches_multiple <- function(cnd, call) { + stop_join_matches_multiple( + i = cnd$i, + x_name = "x", + y_name = "y", class = "dplyr_error_join_matches_multiple", call = call ) @@ -163,11 +249,7 @@ rethrow_warning_join_matches_multiple <- function(cnd, call) { warn_join( message = c( glue("Each row in `x` is expected to match at most 1 row in `y`."), - i = glue("Row {i} of `x` matches multiple rows."), - i = paste0( - "If multiple matches are expected, set `multiple = \"all\"` ", - "to silence this warning." - ) + i = glue("Row {i} of `x` matches multiple rows.") ), class = "dplyr_warning_join_matches_multiple", call = call @@ -177,6 +259,17 @@ rethrow_warning_join_matches_multiple <- function(cnd, call) { maybe_restart("muffleWarning") } +stop_join_matches_multiple <- function(i, x_name, y_name, class, call) { + stop_join( + message = c( + glue("Each row in `{x_name}` must match at most 1 row in `{y_name}`."), + i = glue("Row {i} of `{x_name}` matches multiple rows in `{y_name}`.") + ), + class = class, + call = call + ) +} + stop_join <- function(message = NULL, class = NULL, ..., call = caller_env()) { stop_dplyr(message = message, class = c(class, "dplyr_error_join"), ..., call = call) } @@ -273,30 +366,83 @@ standardise_join_remaining <- function(type, y_unmatched) { } } -standardise_multiple <- function(multiple, condition, filter, user_env) { - if (!is_null(multiple)) { - # User supplied value always wins - return(multiple) +compute_join_relationship <- function(type, condition, cross, user_env = caller_env(2)) { + if (type == "nest") { + # Not unreasonable to see a many-to-many relationship here, but it can't + # result in a Cartesian explosion in the result so we don't check for it + return("none") } - # "warning" for equi and rolling joins where multiple matches are surprising. - # "all" for non-equi joins where multiple matches are expected. - non_equi <- (condition != "==") & (filter == "none") - if (any(non_equi)) { - return("all") + if (type %in% c("semi", "anti")) { + # Impossible to generate a many-to-many relationship here because we set + # `multiple = "any"` + return("none") } - if (is_direct(user_env)) { - # Direct calls result in a warning when there are multiple matches, - # because they are likely surprising and the caller will be able to set - # the `multiple` argument. Indirect calls don't warn, because the caller - # is unlikely to have access to `multiple` to silence it. - "warning" - } else { - "all" + if (cross) { + # TODO: Remove when `by = character()` is defunct + # Cross-joins always result in many-to-many relationships + return("none") } + + any_inequality <- any(condition != "==") + + if (any_inequality) { + # We only check for a many-to-many relationship when doing an equality join, + # because that is where it is typically unexpected. + # - Inequality and overlap joins often generate many-to-many relationships + # by nature + # - Rolling joins are a little trickier, but we've decided that not warning + # is probably easier to explain. `relationship = "many-to-one"` can always + # be used explicitly as needed. + return("none") + } + + if (!is_direct(user_env)) { + # Indirect calls don't warn, because the caller is unlikely to have access + # to `relationship` to silence it + return("none") + } + + "warn-many-to-many" } +check_join_relationship <- function(relationship, error_call = caller_env()) { + arg_match0( + arg = relationship, + values = c("one-to-one", "one-to-many", "many-to-one", "many-to-many"), + error_call = error_call + ) +} + +# ------------------------------------------------------------------------------ + +warn_join_multiple <- function(what, user_env = caller_env(2)) { + what <- glue::glue('Specifying `multiple = "{what}"`') + + lifecycle::deprecate_warn( + when = "1.1.1", + what = I(what), + with = I('`relationship = "many-to-one"`'), + user_env = user_env, + always = TRUE + ) +} + +warn_join_multiple_null <- function(user_env = caller_env(2)) { + # Only really needed in case people wrapped `left_join()` and friends and + # passed the old default of `NULL` through + lifecycle::deprecate_warn( + when = "1.1.1", + what = I("Specifying `multiple = NULL`"), + with = I('`multiple = "all"`'), + user_env = user_env, + always = TRUE + ) +} + +# ------------------------------------------------------------------------------ + # TODO: Use upstream function when exported from rlang # `lifecycle:::is_direct()` is_direct <- function(env) { diff --git a/R/join.R b/R/join.R index 2eae09a0c0..315484c4a8 100644 --- a/R/join.R +++ b/R/join.R @@ -26,14 +26,41 @@ #' #' * A `full_join()` keeps all observations in `x` and `y`. #' -#' ## Multiple matches +#' @section Many-to-many relationships: #' -#' By default, if an observation in `x` matches multiple observations in `y`, -#' all of the matching observations in `y` will be returned. If this occurs in -#' an equality join or a rolling join, a warning will be thrown stating that -#' multiple matches have been detected since this is usually surprising. If -#' multiple matches are expected in these cases, silence this warning by -#' explicitly setting `multiple = "all"`. +#' By default, dplyr guards against many-to-many relationships in equality joins +#' by throwing a warning. These occur when both of the following are true: +#' +#' - A row in `x` matches multiple rows in `y`. +#' - A row in `y` matches multiple rows in `x`. +#' +#' This is typically surprising, as most joins involve a relationship of +#' one-to-one, one-to-many, or many-to-one, and is often the result of an +#' improperly specified join. Many-to-many relationships are particularly +#' problematic because they can result in a Cartesian explosion of the number of +#' rows returned from the join. +#' +#' If a many-to-many relationship is expected, silence this warning by +#' explicitly setting `relationship = "many-to-many"`. +#' +#' In production code, it is best to preemptively set `relationship` to whatever +#' relationship you expect to exist between the keys of `x` and `y`, as this +#' forces an error to occur immediately if the data doesn't align with your +#' expectations. +#' +#' Inequality joins typically result in many-to-many relationships by nature, so +#' they don't warn on them by default, but you should still take extra care when +#' specifying an inequality join, because they also have the capability to +#' return a large number of rows. +#' +#' Rolling joins don't warn on many-to-many relationships either, but many +#' rolling joins follow a many-to-one relationship, so it is often useful to +#' set `relationship = "many-to-one"` to enforce this. +#' +#' Note that in SQL, most database providers won't let you specify a +#' many-to-many relationship between two tables, instead requiring that you +#' create a third _junction table_ that results in two one-to-many relationships +#' instead. #' #' @return #' An object of the same type as `x` (including the same groups). The order of @@ -119,21 +146,13 @@ #' for database sources and to `base::merge(incomparables = NA)`. #' @param multiple Handling of rows in `x` with multiple matches in `y`. #' For each row of `x`: -#' - `"all"` returns every match detected in `y`. This is the same behavior -#' as SQL. +#' - `"all"`, the default, returns every match detected in `y`. This is the +#' same behavior as SQL. #' - `"any"` returns one match detected in `y`, with no guarantees on which #' match will be returned. It is often faster than `"first"` and `"last"` #' if you just need to detect if there is at least one match. #' - `"first"` returns the first match detected in `y`. #' - `"last"` returns the last match detected in `y`. -#' - `"warning"` throws a warning if multiple matches are detected, and -#' then falls back to `"all"`. -#' - `"error"` throws an error if multiple matches are detected. -#' -#' The default value of `NULL` is equivalent to `"warning"` for equality joins -#' and rolling joins, where multiple matches are usually surprising. If any -#' inequality join conditions are present, then it is equivalent to `"all"`, -#' since multiple matches are usually expected. #' @param unmatched How should unmatched keys that would result in dropped rows #' be handled? #' - `"drop"` drops unmatched keys from the result. @@ -147,6 +166,34 @@ #' - For inner joins, it checks both `x` and `y`. In this case, `unmatched` is #' also allowed to be a character vector of length 2 to specify the behavior #' for `x` and `y` independently. +#' @param relationship Handling of the expected relationship between the keys of +#' `x` and `y`. If the expectations chosen from the list below are +#' invalidated, an error is thrown. +#' +#' - `NULL`, the default, doesn't expect there to be any relationship between +#' `x` and `y`. However, for equality joins it will check for a many-to-many +#' relationship (which is typically unexpected) and will warn if one occurs, +#' encouraging you to either take a closer look at your inputs or make this +#' relationship explicit by specifying `"many-to-many"`. +#' +#' See the _Many-to-many relationships_ section for more details. +#' +#' - `"one-to-one"` expects: +#' - Each row in `x` matches at most 1 row in `y`. +#' - Each row in `y` matches at most 1 row in `x`. +#' +#' - `"one-to-many"` expects: +#' - Each row in `y` matches at most 1 row in `x`. +#' +#' - `"many-to-one"` expects: +#' - Each row in `x` matches at most 1 row in `y`. +#' +#' - `"many-to-many"` doesn't perform any relationship checks, but is provided +#' to allow you to be explicit about this relationship if you know it +#' exists. +#' +#' `relationship` doesn't handle cases where there are zero matches. For that, +#' see `unmatched`. #' @family joins #' @examples #' band_members %>% inner_join(band_instruments) @@ -166,14 +213,21 @@ #' full_join(band_instruments2, by = join_by(name == artist), keep = TRUE) #' #' # If a row in `x` matches multiple rows in `y`, all the rows in `y` will be -#' # returned once for each matching row in `x`, with a warning. +#' # returned once for each matching row in `x`. #' df1 <- tibble(x = 1:3) #' df2 <- tibble(x = c(1, 1, 2), y = c("first", "second", "third")) #' df1 %>% left_join(df2) #' -#' # If multiple matches are expected, set `multiple` to `"all"` to silence -#' # the warning -#' df1 %>% left_join(df2, multiple = "all") +#' # If a row in `y` also matches multiple rows in `x`, this is known as a +#' # many-to-many relationship, which is typically a result of an improperly +#' # specified join or some kind of messy data. In this case, a warning is +#' # thrown by default: +#' df3 <- tibble(x = c(1, 1, 1, 3)) +#' df3 %>% left_join(df2) +#' +#' # In the rare case where a many-to-many relationship is expected, set +#' # `relationship = "many-to-many"` to silence this warning +#' df3 %>% left_join(df2, relationship = "many-to-many") #' #' # Use `join_by()` with a condition other than `==` to perform an inequality #' # join. Here we match on every instance where `df1$x > df2$x`. @@ -214,8 +268,9 @@ inner_join.data.frame <- function(x, ..., keep = NULL, na_matches = c("na", "never"), - multiple = NULL, - unmatched = "drop") { + multiple = "all", + unmatched = "drop", + relationship = NULL) { check_dots_empty0(...) y <- auto_copy(x, y, copy = copy) join_mutate( @@ -228,6 +283,7 @@ inner_join.data.frame <- function(x, keep = keep, multiple = multiple, unmatched = unmatched, + relationship = relationship, user_env = caller_env() ) } @@ -254,8 +310,9 @@ left_join.data.frame <- function(x, ..., keep = NULL, na_matches = c("na", "never"), - multiple = NULL, - unmatched = "drop") { + multiple = "all", + unmatched = "drop", + relationship = NULL) { check_dots_empty0(...) y <- auto_copy(x, y, copy = copy) join_mutate( @@ -268,6 +325,7 @@ left_join.data.frame <- function(x, keep = keep, multiple = multiple, unmatched = unmatched, + relationship = relationship, user_env = caller_env() ) } @@ -294,8 +352,9 @@ right_join.data.frame <- function(x, ..., keep = NULL, na_matches = c("na", "never"), - multiple = NULL, - unmatched = "drop") { + multiple = "all", + unmatched = "drop", + relationship = NULL) { check_dots_empty0(...) y <- auto_copy(x, y, copy = copy) join_mutate( @@ -308,6 +367,7 @@ right_join.data.frame <- function(x, keep = keep, multiple = multiple, unmatched = unmatched, + relationship = relationship, user_env = caller_env() ) } @@ -334,7 +394,8 @@ full_join.data.frame <- function(x, ..., keep = NULL, na_matches = c("na", "never"), - multiple = NULL) { + multiple = "all", + relationship = NULL) { check_dots_empty0(...) y <- auto_copy(x, y, copy = copy) join_mutate( @@ -348,6 +409,7 @@ full_join.data.frame <- function(x, multiple = multiple, # All keys from both inputs are retained. Erroring never makes sense. unmatched = "drop", + relationship = relationship, user_env = caller_env() ) } @@ -538,6 +600,10 @@ nest_join.data.frame <- function(x, # row of `x` (#6392). multiple <- "all" + # Will be set to `"none"` in `join_rows()`. Because we can't have a Cartesian + # explosion, we don't care about many-to-many relationships. + relationship <- NULL + rows <- join_rows( x_key = x_key, y_key = y_key, @@ -548,6 +614,7 @@ nest_join.data.frame <- function(x, cross = cross, multiple = multiple, unmatched = unmatched, + relationship = relationship, user_env = caller_env() ) @@ -577,8 +644,9 @@ join_mutate <- function(x, suffix = c(".x", ".y"), na_matches = "na", keep = NULL, - multiple = NULL, + multiple = "all", unmatched = "drop", + relationship = NULL, error_call = caller_env(), user_env = caller_env()) { check_dots_empty0(...) @@ -635,6 +703,7 @@ join_mutate <- function(x, cross = cross, multiple = multiple, unmatched = unmatched, + relationship = relationship, error_call = error_call, user_env = user_env ) @@ -725,6 +794,10 @@ join_filter <- function(x, # We only care about whether or not any matches exist multiple <- "any" + # Will be set to `"none"` in `join_rows()`. Because `multiple = "any"`, that + # means many-to-many relationships aren't possible. + relationship <- NULL + # Since we are actually testing the presence of matches, it doesn't make # sense to ever error on unmatched values. unmatched <- "drop" @@ -739,6 +812,7 @@ join_filter <- function(x, cross = cross, multiple = multiple, unmatched = unmatched, + relationship = relationship, error_call = error_call, user_env = user_env ) diff --git a/man/mutate-joins.Rd b/man/mutate-joins.Rd index 6f2846097f..74717bdf10 100644 --- a/man/mutate-joins.Rd +++ b/man/mutate-joins.Rd @@ -33,8 +33,9 @@ inner_join( ..., keep = NULL, na_matches = c("na", "never"), - multiple = NULL, - unmatched = "drop" + multiple = "all", + unmatched = "drop", + relationship = NULL ) left_join( @@ -56,8 +57,9 @@ left_join( ..., keep = NULL, na_matches = c("na", "never"), - multiple = NULL, - unmatched = "drop" + multiple = "all", + unmatched = "drop", + relationship = NULL ) right_join( @@ -79,8 +81,9 @@ right_join( ..., keep = NULL, na_matches = c("na", "never"), - multiple = NULL, - unmatched = "drop" + multiple = "all", + unmatched = "drop", + relationship = NULL ) full_join( @@ -102,7 +105,8 @@ full_join( ..., keep = NULL, na_matches = c("na", "never"), - multiple = NULL + multiple = "all", + relationship = NULL ) } \arguments{ @@ -174,22 +178,14 @@ for database sources and to \code{base::merge(incomparables = NA)}. \item{multiple}{Handling of rows in \code{x} with multiple matches in \code{y}. For each row of \code{x}: \itemize{ -\item \code{"all"} returns every match detected in \code{y}. This is the same behavior -as SQL. +\item \code{"all"}, the default, returns every match detected in \code{y}. This is the +same behavior as SQL. \item \code{"any"} returns one match detected in \code{y}, with no guarantees on which match will be returned. It is often faster than \code{"first"} and \code{"last"} if you just need to detect if there is at least one match. \item \code{"first"} returns the first match detected in \code{y}. \item \code{"last"} returns the last match detected in \code{y}. -\item \code{"warning"} throws a warning if multiple matches are detected, and -then falls back to \code{"all"}. -\item \code{"error"} throws an error if multiple matches are detected. -} - -The default value of \code{NULL} is equivalent to \code{"warning"} for equality joins -and rolling joins, where multiple matches are usually surprising. If any -inequality join conditions are present, then it is equivalent to \code{"all"}, -since multiple matches are usually expected.} +}} \item{unmatched}{How should unmatched keys that would result in dropped rows be handled? @@ -208,6 +204,38 @@ potentially drop rows. also allowed to be a character vector of length 2 to specify the behavior for \code{x} and \code{y} independently. }} + +\item{relationship}{Handling of the expected relationship between the keys of +\code{x} and \code{y}. If the expectations chosen from the list below are +invalidated, an error is thrown. +\itemize{ +\item \code{NULL}, the default, doesn't expect there to be any relationship between +\code{x} and \code{y}. However, for equality joins it will check for a many-to-many +relationship (which is typically unexpected) and will warn if one occurs, +encouraging you to either take a closer look at your inputs or make this +relationship explicit by specifying \code{"many-to-many"}. + +See the \emph{Many-to-many relationships} section for more details. +\item \code{"one-to-one"} expects: +\itemize{ +\item Each row in \code{x} matches at most 1 row in \code{y}. +\item Each row in \code{y} matches at most 1 row in \code{x}. +} +\item \code{"one-to-many"} expects: +\itemize{ +\item Each row in \code{y} matches at most 1 row in \code{x}. +} +\item \code{"many-to-one"} expects: +\itemize{ +\item Each row in \code{x} matches at most 1 row in \code{y}. +} +\item \code{"many-to-many"} doesn't perform any relationship checks, but is provided +to allow you to be explicit about this relationship if you know it +exists. +} + +\code{relationship} doesn't handle cases where there are zero matches. For that, +see \code{unmatched}.} } \value{ An object of the same type as \code{x} (including the same groups). The order of @@ -255,17 +283,46 @@ data frames: \item A \code{full_join()} keeps all observations in \code{x} and \code{y}. } } +} +\section{Many-to-many relationships}{ -\subsection{Multiple matches}{ -By default, if an observation in \code{x} matches multiple observations in \code{y}, -all of the matching observations in \code{y} will be returned. If this occurs in -an equality join or a rolling join, a warning will be thrown stating that -multiple matches have been detected since this is usually surprising. If -multiple matches are expected in these cases, silence this warning by -explicitly setting \code{multiple = "all"}. +By default, dplyr guards against many-to-many relationships in equality joins +by throwing a warning. These occur when both of the following are true: +\itemize{ +\item A row in \code{x} matches multiple rows in \code{y}. +\item A row in \code{y} matches multiple rows in \code{x}. } + +This is typically surprising, as most joins involve a relationship of +one-to-one, one-to-many, or many-to-one, and is often the result of an +improperly specified join. Many-to-many relationships are particularly +problematic because they can result in a Cartesian explosion of the number of +rows returned from the join. + +If a many-to-many relationship is expected, silence this warning by +explicitly setting \code{relationship = "many-to-many"}. + +In production code, it is best to preemptively set \code{relationship} to whatever +relationship you expect to exist between the keys of \code{x} and \code{y}, as this +forces an error to occur immediately if the data doesn't align with your +expectations. + +Inequality joins typically result in many-to-many relationships by nature, so +they don't warn on them by default, but you should still take extra care when +specifying an inequality join, because they also have the capability to +return a large number of rows. + +Rolling joins don't warn on many-to-many relationships either, but many +rolling joins follow a many-to-one relationship, so it is often useful to +set \code{relationship = "many-to-one"} to enforce this. + +Note that in SQL, most database providers won't let you specify a +many-to-many relationship between two tables, instead requiring that you +create a third \emph{junction table} that results in two one-to-many relationships +instead. } + \section{Methods}{ These functions are \strong{generic}s, which means that packages can provide @@ -299,14 +356,21 @@ band_members \%>\% full_join(band_instruments2, by = join_by(name == artist), keep = TRUE) # If a row in `x` matches multiple rows in `y`, all the rows in `y` will be -# returned once for each matching row in `x`, with a warning. +# returned once for each matching row in `x`. df1 <- tibble(x = 1:3) df2 <- tibble(x = c(1, 1, 2), y = c("first", "second", "third")) df1 \%>\% left_join(df2) -# If multiple matches are expected, set `multiple` to `"all"` to silence -# the warning -df1 \%>\% left_join(df2, multiple = "all") +# If a row in `y` also matches multiple rows in `x`, this is known as a +# many-to-many relationship, which is typically a result of an improperly +# specified join or some kind of messy data. In this case, a warning is +# thrown by default: +df3 <- tibble(x = c(1, 1, 1, 3)) +df3 \%>\% left_join(df2) + +# In the rare case where a many-to-many relationship is expected, set +# `relationship = "many-to-many"` to silence this warning +df3 \%>\% left_join(df2, relationship = "many-to-many") # Use `join_by()` with a condition other than `==` to perform an inequality # join. Here we match on every instance where `df1$x > df2$x`. diff --git a/tests/testthat/_snaps/join-rows.md b/tests/testthat/_snaps/join-rows.md index 09c5447ced..7920500f97 100644 --- a/tests/testthat/_snaps/join-rows.md +++ b/tests/testthat/_snaps/join-rows.md @@ -1,22 +1,13 @@ -# `multiple` default behavior is correct +# `relationship` default behavior is correct Code out <- join_rows(c(1, 1), c(1, 1), condition = "==") Condition Warning: - Each row in `x` is expected to match at most 1 row in `y`. - i Row 1 of `x` matches multiple rows. - i If multiple matches are expected, set `multiple = "all"` to silence this warning. - ---- - - Code - out <- join_rows(c(1, 1), c(1, 1), condition = ">=", filter = "max") - Condition - Warning: - Each row in `x` is expected to match at most 1 row in `y`. - i Row 1 of `x` matches multiple rows. - i If multiple matches are expected, set `multiple = "all"` to silence this warning. + Detected an unexpected many-to-many relationship between `x` and `y`. + i Row 1 of `x` matches multiple rows in `y`. + i Row 1 of `y` matches multiple rows in `x`. + i If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning. # join_rows() allows `unmatched` to be specified independently for inner joins @@ -38,33 +29,76 @@ i This is an internal error that was detected in the dplyr package. Please report it at with a reprex () and the full backtrace. -# join_rows() gives meaningful error/warning message on multiple matches +# join_rows() gives meaningful one-to-one errors Code - join_rows(1, c(1, 1), multiple = "error") + join_rows(1, c(1, 1), relationship = "one-to-one") Condition Error: ! Each row in `x` must match at most 1 row in `y`. - i Row 1 of `x` matches multiple rows. + i Row 1 of `x` matches multiple rows in `y`. --- Code - cat(conditionMessage(cnd)) + join_rows(c(1, 1), 1, relationship = "one-to-one") + Condition + Error: + ! Each row in `y` must match at most 1 row in `x`. + i Row 1 of `y` matches multiple rows in `x`. + +# join_rows() gives meaningful one-to-many errors + + Code + join_rows(c(1, 1), 1, relationship = "one-to-many") + Condition + Error: + ! Each row in `y` must match at most 1 row in `x`. + i Row 1 of `y` matches multiple rows in `x`. + +# join_rows() gives meaningful many-to-one errors + + Code + join_rows(1, c(1, 1), relationship = "many-to-one") + Condition + Error: + ! Each row in `x` must match at most 1 row in `y`. + i Row 1 of `x` matches multiple rows in `y`. + +# join_rows() gives meaningful many-to-many warnings + + Code + join_rows(c(1, 1), c(1, 1)) + Condition + Warning: + Detected an unexpected many-to-many relationship between `x` and `y`. + i Row 1 of `x` matches multiple rows in `y`. + i Row 1 of `y` matches multiple rows in `x`. + i If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning. Output - Each row in `x` is expected to match at most 1 row in `y`. - i Row 1 of `x` matches multiple rows. - i If multiple matches are expected, set `multiple = "all"` to silence this warning. + $x + [1] 1 1 2 2 + + $y + [1] 1 2 1 2 + --- Code - . <- left_join(df1, df2, by = "x") + left_join(df, df, by = join_by(x)) Condition Warning in `left_join()`: - Each row in `x` is expected to match at most 1 row in `y`. - i Row 1 of `x` matches multiple rows. - i If multiple matches are expected, set `multiple = "all"` to silence this warning. + Detected an unexpected many-to-many relationship between `x` and `y`. + i Row 1 of `x` matches multiple rows in `y`. + i Row 1 of `y` matches multiple rows in `x`. + i If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning. + Output + x + 1 1 + 2 1 + 3 1 + 4 1 # join_rows() gives meaningful error message on unmatched rows @@ -285,3 +319,108 @@ ! `unmatched` must be one of "drop" or "error", not "dr". i Did you mean "drop"? +# join_rows() validates `relationship` + + Code + join_rows(df, df, relationship = 1) + Condition + Error: + ! `relationship` must be a string or character vector. + +--- + + Code + join_rows(df, df, relationship = "none") + Condition + Error: + ! `relationship` must be one of "one-to-one", "one-to-many", "many-to-one", or "many-to-many", not "none". + +--- + + Code + join_rows(df, df, relationship = "warn-many-to-many") + Condition + Error: + ! `relationship` must be one of "one-to-one", "one-to-many", "many-to-one", or "many-to-many", not "warn-many-to-many". + i Did you mean "many-to-many"? + +# `multiple = NULL` is deprecated and results in `'all'` (#6731) + + Code + out <- join_rows(df1, df2, multiple = NULL) + Condition + Warning: + Specifying `multiple = NULL` was deprecated in dplyr 1.1.1. + i Please use `multiple = "all"` instead. + +--- + + Code + left_join(df1, df2, by = join_by(x), multiple = NULL) + Condition + Warning: + Specifying `multiple = NULL` was deprecated in dplyr 1.1.1. + i Please use `multiple = "all"` instead. + Output + # A tibble: 3 x 1 + x + + 1 1 + 2 2 + 3 2 + +# `multiple = 'error'` is deprecated (#6731) + + Code + join_rows(df1, df2, multiple = "error") + Condition + Warning: + Specifying `multiple = "error"` was deprecated in dplyr 1.1.1. + i Please use `relationship = "many-to-one"` instead. + Error: + ! Each row in `x` must match at most 1 row in `y`. + i Row 2 of `x` matches multiple rows in `y`. + +--- + + Code + left_join(df1, df2, by = join_by(x), multiple = "error") + Condition + Warning: + Specifying `multiple = "error"` was deprecated in dplyr 1.1.1. + i Please use `relationship = "many-to-one"` instead. + Error in `left_join()`: + ! Each row in `x` must match at most 1 row in `y`. + i Row 2 of `x` matches multiple rows in `y`. + +# `multiple = 'warning'` is deprecated (#6731) + + Code + out <- join_rows(df1, df2, multiple = "warning") + Condition + Warning: + Specifying `multiple = "warning"` was deprecated in dplyr 1.1.1. + i Please use `relationship = "many-to-one"` instead. + Warning: + Each row in `x` is expected to match at most 1 row in `y`. + i Row 2 of `x` matches multiple rows. + +--- + + Code + left_join(df1, df2, by = join_by(x), multiple = "warning") + Condition + Warning: + Specifying `multiple = "warning"` was deprecated in dplyr 1.1.1. + i Please use `relationship = "many-to-one"` instead. + Warning in `left_join()`: + Each row in `x` is expected to match at most 1 row in `y`. + i Row 2 of `x` matches multiple rows. + Output + # A tibble: 3 x 1 + x + + 1 1 + 2 2 + 3 2 + diff --git a/tests/testthat/_snaps/join.md b/tests/testthat/_snaps/join.md index 6faadc8573..6b852f7302 100644 --- a/tests/testthat/_snaps/join.md +++ b/tests/testthat/_snaps/join.md @@ -50,15 +50,16 @@ Error: ! `na_matches` must be one of "na" or "never", not "foo". -# mutating joins trigger multiple match warning +# mutating joins trigger many-to-many warning Code - out <- left_join(df1, df2, join_by(x)) + out <- left_join(df, df, join_by(x)) Condition Warning in `left_join()`: - Each row in `x` is expected to match at most 1 row in `y`. - i Row 1 of `x` matches multiple rows. - i If multiple matches are expected, set `multiple = "all"` to silence this warning. + Detected an unexpected many-to-many relationship between `x` and `y`. + i Row 1 of `x` matches multiple rows in `y`. + i Row 1 of `y` matches multiple rows in `x`. + i If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning. # mutating joins compute common columns @@ -198,14 +199,14 @@ ! Each row of `y` must be matched by `x`. i Row 1 of `y` was not matched. -# `by = character()` technically respects `multiple` +# `by = character()` technically respects `relationship` Code - left_join(df, df, by = character(), multiple = "error") + left_join(df, df, by = character(), relationship = "many-to-one") Condition Error in `left_join()`: ! Each row in `x` must match at most 1 row in `y`. - i Row 1 of `x` matches multiple rows. + i Row 1 of `x` matches multiple rows in `y`. # `by = character()` for a cross join is deprecated (#6604) diff --git a/tests/testthat/test-group-by.R b/tests/testthat/test-group-by.R index 84e8c1f505..df640c5ee9 100644 --- a/tests/testthat/test-group-by.R +++ b/tests/testthat/test-group-by.R @@ -55,8 +55,8 @@ test_that("joins preserve grouping", { df <- data.frame(x = rep(1:2, each = 4), y = rep(1:4, each = 2)) g <- group_by(df, x) - expect_equal(group_vars(inner_join(g, g, by = c("x", "y"), multiple = "all")), "x") - expect_equal(group_vars(left_join(g, g, by = c("x", "y"), multiple = "all")), "x") + expect_equal(group_vars(inner_join(g, g, by = c("x", "y"), relationship = "many-to-many")), "x") + expect_equal(group_vars(left_join(g, g, by = c("x", "y"), relationship = "many-to-many")), "x") expect_equal(group_vars(semi_join(g, g, by = c("x", "y"))), "x") expect_equal(group_vars(anti_join(g, g, by = c("x", "y"))), "x") }) diff --git a/tests/testthat/test-join-rows.R b/tests/testthat/test-join-rows.R index 385f1c197a..2c4a651067 100644 --- a/tests/testthat/test-join-rows.R +++ b/tests/testthat/test-join-rows.R @@ -1,18 +1,24 @@ -test_that("`multiple` default behavior is correct", { - # "warning" for equi and rolling joins +test_that("`relationship` default behavior is correct", { + # "warn-many-to-many" for equality joins expect_snapshot(out <- join_rows(c(1, 1), c(1, 1), condition = "==")) expect_equal(out$x, c(1L, 1L, 2L, 2L)) expect_equal(out$y, c(1L, 2L, 1L, 2L)) - expect_snapshot(out <- join_rows(c(1, 1), c(1, 1), condition = ">=", filter = "max")) + # "none" for rolling joins + expect_warning(out <- join_rows(c(1, 2), c(1, 1), condition = ">=", filter = "max"), NA) expect_equal(out$x, c(1L, 1L, 2L, 2L)) expect_equal(out$y, c(1L, 2L, 1L, 2L)) + # If rolling joins warned on many-to-many relationships, it would be a little + # hard to explain that the above example warns, but this wouldn't just because + # we've removed `2` as a key from `x`: + # `join_rows(1, c(1, 1), condition = ">=", filter = "max")` - # "all" for non-equi and cross joins + # "none" for inequality joins (and overlap joins) expect_warning(out <- join_rows(c(1, 2), c(0, 1), condition = ">="), NA) expect_equal(out$x, c(1L, 1L, 2L, 2L)) expect_equal(out$y, c(1L, 2L, 1L, 2L)) + # "none" for deprecated cross joins expect_warning(out <- join_rows(c(1, 1), c(1, 1), cross = TRUE), NA) expect_equal(out$x, c(1L, 1L, 2L, 2L)) expect_equal(out$y, c(1L, 2L, 1L, 2L)) @@ -162,15 +168,37 @@ test_that("join_rows() expects incompatible type errors to have been handled by }) }) -test_that("join_rows() gives meaningful error/warning message on multiple matches", { - expect_snapshot(error = TRUE, join_rows(1, c(1, 1), multiple = "error")) +test_that("join_rows() gives meaningful one-to-one errors", { + expect_snapshot(error = TRUE, { + join_rows(1, c(1, 1), relationship = "one-to-one") + }) + expect_snapshot(error = TRUE, { + join_rows(c(1, 1), 1, relationship = "one-to-one") + }) +}) + +test_that("join_rows() gives meaningful one-to-many errors", { + expect_snapshot(error = TRUE, { + join_rows(c(1, 1), 1, relationship = "one-to-many") + }) +}) + +test_that("join_rows() gives meaningful many-to-one errors", { + expect_snapshot(error = TRUE, { + join_rows(1, c(1, 1), relationship = "many-to-one") + }) +}) - cnd <- catch_cnd(join_rows(1, c(1, 1), multiple = "warning"), classes = "warning") - expect_snapshot(cat(conditionMessage(cnd))) +test_that("join_rows() gives meaningful many-to-many warnings", { + expect_snapshot({ + join_rows(c(1, 1), c(1, 1)) + }) - df1 <- data.frame(x = 1) - df2 <- data.frame(x = c(1, 1)) - expect_snapshot(. <- left_join(df1, df2, by = "x")) + # With proof that the defaults flow through user facing functions + df <- data.frame(x = c(1, 1)) + expect_snapshot({ + left_join(df, df, by = join_by(x)) + }) }) test_that("join_rows() gives meaningful error message on unmatched rows", { @@ -365,3 +393,63 @@ test_that("join_rows() validates `unmatched`", { join_rows(df, df, type = "inner", unmatched = c("drop", "dr")) }) }) + +test_that("join_rows() validates `relationship`", { + df <- tibble(x = 1) + + expect_snapshot(error = TRUE, { + join_rows(df, df, relationship = 1) + }) + + # Notably can't use the vctrs options + expect_snapshot(error = TRUE, { + join_rows(df, df, relationship = "none") + }) + expect_snapshot(error = TRUE, { + join_rows(df, df, relationship = "warn-many-to-many") + }) +}) + +# Deprecated behavior ---------------------------------------------------------- + +test_that("`multiple = NULL` is deprecated and results in `'all'` (#6731)", { + df1 <- tibble(x = c(1, 2)) + df2 <- tibble(x = c(2, 1, 2)) + + expect_snapshot({ + out <- join_rows(df1, df2, multiple = NULL) + }) + expect_identical(out$x, c(1L, 2L, 2L)) + expect_identical(out$y, c(2L, 1L, 3L)) + + expect_snapshot({ + left_join(df1, df2, by = join_by(x), multiple = NULL) + }) +}) + +test_that("`multiple = 'error'` is deprecated (#6731)", { + df1 <- tibble(x = c(1, 2)) + df2 <- tibble(x = c(2, 1, 2)) + + expect_snapshot(error = TRUE, { + join_rows(df1, df2, multiple = "error") + }) + expect_snapshot(error = TRUE, { + left_join(df1, df2, by = join_by(x), multiple = "error") + }) +}) + +test_that("`multiple = 'warning'` is deprecated (#6731)", { + df1 <- tibble(x = c(1, 2)) + df2 <- tibble(x = c(2, 1, 2)) + + expect_snapshot({ + out <- join_rows(df1, df2, multiple = "warning") + }) + expect_identical(out$x, c(1L, 2L, 2L)) + expect_identical(out$y, c(2L, 1L, 3L)) + + expect_snapshot({ + left_join(df1, df2, by = join_by(x), multiple = "warning") + }) +}) diff --git a/tests/testthat/test-join.R b/tests/testthat/test-join.R index d96bab5a29..b09ea2788b 100644 --- a/tests/testthat/test-join.R +++ b/tests/testthat/test-join.R @@ -257,30 +257,27 @@ test_that("join_filter() validates arguments", { }) }) -test_that("mutating joins trigger multiple match warning", { - df1 <- tibble(x = 1) - df2 <- tibble(x = c(1, 1)) - expect_snapshot(out <- left_join(df1, df2, join_by(x))) +test_that("mutating joins trigger many-to-many warning", { + df <- tibble(x = c(1, 1)) + expect_snapshot(out <- left_join(df, df, join_by(x))) }) -test_that("mutating joins don't trigger multiple match warning when called indirectly", { - df1 <- tibble(x = 1) - df2 <- tibble(x = c(1, 1)) +test_that("mutating joins don't trigger many-to-many warning when called indirectly", { + df <- tibble(x = c(1, 1)) - fn <- function(df1, df2, multiple = NULL) { - left_join(df1, df2, join_by(x), multiple = multiple) + fn <- function(df1, df2, relationship = NULL) { + left_join(df1, df2, join_by(x), relationship = relationship) } # Directly calling `left_join()` from a function you control results in a warning - expect_warning(fn(df1, df2), class = "dplyr_warning_join_matches_multiple") + expect_warning(fn(df, df), class = "dplyr_warning_join_relationship_many_to_many") # Now mimic calling an "rlang function" which you don't control that calls `left_join()` fn_env(fn) <- ns_env("rlang") # Indirectly calling `left_join()` through a function you don't control - # doesn't warn unless `multiple = "warning"` is explicitly set - expect_no_warning(fn(df1, df2), class = "dplyr_warning_join_matches_multiple") - expect_warning(fn(df1, df2, "warning"), class = "dplyr_warning_join_matches_multiple") + # doesn't warn + expect_no_warning(fn(df, df), class = "dplyr_warning_join_relationship_many_to_many") }) test_that("mutating joins compute common columns", { @@ -437,7 +434,7 @@ test_that("joins preserve groups", { gf1 <- tibble(a = 1:3) %>% group_by(a) gf2 <- tibble(a = rep(1:4, 2), b = 1) %>% group_by(b) - i <- count_regroups(out <- inner_join(gf1, gf2, by = "a", multiple = "all")) + i <- count_regroups(out <- inner_join(gf1, gf2, by = "a")) expect_equal(i, 1L) expect_equal(group_vars(out), "a") @@ -459,11 +456,11 @@ test_that("joins respect zero length groups", { df2 <- tibble(f = factor( c(2,2,3,3), levels = 1:3), y = c(1,2,3,4)) %>% group_by(f) - expect_equal(group_size(left_join( df1, df2, by = "f", multiple = "all")), c(2,4)) - expect_equal(group_size(right_join( df1, df2, by = "f", multiple = "all")), c(4,2)) - expect_equal(group_size(full_join( df1, df2, by = "f", multiple = "all")), c(2,4,2)) + expect_equal(group_size(left_join( df1, df2, by = "f", relationship = "many-to-many")), c(2,4)) + expect_equal(group_size(right_join( df1, df2, by = "f", relationship = "many-to-many")), c(4,2)) + expect_equal(group_size(full_join( df1, df2, by = "f", relationship = "many-to-many")), c(2,4,2)) expect_equal(group_size(anti_join( df1, df2, by = "f")), c(2)) - expect_equal(group_size(inner_join( df1, df2, by = "f", multiple = "all")), c(4)) + expect_equal(group_size(inner_join( df1, df2, by = "f", relationship = "many-to-many")), c(4)) df1 <- tibble(f = factor( c(1,1,2,2), levels = 1:3), x = c(1,2,1,4)) %>% @@ -471,11 +468,11 @@ test_that("joins respect zero length groups", { df2 <- tibble(f = factor( c(2,2,3,3), levels = 1:3), y = c(1,2,3,4)) %>% group_by(f, .drop = FALSE) - expect_equal(group_size(left_join( df1, df2, by = "f", multiple = "all")), c(2,4,0)) - expect_equal(group_size(right_join( df1, df2, by = "f", multiple = "all")), c(0,4,2)) - expect_equal(group_size(full_join( df1, df2, by = "f", multiple = "all")), c(2,4,2)) + expect_equal(group_size(left_join( df1, df2, by = "f", relationship = "many-to-many")), c(2,4,0)) + expect_equal(group_size(right_join( df1, df2, by = "f", relationship = "many-to-many")), c(0,4,2)) + expect_equal(group_size(full_join( df1, df2, by = "f", relationship = "many-to-many")), c(2,4,2)) expect_equal(group_size(anti_join( df1, df2, by = "f")), c(2,0,0)) - expect_equal(group_size(inner_join( df1, df2, by = "f", multiple = "all")), c(0,4,0)) + expect_equal(group_size(inner_join( df1, df2, by = "f", relationship = "many-to-many")), c(0,4,0)) }) test_that("group column names reflect renamed duplicate columns (#2330)", { @@ -492,7 +489,7 @@ test_that("rowwise group structure is updated after a join (#5227)", { df1 <- rowwise(tibble(x = 1:2)) df2 <- tibble(x = c(1:2, 2L)) - x <- left_join(df1, df2, by = "x", multiple = "all") + x <- left_join(df1, df2, by = "x") expect_identical(group_rows(x), list_of(1L, 2L, 3L)) }) @@ -521,13 +518,13 @@ test_that("`by = character()` technically respects `unmatched`", { }) }) -test_that("`by = character()` technically respects `multiple`", { +test_that("`by = character()` technically respects `relationship`", { local_options(lifecycle_verbosity = "quiet") df <- tibble(x = 1:2) expect_snapshot(error = TRUE, { - left_join(df, df, by = character(), multiple = "error") + left_join(df, df, by = character(), relationship = "many-to-one") }) })