Skip to content

Commit

Permalink
Implement symdiff()
Browse files Browse the repository at this point in the history
Fixes #4811
  • Loading branch information
hadley committed Aug 3, 2022
1 parent 0dd7eea commit 7eecd88
Show file tree
Hide file tree
Showing 6 changed files with 102 additions and 39 deletions.
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,8 @@ S3method(summarise_bullets,"dplyr:::summarise_incompatible_size")
S3method(summarise_bullets,"dplyr:::summarise_mixed_null")
S3method(summarise_bullets,"dplyr:::summarise_unsupported_type")
S3method(summarise_bullets,default)
S3method(symdiff,data.frame)
S3method(symdiff,default)
S3method(tally,data.frame)
S3method(tbl,DBIConnection)
S3method(tbl,src_local)
Expand Down Expand Up @@ -440,6 +442,7 @@ export(summarize_each)
export(summarize_each_)
export(summarize_if)
export(sym)
export(symdiff)
export(syms)
export(tally)
export(tally_)
Expand Down
4 changes: 1 addition & 3 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
# dplyr (development version)

* Passing `...` to `across()` is now deprecated because the evaluation timing of
`...` is ambiguous. Now instead of (e.g.) `across(a:b, mean, na.rm = TRUE)`
you should write `across(a:b, ~ mean(.x, na.rm = TRUE))` (#6073).
* New `symdiff()` function computes the symmetric difference (#4811).

* Rowwise-`mutate()` behaves a little better with 0-row inputs (#6303).

Expand Down
65 changes: 47 additions & 18 deletions R/sets.r
Original file line number Diff line number Diff line change
Expand Up @@ -7,41 +7,47 @@
#' * `union(x, y)` finds all rows in either `x` or `y`, excluding duplicates.
#' * `union_all(x, y)` finds all rows in either `x` or `y`, including duplicates.
#' * `setdiff(x, y)` finds all rows in `x` that aren't in `y`.
#' * `symdiff(x, y)` computes the symmetric difference, i.e. all rows in
#' `x` that aren't in `y` and all rows in `y` that aren't in `x`.
#' * `setequal(x, y)` returns `TRUE` if `x` and `y` contain the same rows
#' (ignoring order).
#'
#' Note that `intersect()`, `union()` and `setdiff()` remove duplicates
#' in `x` and `y`.
#' Note that `intersect()`, `union()`, `setdiff()` and `symdiff()` remove
#' duplicates in `x` and `y`.
#'
#' # Base functions
#' `intersect()`, `union()`, `setdiff()`, and `setequal()` override the base
#' functions of the same name in order to make them generic. The existing
#' behaviour for vectors is preserved by providing default methods that call
#' the base functions.
#'
#' @param x,y Pair of data frames.
#' @param x,y Pair of compatible data frames. A pair of data frames is
#' compatible if they have the same column names (possibly in different
#' orders) and compatible types.
#' @inheritParams rlang::args_dots_empty
#' @name setops
#' @examples
#' mtcars$model <- rownames(mtcars)
#' first <- mtcars[1:20, ]
#' second <- mtcars[10:32, ]
#' df1 <- tibble(x = 1:3)
#' df2 <- tibble(x = 3:5)
#'
#' intersect(first, second)
#' union(first, second)
#' setdiff(first, second)
#' setdiff(second, first)
#' intersect(df1, df2)
#' union(df1, df2)
#' union_all(df1, df2)
#' setdiff(df1, df2)
#' setdiff(df2, df1)
#' symdiff(df1, df2)
#'
#' union_all(first, second)
#' setequal(mtcars, mtcars[32:1, ])
#' setequal(df1, df2)
#' setequal(df1, df1[3:1, ])
#'
#' # Note the following 3 functions also remove pre-existing duplicates in `x` or `y`:
#' a <- data.frame(x = c(1:3, 3, 3))
#' b <- data.frame(x = c(3:5, 5))
#' # Note that the following functions remove pre-existing duplicates:
#' df1 <- tibble(x = c(1:3, 3, 3))
#' df2 <- tibble(x = c(3:5, 5))
#'
#' intersect(a, b)
#' union(a, b)
#' setdiff(a, b)
#' intersect(df1, df2)
#' union(df1, df2)
#' setdiff(df1, df2)
#' symdiff(df1, df2)
NULL

#' @name setops
Expand Down Expand Up @@ -82,6 +88,17 @@ NULL
#' @export setequal
NULL

#' @rdname setops
#' @export
# S3 generic: UseMethod() picks the `symdiff.*` method matching the class of
# `x`; `y` and `...` are forwarded to that method untouched.
symdiff <- function(x, y, ...) UseMethod("symdiff")
#' @export
symdiff.default <- function(x, y, ...) {
  # No extra arguments are accepted; anything passed via `...` is an error.
  check_dots_empty()

  # Symmetric difference: everything found in either input, minus whatever
  # the two inputs have in common.
  in_either <- union(x, y)
  in_both <- intersect(x, y)
  setdiff(in_either, in_both)
}

#' @export
intersect.data.frame <- function(x, y, ...) {
check_dots_empty()
Expand Down Expand Up @@ -134,6 +151,18 @@ setequal.data.frame <- function(x, y, ...) {
all(vec_in(cast$x, cast$y)) && all(vec_in(cast$y, cast$x))
}

#' @export
symdiff.data.frame <- function(x, y, ...) {
  # Validate the call before doing any work: no stray arguments, and the two
  # data frames must pass the compatibility check.
  check_dots_empty()
  check_compatible(x, y)

  # Cast both inputs to their common type so rows can be compared directly.
  common <- vec_cast_common(x = x, y = y)
  lhs <- common$x
  rhs <- common$y

  # Rows that appear on exactly one side.
  x_only <- vec_slice(lhs, !vec_in(lhs, rhs))
  y_only <- vec_slice(rhs, !vec_in(rhs, lhs))

  # Stack `x`-only rows first, then `y`-only rows, and drop repeated rows.
  stacked <- vec_rbind(x_only, y_only)
  deduped <- vec_unique(stacked)

  # `x` is used as the template for the returned object.
  dplyr_reconstruct(deduped, x)
}

# Helpers -----------------------------------------------------------------

Expand Down
45 changes: 27 additions & 18 deletions man/setops.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions tests/testthat/_snaps/sets.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@
! `...` must be empty.
x Problematic argument:
* z = 3
Code
symdiff(df1, df2, z = 3)
Condition
Error in `symdiff()`:
! `...` must be empty.
x Problematic argument:
* z = 3

# incompatible data frames error (#903)

Expand Down Expand Up @@ -55,6 +62,12 @@
Error in `setdiff()`:
! `x` and `y` are not compatible.
x Different number of columns: 1 vs 2.
Code
symdiff(df1, df2)
Condition
Error in `symdiff()`:
! `x` and `y` are not compatible.
x Different number of columns: 1 vs 2.

# is_compatible generates useful messages for different cases

Expand Down
11 changes: 11 additions & 0 deletions tests/testthat/test-sets.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ test_that("x used as basis of output (#3839)", {
expect_equal(union(df1, df2), tibble(x = 1:4, y = 1))
expect_equal(union_all(df1, df2), tibble(x = c(1:4, 4, 2), y = 1))
expect_equal(setdiff(df1, df2), tibble(x = c(1, 3), y = 1))
expect_equal(symdiff(df1, df2), tibble(x = c(1, 3), y = 1))
})

test_that("set operations (apart from union_all) remove duplicates", {
Expand All @@ -16,7 +17,9 @@ test_that("set operations (apart from union_all) remove duplicates", {
expect_equal(union(df1, df2), tibble(x = c(1, 2)))
expect_equal(union_all(df1, df2), tibble(x = c(1, 1, 2, 2)))
expect_equal(setdiff(df1, df2), tibble(x = 1))
expect_equal(symdiff(df1, df2), tibble(x = 1))
})

test_that("standard coercion rules are used (#799)", {
df1 <- tibble(x = 1:2, y = c(1, 1))
df2 <- tibble(x = 1:2, y = 1:2)
Expand All @@ -25,6 +28,7 @@ test_that("standard coercion rules are used (#799)", {
expect_equal(nrow(union(df1, df2)), 3)
expect_equal(nrow(union_all(df1, df2)), 4)
expect_equal(nrow(setdiff(df1, df2)), 1)
expect_equal(nrow(symdiff(df1, df2)), 2)
})

test_that("grouping metadata is reconstructed (#3587)", {
Expand All @@ -35,13 +39,17 @@ test_that("grouping metadata is reconstructed (#3587)", {
expect_equal(group_vars(union(df1, df2)), "g")
expect_equal(group_vars(union_all(df1, df2)), "g")
expect_equal(group_vars(setdiff(df1, df2)), "g")
expect_equal(group_vars(symdiff(df1, df2)), "g")
})

test_that("also work with vectors", {
# Each set operation is called on plain atomic vectors rather than data
# frames, and the result is compared against the expected vector.
expect_equal(intersect(1:3, 3:4), 3)
expect_equal(union(1:3, 3:4), 1:4)
expect_equal(union_all(1:3, 3:4), c(1:3, 3:4))
expect_equal(setdiff(1:3, 3:4), 1:2)
# 3 is the only shared value, so it is the only one missing from the result.
expect_equal(symdiff(1:3, 3:4), c(1, 2, 4))
# symdiff() also removes duplicates that are present within each input
expect_equal(symdiff(c(1, 1, 2), c(2, 2, 3)), c(1, 3))
})

test_that("extra arguments in ... error (#5891)", {
Expand All @@ -53,6 +61,7 @@ test_that("extra arguments in ... error (#5891)", {
union(df1, df2, z = 3)
union_all(df1, df2, z = 3)
setdiff(df1, df2, z = 3)
symdiff(df1, df2, z = 3)
})
})

Expand All @@ -65,6 +74,7 @@ test_that("incompatible data frames error (#903)", {
union(df1, df2)
union_all(df1, df2)
setdiff(df1, df2)
symdiff(df1, df2)
})
})

Expand Down Expand Up @@ -118,3 +128,4 @@ test_that("setequal checks y is a data frame", {
test_that("setequal checks for extra arguments", {
# Snapshot test: the call passes an extra `z` argument and is expected to
# raise an error (`error = TRUE`); the output is recorded in the snapshot.
expect_snapshot(setequal(mtcars, mtcars, z = 2), error = TRUE)
})

0 comments on commit 7eecd88

Please sign in to comment.