[cbindlist/mergelist] Helpers for mergepair: onkeys/someCols/hasindex/fdistinct/dtmerge/seqexp/perhaps.data.table #6436

Draft · wants to merge 1 commit into base: cbindlist
23 changes: 16 additions & 7 deletions R/data.table.R
@@ -519,13 +519,22 @@ replace_dot_alias = function(e) {
if (!byjoin || nqbyjoin) {
# Really, `anyDuplicated` in base is AWESOME!
# allow.cartesian shouldn't error if a) not-join, b) 'i' has no duplicates
if (verbose) {last.started.at=proc.time();catf("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()}
irows = if (allLen1) f__ else vecseq(f__,len__,
if (allow.cartesian ||
notjoin || # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x).
!anyDuplicated(f__, incomparables = c(0L, NA_integer_))) {
NULL # #742. If 'i' has no duplicates, ignore
} else as.double(nrow(x)+nrow(i))) # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)).
if (verbose) {last.started.at=proc.time();cat("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()}
irows = if (allLen1) f__ else {
join.many = getOption("datatable.join.many") # #914, default TRUE for backward compatibility
anyDups = if (!join.many && length(f__)==1L && len__==nrow(x)) {
NULL # special case of a scalar i matching a constant duplicated x, not handled by anyDuplicated: data.table(x=c(1L,1L))[data.table(x=1L), on="x"]
} else if (!notjoin && ( # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x).
!allow.cartesian ||
!join.many))
as.logical(anyDuplicated(f__, incomparables = c(0L, NA_integer_)))
limit = if (!is.null(anyDups) && anyDups) { # #742. If 'i' has no duplicates, ignore
if (!join.many) stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.")
else if (!allow.cartesian && !notjoin) as.double(nrow(x)+nrow(i))
else internal_error("checking allow.cartesian and join.many, unexpected else branch reached") # nocov
}
vecseq(f__, len__, limit)
} # rows in i might not match to x, so the old max(nrow(x),nrow(i)) wasn't enough. This limit now only applies when duplicates are present, so nrow(x)+nrow(i) simply nails down a bound bigger than max(nrow(x),nrow(i)).
if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
# Fix for #1092 and #1074
# TODO: implement better version of "any"/"all"/"which" to avoid
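
For reference, a minimal sketch of the behaviour this hunk implements as written in this draft (the datatable.join.many option, its backward-compatible TRUE default, and the error wording are taken from the diff above):

x = data.table(a=c(1L,1L), v=1:2)
i = data.table(a=c(1L,1L))
x[i, on="a", allow.cartesian=TRUE]    # 4 rows: many-to-many expansion, allowed under the default join.many=TRUE
options(datatable.join.many=FALSE)
# x[i, on="a", allow.cartesian=TRUE]  # would now stop with "Joining resulted in many-to-many join ..."
options(datatable.join.many=TRUE)     # restore the backward-compatible default
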
123 changes: 123 additions & 0 deletions R/mergelist.R
@@ -7,3 +7,126 @@ cbindlist = function(l, copy=TRUE) {
setDT(ans)
ans
}

# when 'on' is missing then use keys, used only for inner and full join
onkeys = function(x, y) {
if (is.null(x) && !is.null(y)) y
else if (!is.null(x) && is.null(y)) x
else if (!is.null(x) && !is.null(y)) {
if (length(x)>=length(y)) intersect(y, x) ## align order to shorter|rhs key
else intersect(x, y)
} else NULL # nocov ## internal error is being called later in mergepair
}
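## Illustrative usage (sketch only, not part of this diff); inputs are made-up key vectors:
if (FALSE) {
  onkeys(c("a","b"), NULL)            # c("a","b"): only x is keyed
  onkeys(NULL, c("b","a"))            # c("b","a"): only y is keyed
  onkeys(c("a","b","c"), c("b","a"))  # c("b","a"): order aligned to the shorter (rhs) key
}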
someCols = function(x, cols, drop=character(), keep=character(), retain.order=FALSE) {
keep = colnamesInt(x, keep)
drop = colnamesInt(x, drop)
cols = colnamesInt(x, cols)
ans = union(keep, setdiff(cols, drop))
if (!retain.order) return(ans)
intersect(colnamesInt(x, NULL), ans)
}
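## Illustrative usage (sketch only, not part of this diff); returns integer column positions:
if (FALSE) {
  x = data.table(a=1L, b=2L, c=3L, d=4L)
  someCols(x, c("c","a"), drop="a", keep="d")                     # c(4L, 3L): 'keep' first, then 'cols' minus 'drop'
  someCols(x, c("c","a"), drop="a", keep="d", retain.order=TRUE)  # c(3L, 4L): reordered to match names(x)
}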
hasindex = function(x, by, retGrp=FALSE) {
index = attr(x, "index", TRUE)
if (is.null(index)) return(FALSE)
idx_name = paste0("__",by,collapse="")
idx = attr(index, idx_name, TRUE)
if (is.null(idx)) return(FALSE)
if (!retGrp) return(TRUE)
return(!is.null(attr(idx, "starts", TRUE)))
}
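## Illustrative usage (sketch only, not part of this diff); mirrors tests 3.01-3.08 in inst/tests/mergelist.Rraw:
if (FALSE) {
  d = data.table(x=c(2:1,2L), y=1:3)
  setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x", retGrp=TRUE))
  hasindex(d, "x")               # TRUE
  hasindex(d, "x", retGrp=TRUE)  # TRUE: the stored ordering carries its "starts" attribute
  setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x"))  # retGrp=FALSE index
  hasindex(d, "x", retGrp=TRUE)  # FALSE: no "starts" attribute on the index
  setattr(d, "index", NULL)
  hasindex(d, "x")               # FALSE: no index attribute at all
}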

# fdistinct applies mult='first|last'
# for mult='first' it is unique(x, by=on)[, c(on, cols), with=FALSE]
# it may not copy when copy=FALSE and x is unique by 'on'
fdistinct = function(x, on=key(x), mult=c("first","last"), cols=seq_along(x), copy=TRUE) {
if (!perhaps.data.table(x))
stopf("'x' must be data.table")
if (!is.character(on) || !length(on) || anyNA(on) || !all(on %chin% names(x)))
stopf("'on' must be character column names of 'x' argument")
mult = match.arg(mult)
if (is.null(cols))
cols = seq_along(x)
else if (!(is.character(cols) || is.integer(cols)) || !length(cols) || anyNA(cols))
stopf("'cols' must be non-zero length, non-NA, integer or character columns of 'x' argument")
if (!isTRUEorFALSE(copy))
stopf("'%s' must be TRUE or FALSE", "copy")
## do not compute sort=F for mult="first" if index (sort=T) already available, sort=T is needed only for mult="last"
## this short circuit will work after #4386 because it requires retGrp=T
#### sort = mult!="first" || hasindex(x, by=on, retGrp=TRUE)
sort = TRUE ## above line does not work for the moment, test 302.02
o = forderv(x, by=on, sort=sort, retGrp=TRUE)
if (attr(o, "maxgrpn", TRUE) <= 1L) {
ans = .shallow(x, someCols(x, cols, keep=on), retain.key=TRUE)
if (copy) ans = copy(ans)
return(ans)
}
f = attr(o, "starts", exact=TRUE)
if (mult=="last") {
if (!sort) internal_error("sort must be TRUE when computing mult='last'") # nocov
f = c(f[-1L]-1L, nrow(x)) ## last of each group
}
if (length(o)) f = o[f]
if (sort && length(o <- forderv(f))) f = f[o] ## this rolls back to original order
.Call(CsubsetDT, x, f, someCols(x, cols, keep=on))
}
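
## Illustrative usage (sketch only, not part of this diff); expected results mirror tests 2.09-2.10 in inst/tests/mergelist.Rraw:
if (FALSE) {
  d = data.table(x=c(2:1,2L), y=1:3)
  fdistinct(d, on="x", mult="first")  # data.table(x=2:1, y=1:2): first row of each 'x' group, rows kept in original order
  fdistinct(d, on="x", mult="last")   # data.table(x=1:2, y=2:3): last row of each 'x' group
}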

# extra layer over bmerge to provide ready-to-use row indices (or NULL for 1:nrow)
# NULL avoids extra copies in downstream code; avoiding copies precisely turned out to be costly and to enormously complicate the code, so it needs #4409 and/or handling of 1:nrow in subsetDT
dtmerge = function(x, i, on, how, mult, join.many, void=FALSE, verbose) {
nomatch = switch(how, "inner"=, "semi"=, "anti"=, "cross"= 0L, "left"=, "right"=, "full"= NA_integer_)
nomatch0 = identical(nomatch, 0L)
if (is.null(mult))
mult = switch(how, "semi"=, "anti"= "last", "cross"= "all", "inner"=, "left"=, "right"=, "full"= "error")
if (void && mult!="error")
internal_error("void must be used with mult='error'") # nocov
if (how=="cross") { ## short-circuit bmerge results only for cross join
if (length(on) || mult!="all" || !join.many)
stopf("cross join must be used with zero-length on, mult='all', join.many=TRUE")
if (void)
internal_error("cross join must be used with void=FALSE") # nocov
ans = list(allLen1=FALSE, starts=rep.int(1L, nrow(i)), lens=rep.int(nrow(x), nrow(i)), xo=integer())
} else {
if (!length(on))
stopf("'on' must be non-zero length character vector")
if (mult=="all" && (how=="semi" || how=="anti"))
stopf("semi and anti joins must be used with mult!='all'")
icols = colnamesInt(i, on, check_dups=TRUE)
xcols = colnamesInt(x, on, check_dups=TRUE)
ans = bmerge(i, x, icols, xcols, roll=0, rollends=c(FALSE, TRUE), nomatch=nomatch, mult=mult, ops=rep.int(1L, length(on)), verbose=verbose)
if (void) { ## void=TRUE is only for the case when we want to raise an error for mult='error', and that would have happened in the line above
return(invisible(NULL))
} else if (how=="semi" || how=="anti") { ## semi and anti short-circuit
irows = which(if (how=="semi") ans$lens!=0L else ans$lens==0L) ## we will subset i rather than x, thus assign to irows, not to xrows
if (length(irows)==length(ans$lens)) irows = NULL
return(list(ans=ans, irows=irows))
} else if (mult=="all" && !ans$allLen1 && !join.many && ## join.many, like allow.cartesian, check
!(length(ans$starts)==1L && ans$lens==nrow(x)) && ## special case of a scalar i matching a constant duplicated x, not handled by anyDuplicated: data.table(x=c(1L,1L))[data.table(x=1L), on="x"]
anyDuplicated(ans$starts, incomparables=c(0L,NA_integer_))
)
stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.")
}

## xrows, join-to
xrows = if (ans$allLen1) ans$starts else vecseq(ans$starts, ans$lens, NULL)
if (nomatch0 && ans$allLen1) xrows = xrows[as.logical(ans$lens)]
len.x = length(xrows) ## as of now cannot optimize to NULL, search for #4409 here

## irows, join-from
irows = if (!(ans$allLen1 && (!nomatch0 || len.x==length(ans$starts)))) seqexp(ans$lens)
len.i = if (is.null(irows)) nrow(i) else length(irows)

if (length(ans$xo) && length(xrows))
xrows = ans$xo[xrows]
len.x = length(xrows)

if (len.i!=len.x)
internal_error("dtmerge out len.i != len.x") # nocov

return(list(ans=ans, irows=irows, xrows=xrows))
}
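## Illustrative usage (sketch only, not part of this diff); column names below are made up:
if (FALSE) {
  x = data.table(id=1:3, v=1:3)
  i = data.table(id=2:4)
  m = dtmerge(x, i, on="id", how="left", mult="all", join.many=TRUE, verbose=FALSE)
  # m$xrows: rows of 'x' to take (the join-to side); m$irows: rows of 'i' to take (the join-from side),
  # or NULL meaning seq_len(nrow(i)); both sides always describe the same result length
}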

# Previously, we had a custom C implementation here, which is ~2x faster,
# but this is fast enough we don't bother maintaining a new routine.
# Hopefully in the future rep() can recognize the ALTREP and use that, too.
seqexp = function(x) rep(seq_along(x), x)
perhaps.data.table = function(x) .Call(CperhapsDataTableR, x)
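## Illustrative usage (sketch only, not part of this diff); perhaps.data.table behaviour mirrors tests 1.01-1.04 in inst/tests/mergelist.Rraw:
if (FALSE) {
  seqexp(c(2L,0L,3L))                     # c(1L,1L,3L,3L,3L)
  perhaps.data.table(list(a=1:2, b=1:2))  # TRUE : fully named list of columns
  perhaps.data.table(list(1:2, 1:2))      # FALSE: unnamed list
}
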
74 changes: 73 additions & 1 deletion inst/tests/mergelist.Rraw
@@ -6,10 +6,49 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
} else {
require(data.table)
test = data.table:::test
perhaps.data.table = data.table:::perhaps.data.table
hasindex = data.table:::hasindex
fdistinct = data.table:::fdistinct
forderv = data.table:::forderv
}

addresses = function(x) vapply(x, address, "")

# internal helpers

test(1.01, perhaps.data.table(list()))
test(1.02, perhaps.data.table(list(a=1:2)))
test(1.03, perhaps.data.table(list(a=1:2, b=1:2)))
test(1.04, perhaps.data.table(list(1:2, 1:2)), FALSE)

test(2.01, fdistinct(list(x=c(1L,1:2), b=1:2), on="x", mult="last"), error="must be data.table")
test(2.02, fdistinct(data.table(x=c(1L,1:2)), on="z", mult="last"), error="must be character column names of")
test(2.03, fdistinct(data.table(x=c(1L,1:2)), on="x", mult="last", cols=character()), error="must be non-zero length, non-NA, integer or character columns of")
test(2.04, fdistinct(data.table(x=c(1L,1:2), y=1:3), on="x", mult="last", copy=NA), error="must be TRUE or FALSE")
d = data.table(x=1:2, y=1:2)
test(2.05, ans<-fdistinct(d, on="x", mult="last"), d)
test(2.06, intersect(addresses(ans), addresses(d)), character())
test(2.07, ans<-fdistinct(d, on="x", mult="last", copy=FALSE), d)
test(2.08, addresses(ans), addresses(d))
d = data.table(x=c(2:1,2L), y=1:3)
test(2.09, fdistinct(d, on="x", mult="first"), data.table(x=2:1, y=1:2))
test(2.10, fdistinct(d, on="x", mult="last"), data.table(x=1:2, y=2:3))
setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x", retGrp=TRUE)) ## retGrp=T index #4386
test(2.11, fdistinct(d, on="x", mult="first"), data.table(x=2:1, y=1:2))

test(3.01, hasindex(d, "x"))
test(3.02, hasindex(d, "x", retGrp=TRUE))
setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x")) ## retGrp=F index #4386
test(3.03, hasindex(d, "x"))
test(3.04, !hasindex(d, "x", retGrp=TRUE))
setattr(d, "index", NULL)
test(3.05, !hasindex(d, "x"))
test(3.06, !hasindex(d, "x", retGrp=TRUE))
setattr(d, "index", integer())
test(3.07, !hasindex(d, "x"))
test(3.08, !hasindex(d, "x", retGrp=TRUE))
rm(d)

# cbindlist

l = list(
@@ -57,7 +96,7 @@ attributes(d) = a[!names(a) %in% c("class",".internal.selfref","row.names")]
test(13.01, class(d), "list")
setDT(d)
test(13.02, key(d), "x")
# test(13.03, hasindex(d, "y") && hasindex(d, "z"))
test(13.03, hasindex(d, "y") && hasindex(d, "z"))
l = list(
data.table(id1=1:5, id2=5:1, id3=1:5, v1=1:5),
data.table(id4=5:1, id5=1:5, v2=1:5),
@@ -70,3 +109,36 @@ ans = cbindlist(l)
test(13.04, key(ans), "id1")
test(13.05, indices(ans), c("id1","id2","id3","id1__id2__id3","id6","id7","id9"))
test(13.06, ii, lapply(l, indices)) ## this tests that original indices have not been touched, shallow_duplicate in mergeIndexAttrib

## fdistinct, another round

dt = data.table(x =
c(74L, 103L, 158L, 250L, 56L, 248L, 260L, 182L, 174L, 17L, 57L,
49L, 189L, 106L, 212L, 137L, 198L, 273L, 105L, 214L, 258L, 59L,
180L, 35L, 74L, 107L, 4L, 106L, 240L, 94L, 133L, 165L, 136L,
52L, 228L, 184L, 219L, 30L, 200L, 114L, 226L, 178L, 216L, 153L,
146L, 218L, 7L, 132L, 202L, 191L, 132L, 237L, 121L, 68L, 20L,
28L, 87L, 143L, 183L, 112L, 252L, 81L, 127L, 92L, 179L, 71L,
132L, 211L, 24L, 241L, 94L, 231L, 96L, 92L, 131L, 246L, 238L,
108L, 214L, 265L, 120L, 196L, 110L, 90L, 209L, 56L, 196L, 34L,
68L, 40L, 66L, 17L, 177L, 241L, 215L, 220L, 126L, 113L, 223L,
167L, 181L, 98L, 75L, 273L, 175L, 59L, 36L, 132L, 255L, 165L,
269L, 202L, 99L, 119L, 41L, 4L, 197L, 29L, 123L, 177L, 273L,
137L, 134L, 48L, 208L, 125L, 141L, 58L, 63L, 164L, 159L, 22L,
10L, 177L, 256L, 165L, 155L, 145L, 271L, 140L, 188L, 166L, 66L,
71L, 201L, 125L, 49L, 206L, 29L, 238L, 170L, 154L, 91L, 125L,
138L, 50L, 146L, 21L, 77L, 59L, 79L, 247L, 123L, 215L, 243L,
114L, 18L, 93L, 200L, 93L, 174L, 232L, 236L, 108L, 105L, 247L,
178L, 204L, 167L, 249L, 81L, 53L, 244L, 139L, 242L, 53L, 209L,
200L, 260L, 151L, 196L, 107L, 28L, 256L, 78L, 163L, 31L, 232L,
88L, 216L, 74L, 61L, 143L, 74L, 50L, 143L, 155L, 36L, 71L, 198L,
265L, 28L, 210L, 261L, 226L, 85L, 179L, 263L, 263L, 94L, 73L,
46L, 89L, 141L, 255L, 141L, 71L, 13L, 115L, 235L, 96L, 37L, 103L,
174L, 108L, 190L, 190L, 153L, 119L, 125L, 85L, 160L, 251L, 40L,
115L, 59L, 118L, 37L, 127L, 260L, 210L, 257L, 130L, 166L, 134L,
30L, 69L, 138L, 103L, 258L, 145L, 88L, 77L, 217L, 194L, 46L,
18L, 208L, 171L, 47L, 18L, 30L, 105L, 47L, 83L)
)
ans = unique(dt, by="x")
test(301.01, data.table(x=unique(dt$x)), ans) ## OK
test(301.02, fdistinct(dt, on="x"), ans) ## force sort=TRUE for the moment
1 change: 1 addition & 0 deletions src/init.c
@@ -150,6 +150,7 @@ R_CallMethodDef callMethods[] = {
{"CconvertDate", (DL_FUNC)&convertDate, -1},
{"Cnotchin", (DL_FUNC)&notchin, -1},
{"Ccbindlist", (DL_FUNC) &cbindlist, -1},
{"CperhapsDataTableR", (DL_FUNC) &perhapsDataTableR, -1},
{"Cwarn_matrix_column_r", (DL_FUNC)&warn_matrix_column_r, -1},
{NULL, NULL, 0}
};