[cbindlist/mergelist] Helpers for mergepair: onkeys/someCols/hasindex/fdistinct/dtmerge/seqexp/perhaps.data.table #6436

Draft · wants to merge 1 commit into base: cbindlist
23 changes: 16 additions & 7 deletions R/data.table.R
@@ -519,13 +519,22 @@ replace_dot_alias = function(e) {
if (!byjoin || nqbyjoin) {
# Really, `anyDuplicated` in base is AWESOME!
# allow.cartesian shouldn't error if a) not-join, b) 'i' has no duplicates
if (verbose) {last.started.at=proc.time();catf("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()}
irows = if (allLen1) f__ else vecseq(f__,len__,
if (allow.cartesian ||
notjoin || # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x).
!anyDuplicated(f__, incomparables = c(0L, NA_integer_))) {
NULL # #742. If 'i' has no duplicates, ignore
} else as.double(nrow(x)+nrow(i))) # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)).
if (verbose) {last.started.at=proc.time();cat("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()}
irows = if (allLen1) f__ else {
join.many = getOption("datatable.join.many") # #914, default TRUE for backward compatibility
anyDups = if (!join.many && length(f__)==1L && len__==nrow(x)) {
NULL # special case of a scalar i matching a constant duplicated x, not handled by anyDuplicated: data.table(x=c(1L,1L))[data.table(x=1L), on="x"]
} else if (!notjoin && ( # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x).
!allow.cartesian ||
!join.many))
as.logical(anyDuplicated(f__, incomparables = c(0L, NA_integer_)))
limit = if (!is.null(anyDups) && anyDups) { # #742. If 'i' has no duplicates, ignore
if (!join.many) stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.")
else if (!allow.cartesian && !notjoin) as.double(nrow(x)+nrow(i))
else internal_error("checking allow.cartesian and join.many, unexpected else branch reached") # nocov
}
vecseq(f__, len__, limit)
} # rows in i might not match to x, so the old max(nrow(x),nrow(i)) wasn't enough. This limit now only applies when duplicates are present, so nrow(x)+nrow(i) simply nails down a bound bigger than max(nrow(x),nrow(i)).
if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
# Fix for #1092 and #1074
# TODO: implement better version of "any"/"all"/"which" to avoid
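
For reference, a minimal sketch of the behaviour this hunk implements as written in this draft (the datatable.join.many option, its backward-compatible TRUE default, and the error wording are taken from the diff above):

x = data.table(a=c(1L,1L), v=1:2)
i = data.table(a=c(1L,1L))
x[i, on="a", allow.cartesian=TRUE]    # 4 rows: many-to-many expansion, allowed under the default join.many=TRUE
options(datatable.join.many=FALSE)
# x[i, on="a", allow.cartesian=TRUE]  # would now stop with "Joining resulted in many-to-many join ..."
options(datatable.join.many=TRUE)     # restore the backward-compatible default
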
123 changes: 123 additions & 0 deletions R/mergelist.R
@@ -7,3 +7,126 @@ cbindlist = function(l, copy=TRUE) {
setDT(ans)
ans
}

# when 'on' is missing then use keys, used only for inner and full join
onkeys = function(x, y) {
if (is.null(x) && !is.null(y)) y
else if (!is.null(x) && is.null(y)) x
else if (!is.null(x) && !is.null(y)) {
if (length(x)>=length(y)) intersect(y, x) ## align order to shorter|rhs key
else intersect(x, y)
} else NULL # nocov ## internal error is being called later in mergepair
}
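## Illustrative usage (sketch only, not part of this diff); inputs are made-up key vectors:
if (FALSE) {
  onkeys(c("a","b"), NULL)            # c("a","b"): only x is keyed
  onkeys(NULL, c("b","a"))            # c("b","a"): only y is keyed
  onkeys(c("a","b","c"), c("b","a"))  # c("b","a"): order aligned to the shorter (rhs) key
}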
someCols = function(x, cols, drop=character(), keep=character(), retain.order=FALSE) {
keep = colnamesInt(x, keep)
drop = colnamesInt(x, drop)
cols = colnamesInt(x, cols)
ans = union(keep, setdiff(cols, drop))
if (!retain.order) return(ans)
intersect(colnamesInt(x, NULL), ans)
}
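## Illustrative usage (sketch only, not part of this diff); returns integer column positions:
if (FALSE) {
  x = data.table(a=1L, b=2L, c=3L, d=4L)
  someCols(x, c("c","a"), drop="a", keep="d")                     # c(4L, 3L): 'keep' first, then 'cols' minus 'drop'
  someCols(x, c("c","a"), drop="a", keep="d", retain.order=TRUE)  # c(3L, 4L): reordered to match names(x)
}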
hasindex = function(x, by, retGrp=FALSE) {
index = attr(x, "index", TRUE)
if (is.null(index)) return(FALSE)
idx_name = paste0("__",by,collapse="")
idx = attr(index, idx_name, TRUE)
if (is.null(idx)) return(FALSE)
if (!retGrp) return(TRUE)
return(!is.null(attr(idx, "starts", TRUE)))
}
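## Illustrative usage (sketch only, not part of this diff); mirrors tests 3.01-3.08 in inst/tests/mergelist.Rraw:
if (FALSE) {
  d = data.table(x=c(2:1,2L), y=1:3)
  setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x", retGrp=TRUE))
  hasindex(d, "x")               # TRUE
  hasindex(d, "x", retGrp=TRUE)  # TRUE: the stored ordering carries its "starts" attribute
  setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x"))  # retGrp=FALSE index
  hasindex(d, "x", retGrp=TRUE)  # FALSE: no "starts" attribute on the index
  setattr(d, "index", NULL)
  hasindex(d, "x")               # FALSE: no index attribute at all
}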

# fdistinct applies mult='first|last'
# for mult='first' it is unique(x, by=on)[, c(on, cols), with=FALSE]
# it may not copy when copy=FALSE and x is unique by 'on'
fdistinct = function(x, on=key(x), mult=c("first","last"), cols=seq_along(x), copy=TRUE) {
if (!perhaps.data.table(x))
stopf("'x' must be data.table")
if (!is.character(on) || !length(on) || anyNA(on) || !all(on %chin% names(x)))
stopf("'on' must be character column names of 'x' argument")
mult = match.arg(mult)
if (is.null(cols))
cols = seq_along(x)
else if (!(is.character(cols) || is.integer(cols)) || !length(cols) || anyNA(cols))
stopf("'cols' must be non-zero length, non-NA, integer or character columns of 'x' argument")
if (!isTRUEorFALSE(copy))
stopf("'%s' must be TRUE or FALSE", "copy")
## do not compute sort=F for mult="first" if index (sort=T) already available, sort=T is needed only for mult="last"
## this short circuit will work after #4386 because it requires retGrp=T
#### sort = mult!="first" || hasindex(x, by=on, retGrp=TRUE)
sort = TRUE ## above line does not work for the moment, test 302.02
o = forderv(x, by=on, sort=sort, retGrp=TRUE)
if (attr(o, "maxgrpn", TRUE) <= 1L) {
ans = .shallow(x, someCols(x, cols, keep=on), retain.key=TRUE)
if (copy) ans = copy(ans)
return(ans)
}
f = attr(o, "starts", exact=TRUE)
if (mult=="last") {
if (!sort) internal_error("sort must be TRUE when computing mult='last'") # nocov
f = c(f[-1L]-1L, nrow(x)) ## last of each group
}
if (length(o)) f = o[f]
if (sort && length(o <- forderv(f))) f = f[o] ## this rolls back to original order
.Call(CsubsetDT, x, f, someCols(x, cols, keep=on))
}
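
## Illustrative usage (sketch only, not part of this diff); expected results mirror tests 2.09-2.10 in inst/tests/mergelist.Rraw:
if (FALSE) {
  d = data.table(x=c(2:1,2L), y=1:3)
  fdistinct(d, on="x", mult="first")  # data.table(x=2:1, y=1:2): first row of each 'x' group, rows kept in original order
  fdistinct(d, on="x", mult="last")   # data.table(x=1:2, y=2:3): last row of each 'x' group
}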

# extra layer over bmerge to provide ready-to-use row indices (or NULL for 1:nrow)
# NULL avoids extra copies in downstream code; avoiding copies precisely turned out to be costly and to enormously complicate the code, so it needs #4409 and/or handling of 1:nrow in subsetDT
dtmerge = function(x, i, on, how, mult, join.many, void=FALSE, verbose) {
nomatch = switch(how, "inner"=, "semi"=, "anti"=, "cross"= 0L, "left"=, "right"=, "full"= NA_integer_)
nomatch0 = identical(nomatch, 0L)
if (is.null(mult))
mult = switch(how, "semi"=, "anti"= "last", "cross"= "all", "inner"=, "left"=, "right"=, "full"= "error")
if (void && mult!="error")
internal_error("void must be used with mult='error'") # nocov
if (how=="cross") { ## short-circuit bmerge results only for cross join
if (length(on) || mult!="all" || !join.many)
stopf("cross join must be used with zero-length on, mult='all', join.many=TRUE")
if (void)
internal_error("cross join must be used with void=FALSE") # nocov
ans = list(allLen1=FALSE, starts=rep.int(1L, nrow(i)), lens=rep.int(nrow(x), nrow(i)), xo=integer())
} else {
if (!length(on))
stopf("'on' must be non-zero length character vector")
if (mult=="all" && (how=="semi" || how=="anti"))
stopf("semi and anti joins must be used with mult!='all'")
icols = colnamesInt(i, on, check_dups=TRUE)
xcols = colnamesInt(x, on, check_dups=TRUE)
ans = bmerge(i, x, icols, xcols, roll=0, rollends=c(FALSE, TRUE), nomatch=nomatch, mult=mult, ops=rep.int(1L, length(on)), verbose=verbose)
if (void) { ## void=TRUE is only for the case when we want to raise an error for mult='error', and that would have happened in the line above
return(invisible(NULL))
} else if (how=="semi" || how=="anti") { ## semi and anti short-circuit
irows = which(if (how=="semi") ans$lens!=0L else ans$lens==0L) ## we will subset i rather than x, thus assign to irows, not to xrows
if (length(irows)==length(ans$lens)) irows = NULL
return(list(ans=ans, irows=irows))
} else if (mult=="all" && !ans$allLen1 && !join.many && ## join.many, like allow.cartesian, check
!(length(ans$starts)==1L && ans$lens==nrow(x)) && ## special case of a scalar i matching a constant duplicated x, not handled by anyDuplicated: data.table(x=c(1L,1L))[data.table(x=1L), on="x"]
anyDuplicated(ans$starts, incomparables=c(0L,NA_integer_))
)
stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.")
}

## xrows, join-to
xrows = if (ans$allLen1) ans$starts else vecseq(ans$starts, ans$lens, NULL)
if (nomatch0 && ans$allLen1) xrows = xrows[as.logical(ans$lens)]
len.x = length(xrows) ## as of now cannot optimize to NULL, search for #4409 here

## irows, join-from
irows = if (!(ans$allLen1 && (!nomatch0 || len.x==length(ans$starts)))) seqexp(ans$lens)
len.i = if (is.null(irows)) nrow(i) else length(irows)

if (length(ans$xo) && length(xrows))
xrows = ans$xo[xrows]
len.x = length(xrows)

if (len.i!=len.x)
internal_error("dtmerge out len.i != len.x") # nocov

return(list(ans=ans, irows=irows, xrows=xrows))
}
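## Illustrative usage (sketch only, not part of this diff); column names below are made up:
if (FALSE) {
  x = data.table(id=1:3, v=1:3)
  i = data.table(id=2:4)
  m = dtmerge(x, i, on="id", how="left", mult="all", join.many=TRUE, verbose=FALSE)
  # m$xrows: rows of 'x' to take (the join-to side); m$irows: rows of 'i' to take (the join-from side),
  # or NULL meaning seq_len(nrow(i)); both sides always describe the same result length
}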

# Previously, we had a custom C implementation here, which is ~2x faster,
# but this is fast enough we don't bother maintaining a new routine.
# Hopefully in the future rep() can recognize the ALTREP and use that, too.
seqexp = function(x) rep(seq_along(x), x)
perhaps.data.table = function(x) .Call(CperhapsDataTableR, x)
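## Illustrative usage (sketch only, not part of this diff); perhaps.data.table behaviour mirrors tests 1.01-1.04 in inst/tests/mergelist.Rraw:
if (FALSE) {
  seqexp(c(2L,0L,3L))                     # c(1L,1L,3L,3L,3L)
  perhaps.data.table(list(a=1:2, b=1:2))  # TRUE : fully named list of columns
  perhaps.data.table(list(1:2, 1:2))      # FALSE: unnamed list
}
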
74 changes: 73 additions & 1 deletion inst/tests/mergelist.Rraw
@@ -6,10 +6,49 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
} else {
require(data.table)
test = data.table:::test
perhaps.data.table = data.table:::perhaps.data.table
hasindex = data.table:::hasindex
fdistinct = data.table:::fdistinct
forderv = data.table:::forderv
}

addresses = function(x) vapply(x, address, "")

# internal helpers

test(1.01, perhaps.data.table(list()))
test(1.02, perhaps.data.table(list(a=1:2)))
test(1.03, perhaps.data.table(list(a=1:2, b=1:2)))
test(1.04, perhaps.data.table(list(1:2, 1:2)), FALSE)

test(2.01, fdistinct(list(x=c(1L,1:2), b=1:2), on="x", mult="last"), error="must be data.table")
test(2.02, fdistinct(data.table(x=c(1L,1:2)), on="z", mult="last"), error="must be character column names of")
test(2.03, fdistinct(data.table(x=c(1L,1:2)), on="x", mult="last", cols=character()), error="must be non-zero length, non-NA, integer or character columns of")
test(2.04, fdistinct(data.table(x=c(1L,1:2), y=1:3), on="x", mult="last", copy=NA), error="must be TRUE or FALSE")
d = data.table(x=1:2, y=1:2)
test(2.05, ans<-fdistinct(d, on="x", mult="last"), d)
test(2.06, intersect(addresses(ans), addresses(d)), character())
test(2.07, ans<-fdistinct(d, on="x", mult="last", copy=FALSE), d)
test(2.08, addresses(ans), addresses(d))
d = data.table(x=c(2:1,2L), y=1:3)
test(2.09, fdistinct(d, on="x", mult="first"), data.table(x=2:1, y=1:2))
test(2.10, fdistinct(d, on="x", mult="last"), data.table(x=1:2, y=2:3))
setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x", retGrp=TRUE)) ## retGrp=T index #4386
test(2.11, fdistinct(d, on="x", mult="first"), data.table(x=2:1, y=1:2))

test(3.01, hasindex(d, "x"))
test(3.02, hasindex(d, "x", retGrp=TRUE))
setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x")) ## retGrp=F index #4386
test(3.03, hasindex(d, "x"))
test(3.04, !hasindex(d, "x", retGrp=TRUE))
setattr(d, "index", NULL)
test(3.05, !hasindex(d, "x"))
test(3.06, !hasindex(d, "x", retGrp=TRUE))
setattr(d, "index", integer())
test(3.07, !hasindex(d, "x"))
test(3.08, !hasindex(d, "x", retGrp=TRUE))
rm(d)

# cbindlist

l = list(
@@ -57,7 +96,7 @@ attributes(d) = a[!names(a) %in% c("class",".internal.selfref","row.names")]
test(13.01, class(d), "list")
setDT(d)
test(13.02, key(d), "x")
# test(13.03, hasindex(d, "y") && hasindex(d, "z"))
test(13.03, hasindex(d, "y") && hasindex(d, "z"))
l = list(
data.table(id1=1:5, id2=5:1, id3=1:5, v1=1:5),
data.table(id4=5:1, id5=1:5, v2=1:5),
@@ -70,3 +109,36 @@ ans = cbindlist(l)
test(13.04, key(ans), "id1")
test(13.05, indices(ans), c("id1","id2","id3","id1__id2__id3","id6","id7","id9"))
test(13.06, ii, lapply(l, indices)) ## this tests that original indices have not been touched, shallow_duplicate in mergeIndexAttrib

## fdistinct, another round

dt = data.table(x =
c(74L, 103L, 158L, 250L, 56L, 248L, 260L, 182L, 174L, 17L, 57L,
49L, 189L, 106L, 212L, 137L, 198L, 273L, 105L, 214L, 258L, 59L,
180L, 35L, 74L, 107L, 4L, 106L, 240L, 94L, 133L, 165L, 136L,
52L, 228L, 184L, 219L, 30L, 200L, 114L, 226L, 178L, 216L, 153L,
146L, 218L, 7L, 132L, 202L, 191L, 132L, 237L, 121L, 68L, 20L,
28L, 87L, 143L, 183L, 112L, 252L, 81L, 127L, 92L, 179L, 71L,
132L, 211L, 24L, 241L, 94L, 231L, 96L, 92L, 131L, 246L, 238L,
108L, 214L, 265L, 120L, 196L, 110L, 90L, 209L, 56L, 196L, 34L,
68L, 40L, 66L, 17L, 177L, 241L, 215L, 220L, 126L, 113L, 223L,
167L, 181L, 98L, 75L, 273L, 175L, 59L, 36L, 132L, 255L, 165L,
269L, 202L, 99L, 119L, 41L, 4L, 197L, 29L, 123L, 177L, 273L,
137L, 134L, 48L, 208L, 125L, 141L, 58L, 63L, 164L, 159L, 22L,
10L, 177L, 256L, 165L, 155L, 145L, 271L, 140L, 188L, 166L, 66L,
71L, 201L, 125L, 49L, 206L, 29L, 238L, 170L, 154L, 91L, 125L,
138L, 50L, 146L, 21L, 77L, 59L, 79L, 247L, 123L, 215L, 243L,
114L, 18L, 93L, 200L, 93L, 174L, 232L, 236L, 108L, 105L, 247L,
178L, 204L, 167L, 249L, 81L, 53L, 244L, 139L, 242L, 53L, 209L,
200L, 260L, 151L, 196L, 107L, 28L, 256L, 78L, 163L, 31L, 232L,
88L, 216L, 74L, 61L, 143L, 74L, 50L, 143L, 155L, 36L, 71L, 198L,
265L, 28L, 210L, 261L, 226L, 85L, 179L, 263L, 263L, 94L, 73L,
46L, 89L, 141L, 255L, 141L, 71L, 13L, 115L, 235L, 96L, 37L, 103L,
174L, 108L, 190L, 190L, 153L, 119L, 125L, 85L, 160L, 251L, 40L,
115L, 59L, 118L, 37L, 127L, 260L, 210L, 257L, 130L, 166L, 134L,
30L, 69L, 138L, 103L, 258L, 145L, 88L, 77L, 217L, 194L, 46L,
18L, 208L, 171L, 47L, 18L, 30L, 105L, 47L, 83L)
)
ans = unique(dt, by="x")
test(301.01, data.table(x=unique(dt$x)), ans) ## OK
test(301.02, fdistinct(dt, on="x"), ans) ## force sort=TRUE for the moment
1 change: 1 addition & 0 deletions src/init.c
@@ -150,6 +150,7 @@ R_CallMethodDef callMethods[] = {
{"CconvertDate", (DL_FUNC)&convertDate, -1},
{"Cnotchin", (DL_FUNC)&notchin, -1},
{"Ccbindlist", (DL_FUNC) &cbindlist, -1},
{"CperhapsDataTableR", (DL_FUNC) &perhapsDataTableR, -1},
{"Cwarn_matrix_column_r", (DL_FUNC)&warn_matrix_column_r, -1},
{NULL, NULL, 0}
};