JuliaData · nalimilan · Feb 10, 2021 · Dec 21, 2020 · Jan 1, 2021 · Jan 21, 2021
diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md
@@ -122,11 +122,13 @@ It is allowed to mix single values and vectors if multiple transformations
 are requested. In this case single value will be repeated to match the length
 of columns specified by returned vectors.
 
-A separate task is spawned for each specified transformation, allowing for
-parallel operation when several transformations are requested and Julia was
-started with more than one thread. Passed transformation functions should
-therefore not modify global variables (i.e. they should be pure), or use
-locks to control parallel accesses.
+A separate task is spawned for each specified transformation; each transformation
+then spawns as many tasks as there are Julia threads, and splits the processing of
+groups across them (however, transformations with optimized implementations like `sum`
+and transformations that return multiple rows currently use a single task for all groups).
+This allows for parallel operation when Julia was started with more than one
+thread. Passed transformation functions should therefore not modify global variables
+(i.e. they should be pure), or use locks to control parallel accesses.
 
 To apply `function` to each row instead of whole columns, it can be wrapped in a
 `ByRow` struct. `cols` can be any column indexing syntax, in which case

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
@@ -145,10 +145,13 @@ const TRANSFORMATION_COMMON_RULES =
     `copycols=false`, a `SubDataFrame` is returned without copying columns.
 
     If a `GroupedDataFrame` is passed, a separate task is spawned for each
-    specified transformation, allowing for parallel operation when several
-    transformations are requested and Julia was started with more than one thread.
-    Passed transformation functions should therefore not modify global variables
-    (i.e. they should be pure), or use locks to control parallel accesses.
+    specified transformation; each transformation then spawns as many tasks
+    as there are Julia threads, and splits the processing of groups across them
+    (however, transformations with optimized implementations like `sum` and
+    transformations that return multiple rows currently use a single task for all groups).
+    This allows for parallel operation when Julia was started with more than one
+    thread. Passed transformation functions should therefore not modify global
+    variables (i.e. they should be pure), or use locks to control parallel accesses.
     In the future, parallelism may be extended to other cases, so this requirement
     also holds for `DataFrame` inputs.
     """

diff --git a/src/groupeddataframe/complextransforms.jl b/src/groupeddataframe/complextransforms.jl
@@ -191,14 +191,17 @@ function _combine_rows_with_first!(first::Union{NamedTuple, DataFrameRow},
     # Create up to one task per thread
     # This has lower overhead than creating one task per group,
     # but is optimal only if operations take roughly the same time for all groups
-    basesize = max(1, (len - 1) ÷ Threads.nthreads())
-    partitions = collect(_partition(2:len, basesize))
+    @static if VERSION >= v"1.4"
+        basesize = max(1, (len - 1) ÷ Threads.nthreads())
+        partitions = Iterators.partition(2:len, basesize)
+    else
+        partitions = (2:len,)
+    end
     widen_type_lock = ReentrantLock()
     outcolsref = Ref{NTuple{<:Any, AbstractVector}}(outcols)
     type_widened = fill(false, length(partitions))
-    tasks = similar(partitions, Task)
-    for tid in 1:length(partitions)
-        idx = partitions[tid]
+    tasks = Vector{Task}(undef, length(partitions))
+    for (tid, idx) in enumerate(partitions)
         tasks[tid] =
             @spawn _combine_rows_with_first_task!(tid, idx, outcols, outcolsref,
                                                   type_widened, widen_type_lock,
@@ -219,9 +222,8 @@ function _combine_rows_with_first!(first::Union{NamedTuple, DataFrameRow},
 
     # Copy data for any tasks that finished before others widened columns
     outcols = outcolsref[]
-    for tid in 1:length(partitions)
+    for (tid, idx) in enumerate(partitions)
         if type_widened[tid]
-            idx = partitions[tid]
             oldoutcols = fetch(tasks[tid])
             for k in 1:length(outcols)
                 if oldoutcols[k] !== outcols[k]