From 6278b8e3bf5efef07bdd4826b11fec536f271f19 Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Fri, 26 Jan 2018 14:22:20 -0800
Subject: [PATCH 01/15] WIP: create Accumulate iterator

This makes it possible to use the `collect` machinery for determining output types.

Also moves code out of multidimensional.jl (which is a bit of an odd place for it).

Still need to get slicing working.
---
 base/accumulate.jl       | 450 +++++++++++++++++++++++++++++++++++++++
 base/multidimensional.jl | 396 ----------------------------------
 base/sysimg.jl           |   1 +
 3 files changed, 451 insertions(+), 396 deletions(-)
 create mode 100644 base/accumulate.jl

diff --git a/base/accumulate.jl b/base/accumulate.jl
new file mode 100644
index 0000000000000..57284f6796813
--- /dev/null
+++ b/base/accumulate.jl
@@ -0,0 +1,450 @@
+# This file is a part of Julia. License is MIT: https://julialang.org/license
+
+"""
+    Accumulate(op[, v0], iter)
+
+An iterator which cumulatively applies the binary operation `op` to the iterator `iter`, in a similar manner
+to [`foldl`](@ref) but returning intermediate values.
+
+If an initial `v0` value is provided, then the first value of the iterator will be
+
+    op(v0, first(iter))
+
+Otherwise it will be
+
+    Base.reduce_first(op, first(iter))
+
+See also [`Base.reduce_first`](@ref) and [`accumulate`](@ref).
+"""
+struct Accumulate{O,V,I}
+    op::O
+    v0::V
+    iter::I
+end
+Accumulate(op, iter) = Accumulate(op, uninitialized, iter) # use `uninitialized` as a sentinel
+
+# the following is largely based on Generator objects
+Base.IteratorEltype(acc::Accumulate) = EltypeUnknown()
+Base.IteratorSize(acc::Accumulate) = Base.IteratorSize(acc.iter)
+
+length(acc::Accumulate) = length(acc.iter)
+size(acc::Accumulate)   = size(acc.iter)
+axes(acc::Accumulate)   = axes(acc.iter)
+ndims(acc::Accumulate)  = ndims(acc.iter)
+
+start(acc::Accumulate) = (@_inline_meta; (start(acc.iter), acc.v0))
+done(acc::Accumulate, accstate) = (@_inline_meta; done(acc.iter, accstate[1]))
+function next(acc::Accumulate, accstate)
+    @_inline_meta
+    itrstate, accval = accstate
+    val, itrstate = next(acc.iter, itrstate)
+    if accval === uninitialized
+        accval = reduce_first(acc.op, val)
+        return accval, (itrstate, accval)
+    else
+        accval = acc.op(accval, val)
+        return accval, (itrstate, accval)
+    end
+end
+
+accumulate(op, itr) = collect(Accumulate(op, itr))
+accumulate(op, v0, itr) = collect(Accumulate(op, v0, itr))
+
+
+# TODO
+# below was copied directly from old code in multidimensional.jl
+
+# see discussion in #18364 ... we try not to widen type of the resulting array
+# from cumsum or cumprod, but in some cases (+, Bool) we may not have a choice.
+rcum_promote_type(op, ::Type{T}, ::Type{S}) where {T,S<:Number} = promote_op(op, T, S)
+rcum_promote_type(op, ::Type{T}) where {T<:Number} = rcum_promote_type(op, T,T)
+rcum_promote_type(op, ::Type{T}) where {T} = T
+
+# handle sums of Vector{Bool} and similar.   it would be nice to handle
+# any AbstractArray here, but it's not clear how that would be possible
+rcum_promote_type(op, ::Type{Array{T,N}}) where {T,N} = Array{rcum_promote_type(op,T), N}
+
+# accumulate_pairwise slightly slower then accumulate, but more numerically
+# stable in certain situations (e.g. sums).
+# it does double the number of operations compared to accumulate,
+# though for cheap operations like + this does not have much impact (20%)
+function _accumulate_pairwise!(op::Op, c::AbstractVector{T}, v::AbstractVector, s, i1, n)::T where {T,Op}
+    @inbounds if n < 128
+        s_ = v[i1]
+        c[i1] = op(s, s_)
+        for i = i1+1:i1+n-1
+            s_ = op(s_, v[i])
+            c[i] = op(s, s_)
+        end
+    else
+        n2 = n >> 1
+        s_ = _accumulate_pairwise!(op, c, v, s, i1, n2)
+        s_ = op(s_, _accumulate_pairwise!(op, c, v, op(s, s_), i1+n2, n-n2))
+    end
+    return s_
+end
+
+function accumulate_pairwise!(op::Op, result::AbstractVector, v::AbstractVector) where Op
+    li = linearindices(v)
+    li != linearindices(result) && throw(DimensionMismatch("input and output array sizes and indices must match"))
+    n = length(li)
+    n == 0 && return result
+    i1 = first(li)
+    @inbounds result[i1] = v1 = v[i1]
+    n == 1 && return result
+    _accumulate_pairwise!(op, result, v, v1, i1+1, n-1)
+    return result
+end
+
+function accumulate_pairwise(op, v::AbstractVector{T}) where T
+    out = similar(v, rcum_promote_type(op, T))
+    return accumulate_pairwise!(op, out, v)
+end
+
+function cumsum!(out, v::AbstractVector, dim::Integer)
+    # we dispatch on the possibility of numerical stability issues
+    _cumsum!(out, v, dim, ArithmeticStyle(eltype(out)))
+end
+
+function _cumsum!(out, v, dim, ::ArithmeticRounds)
+    dim == 1 ? accumulate_pairwise!(+, out, v) : copyto!(out, v)
+end
+function _cumsum!(out, v, dim, ::ArithmeticUnknown)
+    _cumsum!(out, v, dim, ArithmeticRounds())
+end
+function _cumsum!(out, v, dim, ::ArithmeticStyle)
+    dim == 1 ? accumulate!(+, out, v) : copyto!(out, v)
+end
+
+"""
+    cumsum(A, dim::Integer)
+
+Cumulative sum along the dimension `dim`. See also [`cumsum!`](@ref)
+to use a preallocated output array, both for performance and to control the precision of the
+output (e.g. to avoid overflow).
+
+# Examples
+```jldoctest
+julia> a = [1 2 3; 4 5 6]
+2×3 Array{Int64,2}:
+ 1  2  3
+ 4  5  6
+
+julia> cumsum(a,1)
+2×3 Array{Int64,2}:
+ 1  2  3
+ 5  7  9
+
+julia> cumsum(a,2)
+2×3 Array{Int64,2}:
+ 1  3   6
+ 4  9  15
+```
+"""
+function cumsum(A::AbstractArray{T}, dim::Integer) where T
+    out = similar(A, rcum_promote_type(+, T))
+    cumsum!(out, A, dim)
+end
+
+"""
+    cumsum(x::AbstractVector)
+
+Cumulative sum a vector. See also [`cumsum!`](@ref)
+to use a preallocated output array, both for performance and to control the precision of the
+output (e.g. to avoid overflow).
+
+# Examples
+```jldoctest
+julia> cumsum([1, 1, 1])
+3-element Array{Int64,1}:
+ 1
+ 2
+ 3
+
+julia> cumsum([fill(1, 2) for i in 1:3])
+3-element Array{Array{Int64,1},1}:
+ [1, 1]
+ [2, 2]
+ [3, 3]
+```
+"""
+cumsum(x::AbstractVector) = cumsum(x, 1)
+
+"""
+    cumsum!(B, A, dim::Integer)
+
+Cumulative sum of `A` along the dimension `dim`, storing the result in `B`. See also [`cumsum`](@ref).
+"""
+cumsum!(B, A, dim::Integer) = accumulate!(+, B, A, dim)
+
+"""
+    cumsum!(y::AbstractVector, x::AbstractVector)
+
+Cumulative sum of a vector `x`, storing the result in `y`. See also [`cumsum`](@ref).
+"""
+cumsum!(y::AbstractVector, x::AbstractVector) = cumsum!(y, x, 1)
+
+"""
+    cumprod(A, dim::Integer)
+
+Cumulative product along the dimension `dim`. See also
+[`cumprod!`](@ref) to use a preallocated output array, both for performance and
+to control the precision of the output (e.g. to avoid overflow).
+
+# Examples
+```jldoctest
+julia> a = [1 2 3; 4 5 6]
+2×3 Array{Int64,2}:
+ 1  2  3
+ 4  5  6
+
+julia> cumprod(a,1)
+2×3 Array{Int64,2}:
+ 1   2   3
+ 4  10  18
+
+julia> cumprod(a,2)
+2×3 Array{Int64,2}:
+ 1   2    6
+ 4  20  120
+```
+"""
+cumprod(A::AbstractArray, dim::Integer) = accumulate(*, A, dim)
+
+"""
+    cumprod(x::AbstractVector)
+
+Cumulative product of a vector. See also
+[`cumprod!`](@ref) to use a preallocated output array, both for performance and
+to control the precision of the output (e.g. to avoid overflow).
+
+# Examples
+```jldoctest
+julia> cumprod(fill(1//2, 3))
+3-element Array{Rational{Int64},1}:
+ 1//2
+ 1//4
+ 1//8
+
+julia> cumprod([fill(1//3, 2, 2) for i in 1:3])
+3-element Array{Array{Rational{Int64},2},1}:
+ Rational{Int64}[1//3 1//3; 1//3 1//3]
+ Rational{Int64}[2//9 2//9; 2//9 2//9]
+ Rational{Int64}[4//27 4//27; 4//27 4//27]
+```
+"""
+cumprod(x::AbstractVector) = cumprod(x, 1)
+
+"""
+    cumprod!(B, A, dim::Integer)
+
+Cumulative product of `A` along the dimension `dim`, storing the result in `B`.
+See also [`cumprod`](@ref).
+"""
+cumprod!(B, A, dim::Integer) = accumulate!(*, B, A, dim)
+
+"""
+    cumprod!(y::AbstractVector, x::AbstractVector)
+
+Cumulative product of a vector `x`, storing the result in `y`.
+See also [`cumprod`](@ref).
+"""
+cumprod!(y::AbstractVector, x::AbstractVector) = cumprod!(y, x, 1)
+
+"""
+    accumulate(op, A, dim::Integer)
+
+Cumulative operation `op` along the dimension `dim`. See also
+[`accumulate!`](@ref) to use a preallocated output array, both for performance and
+to control the precision of the output (e.g. to avoid overflow). For common operations
+there are specialized variants of `accumulate`, see:
+[`cumsum`](@ref), [`cumprod`](@ref)
+
+# Examples
+```jldoctest
+julia> accumulate(+, fill(1, 3, 3), 1)
+3×3 Array{Int64,2}:
+ 1  1  1
+ 2  2  2
+ 3  3  3
+
+julia> accumulate(+, fill(1, 3, 3), 2)
+3×3 Array{Int64,2}:
+ 1  2  3
+ 1  2  3
+ 1  2  3
+```
+"""
+function accumulate(op, A, dim::Integer)
+    out = similar(A, rcum_promote_type(op, eltype(A)))
+    accumulate!(op, out, A, dim)
+end
+
+"""
+    accumulate(op, x::AbstractVector)
+
+Cumulative operation `op` on a vector. See also
+[`accumulate!`](@ref) to use a preallocated output array, both for performance and
+to control the precision of the output (e.g. to avoid overflow). For common operations
+there are specialized variants of `accumulate`, see:
+[`cumsum`](@ref), [`cumprod`](@ref)
+
+# Examples
+```jldoctest
+julia> accumulate(+, [1,2,3])
+3-element Array{Int64,1}:
+ 1
+ 3
+ 6
+
+julia> accumulate(*, [1,2,3])
+3-element Array{Int64,1}:
+ 1
+ 2
+ 6
+```
+"""
+accumulate(op, x::AbstractVector) = accumulate(op, x, 1)
+
+"""
+    accumulate!(op, B, A, dim::Integer)
+
+Cumulative operation `op` on `A` along the dimension `dim`, storing the result in `B`.
+See also [`accumulate`](@ref).
+
+# Examples
+```jldoctest
+julia> A = [1 2; 3 4];
+
+julia> B = [0 0; 0 0];
+
+julia> accumulate!(-, B, A, 1);
+
+julia> B
+2×2 Array{Int64,2}:
+  1   2
+ -2  -2
+
+julia> accumulate!(-, B, A, 2);
+
+julia> B
+2×2 Array{Int64,2}:
+ 1  -1
+ 3  -1
+```
+"""
+function accumulate!(op, B, A, dim::Integer)
+    dim > 0 || throw(ArgumentError("dim must be a positive integer"))
+    inds_t = axes(A)
+    axes(B) == inds_t || throw(DimensionMismatch("shape of B must match A"))
+    dim > ndims(A) && return copyto!(B, A)
+    isempty(inds_t[dim]) && return B
+    if dim == 1
+        # We can accumulate to a temporary variable, which allows
+        # register usage and will be slightly faster
+        ind1 = inds_t[1]
+        @inbounds for I in CartesianIndices(tail(inds_t))
+            tmp = convert(eltype(B), A[first(ind1), I])
+            B[first(ind1), I] = tmp
+            for i_1 = first(ind1)+1:last(ind1)
+                tmp = op(tmp, A[i_1, I])
+                B[i_1, I] = tmp
+            end
+        end
+    else
+        R1 = CartesianIndices(axes(A)[1:dim-1])   # not type-stable
+        R2 = CartesianIndices(axes(A)[dim+1:end])
+        _accumulate!(op, B, A, R1, inds_t[dim], R2) # use function barrier
+    end
+    return B
+end
+
+"""
+    accumulate!(op, y, x::AbstractVector)
+
+Cumulative operation `op` on a vector `x`, storing the result in `y`.
+See also [`accumulate`](@ref).
+
+# Examples
+``jldoctest
+julia> x = [1, 0, 2, 0, 3];
+
+julia> y = [0, 0, 0, 0, 0];
+
+julia> accumulate!(+, y, x);
+
+julia> y
+5-element Array{Int64,1}:
+ 1
+ 1
+ 3
+ 3
+ 6
+```
+"""
+function accumulate!(op::Op, y, x::AbstractVector) where Op
+    isempty(x) && return y
+    v1 = first(x)
+    _accumulate1!(op, y, v1, x, 1)
+end
+
+@noinline function _accumulate!(op, B, A, R1, ind, R2)
+    # Copy the initial element in each 1d vector along dimension `dim`
+    ii = first(ind)
+    @inbounds for J in R2, I in R1
+        B[I, ii, J] = A[I, ii, J]
+    end
+    # Accumulate
+    @inbounds for J in R2, i in first(ind)+1:last(ind), I in R1
+        B[I, i, J] = op(B[I, i-1, J], A[I, i, J])
+    end
+    B
+end
+
+"""
+    accumulate(op, v0, x::AbstractVector)
+
+Like `accumulate`, but using a starting element `v0`. The first entry of the result will be
+`op(v0, first(A))`.
+
+# Examples
+```jldoctest
+julia> accumulate(+, 100, [1,2,3])
+3-element Array{Int64,1}:
+ 101
+ 103
+ 106
+
+julia> accumulate(min, 0, [1,2,-1])
+3-element Array{Int64,1}:
+  0
+  0
+ -1
+```
+"""
+function accumulate(op, v0, x::AbstractVector)
+    T = rcum_promote_type(op, typeof(v0), eltype(x))
+    out = similar(x, T)
+    accumulate!(op, out, v0, x)
+end
+
+function accumulate!(op, y, v0, x::AbstractVector)
+    isempty(x) && return y
+    v1 = op(v0, first(x))
+    _accumulate1!(op, y, v1, x, 1)
+end
+
+function _accumulate1!(op, B, v1, A::AbstractVector, dim::Integer)
+    dim > 0 || throw(ArgumentError("dim must be a positive integer"))
+    inds = linearindices(A)
+    inds == linearindices(B) || throw(DimensionMismatch("linearindices of A and B don't match"))
+    dim > 1 && return copyto!(B, A)
+    i1 = inds[1]
+    cur_val = v1
+    B[i1] = cur_val
+    @inbounds for i in inds[2:end]
+        cur_val = op(cur_val, A[i])
+        B[i] = cur_val
+    end
+    return B
+end
diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 824deb9c65898..c0d1f3638d798 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -688,402 +688,6 @@ _iterable(v) = Iterators.repeated(v)
     end
 end
 
-##
-
-# see discussion in #18364 ... we try not to widen type of the resulting array
-# from cumsum or cumprod, but in some cases (+, Bool) we may not have a choice.
-rcum_promote_type(op, ::Type{T}, ::Type{S}) where {T,S<:Number} = promote_op(op, T, S)
-rcum_promote_type(op, ::Type{T}) where {T<:Number} = rcum_promote_type(op, T,T)
-rcum_promote_type(op, ::Type{T}) where {T} = T
-
-# handle sums of Vector{Bool} and similar.   it would be nice to handle
-# any AbstractArray here, but it's not clear how that would be possible
-rcum_promote_type(op, ::Type{Array{T,N}}) where {T,N} = Array{rcum_promote_type(op,T), N}
-
-# accumulate_pairwise slightly slower then accumulate, but more numerically
-# stable in certain situations (e.g. sums).
-# it does double the number of operations compared to accumulate,
-# though for cheap operations like + this does not have much impact (20%)
-function _accumulate_pairwise!(op::Op, c::AbstractVector{T}, v::AbstractVector, s, i1, n)::T where {T,Op}
-    @inbounds if n < 128
-        s_ = v[i1]
-        c[i1] = op(s, s_)
-        for i = i1+1:i1+n-1
-            s_ = op(s_, v[i])
-            c[i] = op(s, s_)
-        end
-    else
-        n2 = n >> 1
-        s_ = _accumulate_pairwise!(op, c, v, s, i1, n2)
-        s_ = op(s_, _accumulate_pairwise!(op, c, v, op(s, s_), i1+n2, n-n2))
-    end
-    return s_
-end
-
-function accumulate_pairwise!(op::Op, result::AbstractVector, v::AbstractVector) where Op
-    li = linearindices(v)
-    li != linearindices(result) && throw(DimensionMismatch("input and output array sizes and indices must match"))
-    n = length(li)
-    n == 0 && return result
-    i1 = first(li)
-    @inbounds result[i1] = v1 = v[i1]
-    n == 1 && return result
-    _accumulate_pairwise!(op, result, v, v1, i1+1, n-1)
-    return result
-end
-
-function accumulate_pairwise(op, v::AbstractVector{T}) where T
-    out = similar(v, rcum_promote_type(op, T))
-    return accumulate_pairwise!(op, out, v)
-end
-
-function cumsum!(out, v::AbstractVector, dim::Integer)
-    # we dispatch on the possibility of numerical stability issues
-    _cumsum!(out, v, dim, ArithmeticStyle(eltype(out)))
-end
-
-function _cumsum!(out, v, dim, ::ArithmeticRounds)
-    dim == 1 ? accumulate_pairwise!(+, out, v) : copyto!(out, v)
-end
-function _cumsum!(out, v, dim, ::ArithmeticUnknown)
-    _cumsum!(out, v, dim, ArithmeticRounds())
-end
-function _cumsum!(out, v, dim, ::ArithmeticStyle)
-    dim == 1 ? accumulate!(+, out, v) : copyto!(out, v)
-end
-
-"""
-    cumsum(A, dim::Integer)
-
-Cumulative sum along the dimension `dim`. See also [`cumsum!`](@ref)
-to use a preallocated output array, both for performance and to control the precision of the
-output (e.g. to avoid overflow).
-
-# Examples
-```jldoctest
-julia> a = [1 2 3; 4 5 6]
-2×3 Array{Int64,2}:
- 1  2  3
- 4  5  6
-
-julia> cumsum(a,1)
-2×3 Array{Int64,2}:
- 1  2  3
- 5  7  9
-
-julia> cumsum(a,2)
-2×3 Array{Int64,2}:
- 1  3   6
- 4  9  15
-```
-"""
-function cumsum(A::AbstractArray{T}, dim::Integer) where T
-    out = similar(A, rcum_promote_type(+, T))
-    cumsum!(out, A, dim)
-end
-
-"""
-    cumsum(x::AbstractVector)
-
-Cumulative sum a vector. See also [`cumsum!`](@ref)
-to use a preallocated output array, both for performance and to control the precision of the
-output (e.g. to avoid overflow).
-
-# Examples
-```jldoctest
-julia> cumsum([1, 1, 1])
-3-element Array{Int64,1}:
- 1
- 2
- 3
-
-julia> cumsum([fill(1, 2) for i in 1:3])
-3-element Array{Array{Int64,1},1}:
- [1, 1]
- [2, 2]
- [3, 3]
-```
-"""
-cumsum(x::AbstractVector) = cumsum(x, 1)
-
-"""
-    cumsum!(B, A, dim::Integer)
-
-Cumulative sum of `A` along the dimension `dim`, storing the result in `B`. See also [`cumsum`](@ref).
-"""
-cumsum!(B, A, dim::Integer) = accumulate!(+, B, A, dim)
-
-"""
-    cumsum!(y::AbstractVector, x::AbstractVector)
-
-Cumulative sum of a vector `x`, storing the result in `y`. See also [`cumsum`](@ref).
-"""
-cumsum!(y::AbstractVector, x::AbstractVector) = cumsum!(y, x, 1)
-
-"""
-    cumprod(A, dim::Integer)
-
-Cumulative product along the dimension `dim`. See also
-[`cumprod!`](@ref) to use a preallocated output array, both for performance and
-to control the precision of the output (e.g. to avoid overflow).
-
-# Examples
-```jldoctest
-julia> a = [1 2 3; 4 5 6]
-2×3 Array{Int64,2}:
- 1  2  3
- 4  5  6
-
-julia> cumprod(a,1)
-2×3 Array{Int64,2}:
- 1   2   3
- 4  10  18
-
-julia> cumprod(a,2)
-2×3 Array{Int64,2}:
- 1   2    6
- 4  20  120
-```
-"""
-cumprod(A::AbstractArray, dim::Integer) = accumulate(*, A, dim)
-
-"""
-    cumprod(x::AbstractVector)
-
-Cumulative product of a vector. See also
-[`cumprod!`](@ref) to use a preallocated output array, both for performance and
-to control the precision of the output (e.g. to avoid overflow).
-
-# Examples
-```jldoctest
-julia> cumprod(fill(1//2, 3))
-3-element Array{Rational{Int64},1}:
- 1//2
- 1//4
- 1//8
-
-julia> cumprod([fill(1//3, 2, 2) for i in 1:3])
-3-element Array{Array{Rational{Int64},2},1}:
- Rational{Int64}[1//3 1//3; 1//3 1//3]
- Rational{Int64}[2//9 2//9; 2//9 2//9]
- Rational{Int64}[4//27 4//27; 4//27 4//27]
-```
-"""
-cumprod(x::AbstractVector) = cumprod(x, 1)
-
-"""
-    cumprod!(B, A, dim::Integer)
-
-Cumulative product of `A` along the dimension `dim`, storing the result in `B`.
-See also [`cumprod`](@ref).
-"""
-cumprod!(B, A, dim::Integer) = accumulate!(*, B, A, dim)
-
-"""
-    cumprod!(y::AbstractVector, x::AbstractVector)
-
-Cumulative product of a vector `x`, storing the result in `y`.
-See also [`cumprod`](@ref).
-"""
-cumprod!(y::AbstractVector, x::AbstractVector) = cumprod!(y, x, 1)
-
-"""
-    accumulate(op, A, dim::Integer)
-
-Cumulative operation `op` along the dimension `dim`. See also
-[`accumulate!`](@ref) to use a preallocated output array, both for performance and
-to control the precision of the output (e.g. to avoid overflow). For common operations
-there are specialized variants of `accumulate`, see:
-[`cumsum`](@ref), [`cumprod`](@ref)
-
-# Examples
-```jldoctest
-julia> accumulate(+, fill(1, 3, 3), 1)
-3×3 Array{Int64,2}:
- 1  1  1
- 2  2  2
- 3  3  3
-
-julia> accumulate(+, fill(1, 3, 3), 2)
-3×3 Array{Int64,2}:
- 1  2  3
- 1  2  3
- 1  2  3
-```
-"""
-function accumulate(op, A, dim::Integer)
-    out = similar(A, rcum_promote_type(op, eltype(A)))
-    accumulate!(op, out, A, dim)
-end
-
-"""
-    accumulate(op, x::AbstractVector)
-
-Cumulative operation `op` on a vector. See also
-[`accumulate!`](@ref) to use a preallocated output array, both for performance and
-to control the precision of the output (e.g. to avoid overflow). For common operations
-there are specialized variants of `accumulate`, see:
-[`cumsum`](@ref), [`cumprod`](@ref)
-
-# Examples
-```jldoctest
-julia> accumulate(+, [1,2,3])
-3-element Array{Int64,1}:
- 1
- 3
- 6
-
-julia> accumulate(*, [1,2,3])
-3-element Array{Int64,1}:
- 1
- 2
- 6
-```
-"""
-accumulate(op, x::AbstractVector) = accumulate(op, x, 1)
-
-"""
-    accumulate!(op, B, A, dim::Integer)
-
-Cumulative operation `op` on `A` along the dimension `dim`, storing the result in `B`.
-See also [`accumulate`](@ref).
-
-# Examples
-```jldoctest
-julia> A = [1 2; 3 4];
-
-julia> B = [0 0; 0 0];
-
-julia> accumulate!(-, B, A, 1);
-
-julia> B
-2×2 Array{Int64,2}:
-  1   2
- -2  -2
-
-julia> accumulate!(-, B, A, 2);
-
-julia> B
-2×2 Array{Int64,2}:
- 1  -1
- 3  -1
-```
-"""
-function accumulate!(op, B, A, dim::Integer)
-    dim > 0 || throw(ArgumentError("dim must be a positive integer"))
-    inds_t = axes(A)
-    axes(B) == inds_t || throw(DimensionMismatch("shape of B must match A"))
-    dim > ndims(A) && return copyto!(B, A)
-    isempty(inds_t[dim]) && return B
-    if dim == 1
-        # We can accumulate to a temporary variable, which allows
-        # register usage and will be slightly faster
-        ind1 = inds_t[1]
-        @inbounds for I in CartesianIndices(tail(inds_t))
-            tmp = convert(eltype(B), A[first(ind1), I])
-            B[first(ind1), I] = tmp
-            for i_1 = first(ind1)+1:last(ind1)
-                tmp = op(tmp, A[i_1, I])
-                B[i_1, I] = tmp
-            end
-        end
-    else
-        R1 = CartesianIndices(axes(A)[1:dim-1])   # not type-stable
-        R2 = CartesianIndices(axes(A)[dim+1:end])
-        _accumulate!(op, B, A, R1, inds_t[dim], R2) # use function barrier
-    end
-    return B
-end
-
-"""
-    accumulate!(op, y, x::AbstractVector)
-
-Cumulative operation `op` on a vector `x`, storing the result in `y`.
-See also [`accumulate`](@ref).
-
-# Examples
-``jldoctest
-julia> x = [1, 0, 2, 0, 3];
-
-julia> y = [0, 0, 0, 0, 0];
-
-julia> accumulate!(+, y, x);
-
-julia> y
-5-element Array{Int64,1}:
- 1
- 1
- 3
- 3
- 6
-```
-"""
-function accumulate!(op::Op, y, x::AbstractVector) where Op
-    isempty(x) && return y
-    v1 = first(x)
-    _accumulate1!(op, y, v1, x, 1)
-end
-
-@noinline function _accumulate!(op, B, A, R1, ind, R2)
-    # Copy the initial element in each 1d vector along dimension `dim`
-    ii = first(ind)
-    @inbounds for J in R2, I in R1
-        B[I, ii, J] = A[I, ii, J]
-    end
-    # Accumulate
-    @inbounds for J in R2, i in first(ind)+1:last(ind), I in R1
-        B[I, i, J] = op(B[I, i-1, J], A[I, i, J])
-    end
-    B
-end
-
-"""
-    accumulate(op, v0, x::AbstractVector)
-
-Like `accumulate`, but using a starting element `v0`. The first entry of the result will be
-`op(v0, first(A))`.
-
-# Examples
-```jldoctest
-julia> accumulate(+, 100, [1,2,3])
-3-element Array{Int64,1}:
- 101
- 103
- 106
-
-julia> accumulate(min, 0, [1,2,-1])
-3-element Array{Int64,1}:
-  0
-  0
- -1
-```
-"""
-function accumulate(op, v0, x::AbstractVector)
-    T = rcum_promote_type(op, typeof(v0), eltype(x))
-    out = similar(x, T)
-    accumulate!(op, out, v0, x)
-end
-
-function accumulate!(op, y, v0, x::AbstractVector)
-    isempty(x) && return y
-    v1 = op(v0, first(x))
-    _accumulate1!(op, y, v1, x, 1)
-end
-
-function _accumulate1!(op, B, v1, A::AbstractVector, dim::Integer)
-    dim > 0 || throw(ArgumentError("dim must be a positive integer"))
-    inds = linearindices(A)
-    inds == linearindices(B) || throw(DimensionMismatch("linearindices of A and B don't match"))
-    dim > 1 && return copyto!(B, A)
-    i1 = inds[1]
-    cur_val = v1
-    B[i1] = cur_val
-    @inbounds for i in inds[2:end]
-        cur_val = op(cur_val, A[i])
-        B[i] = cur_val
-    end
-    return B
-end
 
 ### from abstractarray.jl
 
diff --git a/base/sysimg.jl b/base/sysimg.jl
index 6f0d481464645..a708973325e93 100644
--- a/base/sysimg.jl
+++ b/base/sysimg.jl
@@ -360,6 +360,7 @@ const (∛)=cbrt
 INCLUDE_STATE = 2 # include = _include (from lines above)
 
 # reduction along dims
+include("accumulate.jl")
 include("reducedim.jl")  # macros in this file relies on string.jl
 
 # basic data structures

From 8bd8cae26285d665c6acc59753d63da0ffead556 Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Tue, 30 Jan 2018 11:34:13 -0800
Subject: [PATCH 02/15] finish revamp

---
 NEWS.md            |   6 +-
 base/accumulate.jl | 630 ++++++++++++++++++++++++++-------------------
 base/deprecated.jl |   2 +
 base/reduce.jl     |  31 ++-
 test/arrayops.jl   |  17 +-
 5 files changed, 402 insertions(+), 284 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 9223278bead5d..c061e6659508c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -902,6 +902,9 @@ Deprecated or removed
     argument instead of defaulting to using the first dimension unless there is only
     one dimension ([#24684], [#25457]).
 
+  * `cumsum` and `cumprod` have the same promotion behaviour for small integer types as `sum` and
+    `prod`. Use `accumulate(+, x)`/`accumulate(*,x)` to get non-promoting behaviour ([#25766]).
+
   * The `sum_kbn` and `cumsum_kbn` functions have been moved to the
     [KahanSummation](https://github.com/JuliaMath/KahanSummation.jl) package ([#24869]).
 
@@ -1260,4 +1263,5 @@ Command-line option changes
 [#25622]: https://github.com/JuliaLang/julia/issues/25622
 [#25634]: https://github.com/JuliaLang/julia/issues/25634
 [#25654]: https://github.com/JuliaLang/julia/issues/25654
-[#25655]: https://github.com/JuliaLang/julia/issues/25655
\ No newline at end of file
+[#25655]: https://github.com/JuliaLang/julia/issues/25655
+[#25766]: https://github.com/JuliaLang/julia/issues/25766
\ No newline at end of file
diff --git a/base/accumulate.jl b/base/accumulate.jl
index 57284f6796813..1dd1907edf7ed 100644
--- a/base/accumulate.jl
+++ b/base/accumulate.jl
@@ -38,83 +38,367 @@ function next(acc::Accumulate, accstate)
     @_inline_meta
     itrstate, accval = accstate
     val, itrstate = next(acc.iter, itrstate)
-    if accval === uninitialized
-        accval = reduce_first(acc.op, val)
-        return accval, (itrstate, accval)
+
+    accval2 = reduce_first(acc.op, accval, val)
+    return accval2, (itrstate, accval2)
+end
+
+
+"""
+    accumulate(op[, v0], itr)
+
+Cumulative apply binary operation `op` on an iterable `itr`. If an initial `v0` value is
+provided, then the first value of the iterator will be
+
+    op(v0, first(iter))
+
+Otherwise it will use
+
+    Base.reduce_first(op, first(iter))
+
+See also: [`accumulate!`](@ref) to use a preallocated output array, and [`cumsum`](@ref),
+[`cumprod`](@ref) for specialized versions.
+
+# Examples
+```jldoctest
+julia> accumulate(+, [1,2,3])
+3-element Array{Int64,1}:
+ 1
+ 3
+ 6
+
+julia> accumulate(*, [1,2,3])
+3-element Array{Int64,1}:
+ 1
+ 2
+ 6
+
+julia> accumulate(+, 100, [1,2,3])
+3-element Array{Int64,1}:
+ 101
+ 103
+ 106
+
+julia> accumulate(min, 0, [1,2,-1])
+3-element Array{Int64,1}:
+  0
+  0
+ -1
+```
+"""
+accumulate(op, itr) = accumulate(op, uninitialized, itr)
+accumulate(op, v0, itr) = accumulate(op, v0, itr, IteratorSize(itr))
+accumulate(op, v0, itr, ::Union{SizeUnknown,HasLength,IsInfinite,HasShape{1}}) =
+    collect(Accumulate(op, v0, itr))
+
+
+"""
+    accumulate!(dest, op[, v0] itr)
+
+Cumulative operation `op` on an iterator `itr`, storing the result in `dest`.
+See also [`accumulate`](@ref).
+
+# Examples
+``jldoctest
+julia> x = [1, 0, 2, 0, 3];
+
+julia> y = [0, 0, 0, 0, 0];
+
+julia> accumulate!(+, y, x);
+
+julia> y
+5-element Array{Int64,1}:
+ 1
+ 1
+ 3
+ 3
+ 6
+```
+"""
+accumulate!(dest, op, x) = accumulate!(dest, op, uninitialized, x)
+function accumulate!(dest, op, v0, x)
+    length(dest) == length(x) || throw(DimensionMismatch("input and output array sizes and indices must match"))
+    copyto!(dest, Accumulate(op, v0, x))
+end
+
+
+"""
+    accumulate(op[, v0], A, dim::Integer)
+
+Cumulative operation `op` along the dimension `dim`. See also
+[`accumulate!`](@ref) to use a preallocated output array. For common operations
+there are specialized variants of `accumulate`, see:
+[`cumsum`](@ref), [`cumprod`](@ref)
+
+# Examples
+```jldoctest
+julia> accumulate(+, fill(1, 3, 3), 1)
+3×3 Array{Int64,2}:
+ 1  1  1
+ 2  2  2
+ 3  3  3
+
+julia> accumulate(+, fill(1, 3, 3), 2)
+3×3 Array{Int64,2}:
+ 1  2  3
+ 1  2  3
+ 1  2  3
+```
+"""
+accumulate(op, X, dim::Int) = accumulate(op, uninitialized, X, dim)
+
+# split into (head, slice, tail) dimension iterators
+@inline function split_dimensions(X,dim::Integer)
+    inds = axes(X)
+    if dim > length(inds)
+        (CartesianIndices(inds), CartesianIndices(), CartesianIndices())
     else
-        accval = acc.op(accval, val)
-        return accval, (itrstate, accval)
+        (CartesianIndices(inds[1:dim-1]), inds[dim], CartesianIndices(inds[dim+1:end]))
     end
 end
 
-accumulate(op, itr) = collect(Accumulate(op, itr))
-accumulate(op, v0, itr) = collect(Accumulate(op, v0, itr))
-
-
-# TODO
-# below was copied directly from old code in multidimensional.jl
-
-# see discussion in #18364 ... we try not to widen type of the resulting array
-# from cumsum or cumprod, but in some cases (+, Bool) we may not have a choice.
-rcum_promote_type(op, ::Type{T}, ::Type{S}) where {T,S<:Number} = promote_op(op, T, S)
-rcum_promote_type(op, ::Type{T}) where {T<:Number} = rcum_promote_type(op, T,T)
-rcum_promote_type(op, ::Type{T}) where {T} = T
-
-# handle sums of Vector{Bool} and similar.   it would be nice to handle
-# any AbstractArray here, but it's not clear how that would be possible
-rcum_promote_type(op, ::Type{Array{T,N}}) where {T,N} = Array{rcum_promote_type(op,T), N}
-
-# accumulate_pairwise slightly slower then accumulate, but more numerically
-# stable in certain situations (e.g. sums).
-# it does double the number of operations compared to accumulate,
-# though for cheap operations like + this does not have much impact (20%)
-function _accumulate_pairwise!(op::Op, c::AbstractVector{T}, v::AbstractVector, s, i1, n)::T where {T,Op}
-    @inbounds if n < 128
-        s_ = v[i1]
-        c[i1] = op(s, s_)
-        for i = i1+1:i1+n-1
-            s_ = op(s_, v[i])
-            c[i] = op(s, s_)
-        end
-    else
-        n2 = n >> 1
-        s_ = _accumulate_pairwise!(op, c, v, s, i1, n2)
-        s_ = op(s_, _accumulate_pairwise!(op, c, v, op(s, s_), i1+n2, n-n2))
+function accumulate(op, v0, X, dim::Integer)
+    dim > 0 || throw(ArgumentError("dim must be a positive integer"))
+    if length(X) == 0
+        # fallback on collect machinery
+        return collect(Accumulate(op, v0, X))
     end
-    return s_
+    indH, indD, indT = split_dimensions(X,dim)
+
+    # TODO: this could be much nicer with the new iteration protocol
+    # unroll loops to get first element
+    sH = start(indH)
+    @assert !done(indH, sH)
+    iH,sH = next(indH, sH)
+
+    sD = start(indD)
+    @assert !done(indD, sD)
+    iD,sD = next(indD, sD)
+    pD = iD
+
+    sT = start(indT)
+    @assert !done(indT, sT)
+    iT,sT = next(indT, sT)
+
+    # first element
+    accv = reduce_first(op, v0, X[iH, iD, iT])
+    dest = similar(X, typeof(accv))
+    i = first(linearindices(dest))
+    dest[i] = accv
+
+    return _accumulate!(dest, i, op, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD)
 end
 
-function accumulate_pairwise!(op::Op, result::AbstractVector, v::AbstractVector) where Op
-    li = linearindices(v)
-    li != linearindices(result) && throw(DimensionMismatch("input and output array sizes and indices must match"))
-    n = length(li)
-    n == 0 && return result
-    i1 = first(li)
-    @inbounds result[i1] = v1 = v[i1]
-    n == 1 && return result
-    _accumulate_pairwise!(op, result, v, v1, i1+1, n-1)
-    return result
+
+"""
+    accumulate!(dest, op, A, dim::Integer)
+
+Cumulative operation `op` on `A` along the dimension `dim`, storing the result in `B`.
+See also [`accumulate`](@ref).
+
+# Examples
+```jldoctest
+julia> A = [1 2; 3 4];
+
+julia> B = [0 0; 0 0];
+
+julia> accumulate!(-, B, A, 1);
+
+julia> B
+2×2 Array{Int64,2}:
+  1   2
+ -2  -2
+
+julia> accumulate!(-, B, A, 2);
+
+julia> B
+2×2 Array{Int64,2}:
+ 1  -1
+ 3  -1
+```
+"""
+accumulate!(dest, op, X, dim::Integer) = accumulate!(dest, op, uninitialized, X, dim::Integer)
+
+function accumulate!(dest, op, v0, X, dim::Integer)
+    dim > 0 || throw(ArgumentError("dim must be a positive integer"))
+    axes(A) == axes(B) || throw(DimensionMismatch("shape of source and destination must match"))
+    indH, indD, indT = split_dimensions(X,dim)
+
+    sH = start(indH)
+    @assert !done(indH, sH)
+    iH,sH = next(indH, sH)
+
+    sD = start(indD)
+    @assert !done(indD, sD)
+    iD,sD = next(indD, sD)
+    pD = iD
+
+    sT = start(indT)
+    @assert !done(indT, sT)
+    iT,sT = next(indT, sT)
+
+    return _accumulate!(dest, first(linearindices(dest))-1, op, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD, false)
 end
 
-function accumulate_pairwise(op, v::AbstractVector{T}) where T
-    out = similar(v, rcum_promote_type(op, T))
-    return accumulate_pairwise!(op, out, v)
+function _accumulate!(dest::AbstractArray{T}, i, op, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD, widen=true) where {T}
+    while true
+        if done(indH,sH)
+            if done(indD, sD)
+                if done(indT,sT)
+                    return dest
+                else
+                    iT, sT = next(indT, sT)
+                end
+                iD, sD = next(indD, start(indD))
+                pD = iD
+            else
+                pD = iD
+                iD, sD = next(indD, sD)
+            end
+            iH, sH = next(indH, start(indH))
+        else
+            iH, sH = next(indH, sH)
+        end
+
+        if iD == pD
+            accv = reduce_first(op, v0, X[iH, iD, iT])
+        else
+            paccv = isempty(indH) ? accv : @inbounds dest[iH,pD,iT]
+            accv = op(paccv, X[iH,iD,iT])
+        end
+
+        S = typeof(accv)
+        if !widen || S === T || S <: T
+            @inbounds dest[i+=1] = accv
+        else
+            R = promote_typejoin(T, S)
+            new = similar(dest, R)
+            copyto!(new,1,dest,1,i)
+            @inbounds new[i+=1] = accv
+            return _accumulate!(dest, i, op, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD)
+        end
+    end
 end
 
-function cumsum!(out, v::AbstractVector, dim::Integer)
-    # we dispatch on the possibility of numerical stability issues
-    _cumsum!(out, v, dim, ArithmeticStyle(eltype(out)))
+"""
+    accumulate_pairwise(op[, v0], itr)
+
+Similar to [`accumulate`](@ref), but performs accumulation using a divide-and-conquer
+approach, which can be more numerically accurate for certain operations, such as
+[`+`](@ref).
+
+It involves roughly double the number of `op` calls, but for cheap operations like `+`
+this does not have much impact (approximately 20%).
+"""
+accumulate_pairwise(op, itr) = accumulate_pairwise(op, uninitialized, itr)
+accumulate_pairwise(op, v0, itr) =  accumulate_pairwise(op, v0, itr, IteratorSize(itr))
+function accumulate_pairwise(op, v0, itr, ::Union{HasLength,HasShape{1}})
+    i = start(itr)
+    if done(itr, i)
+        return collect(Accumulate(op, v0, itr))
+    end
+    v1, i = next(itr, i)
+    accv = reduce_first(op, v0, v1)
+
+    dest = _similar_for(1:1, typeof(accv), itr, IteratorSize(itr))
+    ld = linearindices(dest)
+    j = first(ld)
+    dest[j] = accv
+    m = last(ld) - j
+    if done(itr, i)
+        return dest
+    end
+    out, _ = _accumulate_pairwise!(dest, op, itr, accv, i, j, m, true)
+    return out
+end
+
+accumulate_pairwise!(dest, op, itr) = accumulate_pairwise!(dest, op, uninitialized, itr)
+function accumulate_pairwise!(dest, op, v0, itr)
+    linearindices(dest) == linearindices(itr) || throw(DimensionMismatch("length of source and destination must match"))
+
+    i = start(itr)
+    if done(itr, i)
+        return dest
+    end
+    v1, i = next(itr, i)
+    accv = reduce_first(op, v0, v1)
+
+    ld = linearindices(dest)
+    j = first(ld)
+    dest[j] = accv
+    m = last(ld) - j
+    if done(itr, i)
+        return dest
+    end
+    out, _ = _accumulate_pairwise!(dest, op, itr, accv, i, j, m, false)
+    return out
 end
 
-function _cumsum!(out, v, dim, ::ArithmeticRounds)
-    dim == 1 ? accumulate_pairwise!(+, out, v) : copyto!(out, v)
+# TODO figure out promotion machinery
+# collect_pairwise
+function _accumulate_pairwise!(dest::AbstractArray{T}, op, itr, accv, i, j, m, widen) where {T}
+    if m < 128
+        # m >= 1
+        w, i = next(itr, i)
+        y = op(accv, w)
+        S = typeof(y)
+        if !widen || S === T || S<:T
+            @inbounds dest[j+=1] = y
+            return _accumulate_pairwise_small!(dest, op, itr, accv, w, i, j, m-1, widen)
+        else
+            R = promote_typejoin(T, S)
+            new = similar(dest, R)
+            copyto!(new,1,dest,1,j)
+            @inbounds new[j+=1] = y
+            return _accumulate_pairwise_small!(new, op, itr, accv, w, i, j, m-1, widen)
+        end
+    else
+        m1 = m >> 1
+        m2 = m - m1
+        dest, taccv1, i, j = _accumulate_pairwise!(dest, op, itr, accv, i, j, m1, widen)
+        dest, taccv2, i, j = _accumulate_pairwise!(dest, op, itr, op(accv, taccv1), i, j, m2, widen)
+        taccv = op(taccv1, taccv2)
+    end
+    return dest, taccv, i, j
 end
-function _cumsum!(out, v, dim, ::ArithmeticUnknown)
-    _cumsum!(out, v, dim, ArithmeticRounds())
+
+function _accumulate_pairwise_small!(dest::AbstractArray{T}, op, itr, accv, w, i, j, m, widen) where T
+    if m == 0
+        return dest, reduce_first(op, w), i, j
+    end
+    v, i = next(itr, i)
+    x = op(w, v)
+    while true
+        y = op(accv, x)
+        S = typeof(y)
+        if !widen || S === T || S<:T
+            @inbounds dest[j+=1] = y
+            m -= 1
+        else
+            R = promote_typejoin(T, S)
+            new = similar(dest, R)
+            copyto!(new,1,dest,1,j)
+            @inbounds new[j+=1] = y
+            return _accumulate_pairwise_small!(new, op, itr, accv, x, i, j, m-1, widen)
+        end
+        if m == 0
+            return dest, x, i, j
+        end
+
+        v, i = next(itr, i)
+        x = op(x, v)
+    end
 end
-function _cumsum!(out, v, dim, ::ArithmeticStyle)
-    dim == 1 ? accumulate!(+, out, v) : copyto!(out, v)
+
+
+
+function cumsum!(out, v::AbstractVector{T}) where T
+    # we dispatch on the possibility of numerical accuracy issues
+    cumsum!(out, v, ArithmeticStyle(T))
 end
+cumsum!(out, v::AbstractVector, ::ArithmeticRounds)  = accumulate_pairwise!(out, +, v)
+cumsum!(out, v::AbstractVector, ::ArithmeticUnknown) = accumulate_pairwise!(out, +, v)
+cumsum!(out, v::AbstractVector, ::ArithmeticStyle)   = accumulate!(out, +, v)
 
 """
     cumsum(A, dim::Integer)
@@ -141,10 +425,7 @@ julia> cumsum(a,2)
  4  9  15
 ```
 """
-function cumsum(A::AbstractArray{T}, dim::Integer) where T
-    out = similar(A, rcum_promote_type(+, T))
-    cumsum!(out, A, dim)
-end
+cumsum(A, dim::Integer) = accumulate(add_sum, A, dim)
 
 """
     cumsum(x::AbstractVector)
@@ -168,21 +449,27 @@ julia> cumsum([fill(1, 2) for i in 1:3])
  [3, 3]
 ```
 """
-cumsum(x::AbstractVector) = cumsum(x, 1)
+function cumsum(v::AbstractVector{T}) where T
+    # we dispatch on the possibility of numerical accuracy issues
+    cumsum(v, ArithmeticStyle(T))
+end
+cumsum(v::AbstractVector, ::ArithmeticRounds)  = accumulate_pairwise(add_sum, v)
+cumsum(v::AbstractVector, ::ArithmeticUnknown) = accumulate_pairwise(add_sum, v)
+cumsum(v::AbstractVector, ::ArithmeticStyle)   = accumulate(add_sum, v)
 
 """
     cumsum!(B, A, dim::Integer)
 
 Cumulative sum of `A` along the dimension `dim`, storing the result in `B`. See also [`cumsum`](@ref).
 """
-cumsum!(B, A, dim::Integer) = accumulate!(+, B, A, dim)
+cumsum!(dest, A, dim::Integer) = accumulate!(dest, +, A, dim)
 
 """
     cumsum!(y::AbstractVector, x::AbstractVector)
 
 Cumulative sum of a vector `x`, storing the result in `y`. See also [`cumsum`](@ref).
 """
-cumsum!(y::AbstractVector, x::AbstractVector) = cumsum!(y, x, 1)
+cumsum!(dest, itr) = accumulate!(dest, +, src)
 
 """
     cumprod(A, dim::Integer)
@@ -209,7 +496,7 @@ julia> cumprod(a,2)
  4  20  120
 ```
 """
-cumprod(A::AbstractArray, dim::Integer) = accumulate(*, A, dim)
+cumprod(A, dim::Integer) = accumulate(mul_prod, A, dim)
 
 """
     cumprod(x::AbstractVector)
@@ -233,7 +520,8 @@ julia> cumprod([fill(1//3, 2, 2) for i in 1:3])
  Rational{Int64}[4//27 4//27; 4//27 4//27]
 ```
 """
-cumprod(x::AbstractVector) = cumprod(x, 1)
+cumprod(x) = accumulate(mul_prod, x)
+cumprod(x::AbstractVector) = accumulate(mul_prod, x)
 
 """
     cumprod!(B, A, dim::Integer)
@@ -241,7 +529,7 @@ cumprod(x::AbstractVector) = cumprod(x, 1)
 Cumulative product of `A` along the dimension `dim`, storing the result in `B`.
 See also [`cumprod`](@ref).
 """
-cumprod!(B, A, dim::Integer) = accumulate!(*, B, A, dim)
+cumprod!(dest, A, dim::Integer) = accumulate!(dest, *, A, dim)
 
 """
     cumprod!(y::AbstractVector, x::AbstractVector)
@@ -249,202 +537,4 @@ cumprod!(B, A, dim::Integer) = accumulate!(*, B, A, dim)
 Cumulative product of a vector `x`, storing the result in `y`.
 See also [`cumprod`](@ref).
 """
-cumprod!(y::AbstractVector, x::AbstractVector) = cumprod!(y, x, 1)
-
-"""
-    accumulate(op, A, dim::Integer)
-
-Cumulative operation `op` along the dimension `dim`. See also
-[`accumulate!`](@ref) to use a preallocated output array, both for performance and
-to control the precision of the output (e.g. to avoid overflow). For common operations
-there are specialized variants of `accumulate`, see:
-[`cumsum`](@ref), [`cumprod`](@ref)
-
-# Examples
-```jldoctest
-julia> accumulate(+, fill(1, 3, 3), 1)
-3×3 Array{Int64,2}:
- 1  1  1
- 2  2  2
- 3  3  3
-
-julia> accumulate(+, fill(1, 3, 3), 2)
-3×3 Array{Int64,2}:
- 1  2  3
- 1  2  3
- 1  2  3
-```
-"""
-function accumulate(op, A, dim::Integer)
-    out = similar(A, rcum_promote_type(op, eltype(A)))
-    accumulate!(op, out, A, dim)
-end
-
-"""
-    accumulate(op, x::AbstractVector)
-
-Cumulative operation `op` on a vector. See also
-[`accumulate!`](@ref) to use a preallocated output array, both for performance and
-to control the precision of the output (e.g. to avoid overflow). For common operations
-there are specialized variants of `accumulate`, see:
-[`cumsum`](@ref), [`cumprod`](@ref)
-
-# Examples
-```jldoctest
-julia> accumulate(+, [1,2,3])
-3-element Array{Int64,1}:
- 1
- 3
- 6
-
-julia> accumulate(*, [1,2,3])
-3-element Array{Int64,1}:
- 1
- 2
- 6
-```
-"""
-accumulate(op, x::AbstractVector) = accumulate(op, x, 1)
-
-"""
-    accumulate!(op, B, A, dim::Integer)
-
-Cumulative operation `op` on `A` along the dimension `dim`, storing the result in `B`.
-See also [`accumulate`](@ref).
-
-# Examples
-```jldoctest
-julia> A = [1 2; 3 4];
-
-julia> B = [0 0; 0 0];
-
-julia> accumulate!(-, B, A, 1);
-
-julia> B
-2×2 Array{Int64,2}:
-  1   2
- -2  -2
-
-julia> accumulate!(-, B, A, 2);
-
-julia> B
-2×2 Array{Int64,2}:
- 1  -1
- 3  -1
-```
-"""
-function accumulate!(op, B, A, dim::Integer)
-    dim > 0 || throw(ArgumentError("dim must be a positive integer"))
-    inds_t = axes(A)
-    axes(B) == inds_t || throw(DimensionMismatch("shape of B must match A"))
-    dim > ndims(A) && return copyto!(B, A)
-    isempty(inds_t[dim]) && return B
-    if dim == 1
-        # We can accumulate to a temporary variable, which allows
-        # register usage and will be slightly faster
-        ind1 = inds_t[1]
-        @inbounds for I in CartesianIndices(tail(inds_t))
-            tmp = convert(eltype(B), A[first(ind1), I])
-            B[first(ind1), I] = tmp
-            for i_1 = first(ind1)+1:last(ind1)
-                tmp = op(tmp, A[i_1, I])
-                B[i_1, I] = tmp
-            end
-        end
-    else
-        R1 = CartesianIndices(axes(A)[1:dim-1])   # not type-stable
-        R2 = CartesianIndices(axes(A)[dim+1:end])
-        _accumulate!(op, B, A, R1, inds_t[dim], R2) # use function barrier
-    end
-    return B
-end
-
-"""
-    accumulate!(op, y, x::AbstractVector)
-
-Cumulative operation `op` on a vector `x`, storing the result in `y`.
-See also [`accumulate`](@ref).
-
-# Examples
-``jldoctest
-julia> x = [1, 0, 2, 0, 3];
-
-julia> y = [0, 0, 0, 0, 0];
-
-julia> accumulate!(+, y, x);
-
-julia> y
-5-element Array{Int64,1}:
- 1
- 1
- 3
- 3
- 6
-```
-"""
-function accumulate!(op::Op, y, x::AbstractVector) where Op
-    isempty(x) && return y
-    v1 = first(x)
-    _accumulate1!(op, y, v1, x, 1)
-end
-
-@noinline function _accumulate!(op, B, A, R1, ind, R2)
-    # Copy the initial element in each 1d vector along dimension `dim`
-    ii = first(ind)
-    @inbounds for J in R2, I in R1
-        B[I, ii, J] = A[I, ii, J]
-    end
-    # Accumulate
-    @inbounds for J in R2, i in first(ind)+1:last(ind), I in R1
-        B[I, i, J] = op(B[I, i-1, J], A[I, i, J])
-    end
-    B
-end
-
-"""
-    accumulate(op, v0, x::AbstractVector)
-
-Like `accumulate`, but using a starting element `v0`. The first entry of the result will be
-`op(v0, first(A))`.
-
-# Examples
-```jldoctest
-julia> accumulate(+, 100, [1,2,3])
-3-element Array{Int64,1}:
- 101
- 103
- 106
-
-julia> accumulate(min, 0, [1,2,-1])
-3-element Array{Int64,1}:
-  0
-  0
- -1
-```
-"""
-function accumulate(op, v0, x::AbstractVector)
-    T = rcum_promote_type(op, typeof(v0), eltype(x))
-    out = similar(x, T)
-    accumulate!(op, out, v0, x)
-end
-
-function accumulate!(op, y, v0, x::AbstractVector)
-    isempty(x) && return y
-    v1 = op(v0, first(x))
-    _accumulate1!(op, y, v1, x, 1)
-end
-
-function _accumulate1!(op, B, v1, A::AbstractVector, dim::Integer)
-    dim > 0 || throw(ArgumentError("dim must be a positive integer"))
-    inds = linearindices(A)
-    inds == linearindices(B) || throw(DimensionMismatch("linearindices of A and B don't match"))
-    dim > 1 && return copyto!(B, A)
-    i1 = inds[1]
-    cur_val = v1
-    B[i1] = cur_val
-    @inbounds for i in inds[2:end]
-        cur_val = op(cur_val, A[i])
-        B[i] = cur_val
-    end
-    return B
-end
+cumprod!(dest, itr) = accumulate!(dest, *, itr)
diff --git a/base/deprecated.jl b/base/deprecated.jl
index fca93469dacd0..eff0e210e4301 100644
--- a/base/deprecated.jl
+++ b/base/deprecated.jl
@@ -1407,6 +1407,8 @@ end
 
 @deprecate which(s::Symbol) which(Main, s)
 
+@deprecate accumulate!(op, dest::AbstractArray, args...) accumulate!(dest, op, args...)
+
 # END 0.7 deprecations
 
 # BEGIN 1.0 deprecations
diff --git a/base/reduce.jl b/base/reduce.jl
index b60f82d817ae8..cb5b8fdf870d6 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -21,6 +21,12 @@ integers are promoted to `Int`/`UInt`.
 add_sum(x,y) = x + y
 add_sum(x::SmallSigned,y::SmallSigned) = Int(x) + Int(y)
 add_sum(x::SmallUnsigned,y::SmallUnsigned) = UInt(x) + UInt(y)
+add_sum(X::AbstractArray,Y::AbstractArray) = (promote_shape(X,Y); broadcast(add_sum, X, Y))
+
+add_sum(x) = +(x)
+add_sum(x::SmallSigned) = Int(x)
+add_sum(x::SmallUnsigned) = UInt(x)
+add_sum(X::AbstractArray) = broadcast(add_sum, X)
 
 """
     Base.mul_prod(x,y)
@@ -32,6 +38,10 @@ mul_prod(x,y) = x * y
 mul_prod(x::SmallSigned,y::SmallSigned) = Int(x) * Int(y)
 mul_prod(x::SmallUnsigned,y::SmallUnsigned) = UInt(x) * UInt(y)
 
+mul_prod(x) = *(x)
+mul_prod(x::SmallSigned) = Int(x)
+mul_prod(x::SmallUnsigned) = UInt(x)
+
 ## foldl && mapfoldl
 
 @noinline function mapfoldl_impl(f, op, v0, itr, i)
@@ -301,20 +311,21 @@ The value to be returned when calling [`reduce`](@ref), [`foldl`](@ref`) or
 `x`. This value may also used to initialise the recursion, so that `reduce(op, [x, y])`
 may call `op(reduce_first(op, x), y)`.
 
-The default is `x` for most types. The main purpose is to ensure type stability, so
+The default is `x` for most operators. The main purpose is to ensure type stability, so
 additional methods should only be defined for cases where `op` gives a result with
 different types than its inputs.
 """
 reduce_first(op, x) = x
-reduce_first(::typeof(+), x::Bool) = Int(x)
-reduce_first(::typeof(*), x::Char) = string(x)
-
-reduce_first(::typeof(add_sum), x) = reduce_first(+, x)
-reduce_first(::typeof(add_sum), x::SmallSigned)   = Int(x)
-reduce_first(::typeof(add_sum), x::SmallUnsigned) = UInt(x)
-reduce_first(::typeof(mul_prod), x) = reduce_first(*, x)
-reduce_first(::typeof(mul_prod), x::SmallSigned)   = Int(x)
-reduce_first(::typeof(mul_prod), x::SmallUnsigned) = UInt(x)
+
+# for existing operators we can use unary forms which already handle promotion correctly
+reduce_first(::typeof(+), x) = +(x)
+reduce_first(::typeof(*), x) = *(x)
+reduce_first(::typeof(add_sum), x) = add_sum(x)
+reduce_first(::typeof(mul_prod), x) = mul_prod(x)
+
+# called by accumulate, which uses `uninitialized` as a sentinel value
+reduce_first(op, v0, x) = op(v0,x)
+reduce_first(op, ::Uninitialized, x) = reduce_first(op, x)
 
 """
     Base.mapreduce_first(f, op, x)
diff --git a/test/arrayops.jl b/test/arrayops.jl
index 16b27ac769707..1aa83dde5225f 100644
--- a/test/arrayops.jl
+++ b/test/arrayops.jl
@@ -1984,9 +1984,10 @@ end
 # issue #18363
 @test_throws DimensionMismatch cumsum!([0,0], 1:4)
 @test cumsum(Any[])::Vector{Any} == Any[]
-@test cumsum(Any[1, 2.3])::Vector{Any} == [1, 3.3] == cumsum(Real[1, 2.3])::Vector{Real}
+@test cumsum(Any[1, 2.3]) == [1, 3.3] == cumsum(Real[1, 2.3])::Vector{Real}
 @test cumsum([true,true,true]) == [1,2,3]
-@test cumsum(0x00:0xff)[end] === 0x80 # overflow
+@test cumsum(0x00:0xff)[end] === UInt(255*(255+1)÷2) # no overflow
+@test accumulate(+, 0x00:0xff)[end] === 0x80         # overflow
 @test cumsum([[true], [true], [false]])::Vector{Vector{Int}} == [[1], [2], [2]]
 
 #issue #18336
@@ -2169,7 +2170,7 @@ end
 
             if eltype(arr) in [Int, Float64] # eltype of out easy
                 out = similar(arr)
-                @test accumulate!(op, out, arr) ≈ accumulate_arr
+                @test accumulate!(out, op, arr) ≈ accumulate_arr
                 @test out ≈ accumulate_arr
             end
         end
@@ -2195,6 +2196,8 @@ end
 
 Base.ArithmeticStyle(::Type{F21666{T}}) where {T} = T()
 Base.:+(x::F, y::F) where {F <: F21666} = F(x.x + y.x)
+Base.:+(x::F) where {F <: F21666} = x
+
 Float64(x::F21666) = Float64(x.x)
 @testset "Exactness of cumsum # 21666" begin
     # test that cumsum uses more stable algorithm
@@ -2307,3 +2310,11 @@ end
     inds_b = Base.Indices{1}([1:3])
     @test Base.promote_shape(inds_a, inds_b) == Base.promote_shape(inds_b, inds_a)
 end
+
+@testset "accumulate on non-numeric types" begin
+    @test accumulate((acc, x) -> acc+x[1], 0, [(1,2), (3,4), (5,6)]) == [1, 4, 9]
+    @test accumulate(*, ['a', 'b']) == ["a", "ab"]
+    @inferred accumulate(*, String[])
+    @test accumulate(*, ['a' 'b'; 'c' 'd'], 1) == ["a" "b"; "ac" "bd"]
+    @test accumulate(*, ['a' 'b'; 'c' 'd'], 2) == ["a" "ab"; "c" "cd"]
+end

From 7ab878e15cb205c2340475f85cd929d4132ac2d5 Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Tue, 30 Jan 2018 21:38:38 -0800
Subject: [PATCH 03/15] unroll accumulate! to avoid type instability

---
 base/accumulate.jl | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/base/accumulate.jl b/base/accumulate.jl
index 1dd1907edf7ed..505ba05a2956a 100644
--- a/base/accumulate.jl
+++ b/base/accumulate.jl
@@ -117,8 +117,34 @@ julia> y
 """
 accumulate!(dest, op, x) = accumulate!(dest, op, uninitialized, x)
 function accumulate!(dest, op, v0, x)
-    length(dest) == length(x) || throw(DimensionMismatch("input and output array sizes and indices must match"))
-    copyto!(dest, Accumulate(op, v0, x))
+    src = Accumulate(op, v0, x)
+
+    # this is essentially `copyto!`, but unrolled to deal with potential type instability in first state
+    # hopefully won't be necessary with better Union splitting
+    destiter = eachindex(dest)
+
+    sstate0 = start(src)
+    dstate = start(destiter)
+    sdone = done(src, sstate0)
+    ddone = done(destiter, dstate)
+
+    sdone && ddone && return dest
+    (sdone || ddone) && throw(DimensionMismatch("input and output array sizes and indices must match"))
+
+    i, dstate = next(destiter, dstate)
+    s, sstate = next(src, sstate0)
+    while true
+        @inbounds dest[i] = s
+
+        sdone = done(src, sstate)
+        ddone = done(destiter, dstate)
+
+        sdone && ddone && return dest
+        (sdone || ddone) && throw(DimensionMismatch("input and output array sizes and indices must match"))
+
+        i, dstate = next(destiter, dstate)
+        s, sstate = next(src, sstate)
+    end
 end
 
 

From 3581f97eff4d1cb4882f3d5047ff916ddf21a115 Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Wed, 31 Jan 2018 09:25:36 -0800
Subject: [PATCH 04/15] revert argument order change

---
 base/accumulate.jl | 58 +++++++++++++++++++++++-----------------------
 base/deprecated.jl |  2 --
 test/arrayops.jl   |  2 +-
 3 files changed, 30 insertions(+), 32 deletions(-)

diff --git a/base/accumulate.jl b/base/accumulate.jl
index 505ba05a2956a..fafd0cfb6b9cc 100644
--- a/base/accumulate.jl
+++ b/base/accumulate.jl
@@ -93,7 +93,7 @@ accumulate(op, v0, itr, ::Union{SizeUnknown,HasLength,IsInfinite,HasShape{1}}) =
 
 
 """
-    accumulate!(dest, op[, v0] itr)
+    accumulate!(op, dest,[, v0] itr)
 
 Cumulative operation `op` on an iterator `itr`, storing the result in `dest`.
 See also [`accumulate`](@ref).
@@ -115,8 +115,8 @@ julia> y
  6
 ```
 """
-accumulate!(dest, op, x) = accumulate!(dest, op, uninitialized, x)
-function accumulate!(dest, op, v0, x)
+accumulate!(op, dest, x) = accumulate!(op, dest, uninitialized, x)
+function accumulate!(op, dest, v0, x)
     src = Accumulate(op, v0, x)
 
     # this is essentially `copyto!`, but unrolled to deal with potential type instability in first state
@@ -212,12 +212,12 @@ function accumulate(op, v0, X, dim::Integer)
     i = first(linearindices(dest))
     dest[i] = accv
 
-    return _accumulate!(dest, i, op, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD)
+    return _accumulate!(op, dest, i, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD)
 end
 
 
 """
-    accumulate!(dest, op, A, dim::Integer)
+    accumulate!(op, dest, A, dim::Integer)
 
 Cumulative operation `op` on `A` along the dimension `dim`, storing the result in `B`.
 See also [`accumulate`](@ref).
@@ -228,7 +228,7 @@ julia> A = [1 2; 3 4];
 
 julia> B = [0 0; 0 0];
 
-julia> accumulate!(-, B, A, 1);
+julia> accumulate!(B, -, A, 1);
 
 julia> B
 2×2 Array{Int64,2}:
@@ -243,9 +243,9 @@ julia> B
  3  -1
 ```
 """
-accumulate!(dest, op, X, dim::Integer) = accumulate!(dest, op, uninitialized, X, dim::Integer)
+accumulate!(op, dest, X, dim::Integer) = accumulate!(op, dest, uninitialized, X, dim::Integer)
 
-function accumulate!(dest, op, v0, X, dim::Integer)
+function accumulate!(op, dest, v0, X, dim::Integer)
     dim > 0 || throw(ArgumentError("dim must be a positive integer"))
     axes(A) == axes(B) || throw(DimensionMismatch("shape of source and destination must match"))
     indH, indD, indT = split_dimensions(X,dim)
@@ -263,10 +263,10 @@ function accumulate!(dest, op, v0, X, dim::Integer)
     @assert !done(indT, sT)
     iT,sT = next(indT, sT)
 
-    return _accumulate!(dest, first(linearindices(dest))-1, op, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD, false)
+    return _accumulate!(op, dest, first(linearindices(dest))-1, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD, false)
 end
 
-function _accumulate!(dest::AbstractArray{T}, i, op, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD, widen=true) where {T}
+function _accumulate!(op, dest::AbstractArray{T}, i, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD, widen=true) where {T}
     while true
         if done(indH,sH)
             if done(indD, sD)
@@ -301,7 +301,7 @@ function _accumulate!(dest::AbstractArray{T}, i, op, v0, X, indH, indD, indT, sH
             new = similar(dest, R)
             copyto!(new,1,dest,1,i)
             @inbounds new[i+=1] = accv
-            return _accumulate!(dest, i, op, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD)
+            return _accumulate!(op, dest, i, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD)
         end
     end
 end
@@ -334,12 +334,12 @@ function accumulate_pairwise(op, v0, itr, ::Union{HasLength,HasShape{1}})
     if done(itr, i)
         return dest
     end
-    out, _ = _accumulate_pairwise!(dest, op, itr, accv, i, j, m, true)
+    out, _ = _accumulate_pairwise!(op, dest, itr, accv, i, j, m, true)
     return out
 end
 
-accumulate_pairwise!(dest, op, itr) = accumulate_pairwise!(dest, op, uninitialized, itr)
-function accumulate_pairwise!(dest, op, v0, itr)
+accumulate_pairwise!(op, dest, itr) = accumulate_pairwise!(op, dest, uninitialized, itr)
+function accumulate_pairwise!(op, dest, v0, itr)
     linearindices(dest) == linearindices(itr) || throw(DimensionMismatch("length of source and destination must match"))
 
     i = start(itr)
@@ -356,13 +356,13 @@ function accumulate_pairwise!(dest, op, v0, itr)
     if done(itr, i)
         return dest
     end
-    out, _ = _accumulate_pairwise!(dest, op, itr, accv, i, j, m, false)
+    out, _ = _accumulate_pairwise!(op, dest, itr, accv, i, j, m, false)
     return out
 end
 
 # TODO figure out promotion machinery
 # collect_pairwise
-function _accumulate_pairwise!(dest::AbstractArray{T}, op, itr, accv, i, j, m, widen) where {T}
+function _accumulate_pairwise!(op, dest::AbstractArray{T}, itr, accv, i, j, m, widen) where {T}
     if m < 128
         # m >= 1
         w, i = next(itr, i)
@@ -370,25 +370,25 @@ function _accumulate_pairwise!(dest::AbstractArray{T}, op, itr, accv, i, j, m, w
         S = typeof(y)
         if !widen || S === T || S<:T
             @inbounds dest[j+=1] = y
-            return _accumulate_pairwise_small!(dest, op, itr, accv, w, i, j, m-1, widen)
+            return _accumulate_pairwise_small!(op, dest, itr, accv, w, i, j, m-1, widen)
         else
             R = promote_typejoin(T, S)
             new = similar(dest, R)
             copyto!(new,1,dest,1,j)
             @inbounds new[j+=1] = y
-            return _accumulate_pairwise_small!(new, op, itr, accv, w, i, j, m-1, widen)
+            return _accumulate_pairwise_small!(op, new, itr, accv, w, i, j, m-1, widen)
         end
     else
         m1 = m >> 1
         m2 = m - m1
-        dest, taccv1, i, j = _accumulate_pairwise!(dest, op, itr, accv, i, j, m1, widen)
-        dest, taccv2, i, j = _accumulate_pairwise!(dest, op, itr, op(accv, taccv1), i, j, m2, widen)
+        dest, taccv1, i, j = _accumulate_pairwise!(op, dest, itr, accv, i, j, m1, widen)
+        dest, taccv2, i, j = _accumulate_pairwise!(op, dest, itr, op(accv, taccv1), i, j, m2, widen)
         taccv = op(taccv1, taccv2)
     end
     return dest, taccv, i, j
 end
 
-function _accumulate_pairwise_small!(dest::AbstractArray{T}, op, itr, accv, w, i, j, m, widen) where T
+function _accumulate_pairwise_small!(op, dest::AbstractArray{T}, itr, accv, w, i, j, m, widen) where T
     if m == 0
         return dest, reduce_first(op, w), i, j
     end
@@ -405,7 +405,7 @@ function _accumulate_pairwise_small!(dest::AbstractArray{T}, op, itr, accv, w, i
             new = similar(dest, R)
             copyto!(new,1,dest,1,j)
             @inbounds new[j+=1] = y
-            return _accumulate_pairwise_small!(new, op, itr, accv, x, i, j, m-1, widen)
+            return _accumulate_pairwise_small!(op, new, itr, accv, x, i, j, m-1, widen)
         end
         if m == 0
             return dest, x, i, j
@@ -422,9 +422,9 @@ function cumsum!(out, v::AbstractVector{T}) where T
     # we dispatch on the possibility of numerical accuracy issues
     cumsum!(out, v, ArithmeticStyle(T))
 end
-cumsum!(out, v::AbstractVector, ::ArithmeticRounds)  = accumulate_pairwise!(out, +, v)
-cumsum!(out, v::AbstractVector, ::ArithmeticUnknown) = accumulate_pairwise!(out, +, v)
-cumsum!(out, v::AbstractVector, ::ArithmeticStyle)   = accumulate!(out, +, v)
+cumsum!(out, v::AbstractVector, ::ArithmeticRounds)  = accumulate_pairwise!(+, out, v)
+cumsum!(out, v::AbstractVector, ::ArithmeticUnknown) = accumulate_pairwise!(+, out, v)
+cumsum!(out, v::AbstractVector, ::ArithmeticStyle)   = accumulate!(+, out, v)
 
 """
     cumsum(A, dim::Integer)
@@ -488,14 +488,14 @@ cumsum(v::AbstractVector, ::ArithmeticStyle)   = accumulate(add_sum, v)
 
 Cumulative sum of `A` along the dimension `dim`, storing the result in `B`. See also [`cumsum`](@ref).
 """
-cumsum!(dest, A, dim::Integer) = accumulate!(dest, +, A, dim)
+cumsum!(dest, A, dim::Integer) = accumulate!(+, dest, A, dim)
 
 """
     cumsum!(y::AbstractVector, x::AbstractVector)
 
 Cumulative sum of a vector `x`, storing the result in `y`. See also [`cumsum`](@ref).
 """
-cumsum!(dest, itr) = accumulate!(dest, +, src)
+cumsum!(dest, itr) = accumulate!(+, dest, src)
 
 """
     cumprod(A, dim::Integer)
@@ -555,7 +555,7 @@ cumprod(x::AbstractVector) = accumulate(mul_prod, x)
 Cumulative product of `A` along the dimension `dim`, storing the result in `B`.
 See also [`cumprod`](@ref).
 """
-cumprod!(dest, A, dim::Integer) = accumulate!(dest, *, A, dim)
+cumprod!(dest, A, dim::Integer) = accumulate!(*, dest, A, dim)
 
 """
     cumprod!(y::AbstractVector, x::AbstractVector)
@@ -563,4 +563,4 @@ cumprod!(dest, A, dim::Integer) = accumulate!(dest, *, A, dim)
 Cumulative product of a vector `x`, storing the result in `y`.
 See also [`cumprod`](@ref).
 """
-cumprod!(dest, itr) = accumulate!(dest, *, itr)
+cumprod!(dest, itr) = accumulate!(*, dest, itr)
diff --git a/base/deprecated.jl b/base/deprecated.jl
index eff0e210e4301..fca93469dacd0 100644
--- a/base/deprecated.jl
+++ b/base/deprecated.jl
@@ -1407,8 +1407,6 @@ end
 
 @deprecate which(s::Symbol) which(Main, s)
 
-@deprecate accumulate!(op, dest::AbstractArray, args...) accumulate!(dest, op, args...)
-
 # END 0.7 deprecations
 
 # BEGIN 1.0 deprecations
diff --git a/test/arrayops.jl b/test/arrayops.jl
index 1aa83dde5225f..775f9b8a85792 100644
--- a/test/arrayops.jl
+++ b/test/arrayops.jl
@@ -2170,7 +2170,7 @@ end
 
             if eltype(arr) in [Int, Float64] # eltype of out easy
                 out = similar(arr)
-                @test accumulate!(out, op, arr) ≈ accumulate_arr
+                @test accumulate!(op, out, arr) ≈ accumulate_arr
                 @test out ≈ accumulate_arr
             end
         end

From 512fbcdae0243f3311b74cf491c594c53920513b Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Wed, 31 Jan 2018 09:27:07 -0800
Subject: [PATCH 05/15] fix dimension check

---
 base/accumulate.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/accumulate.jl b/base/accumulate.jl
index fafd0cfb6b9cc..20363362f73c7 100644
--- a/base/accumulate.jl
+++ b/base/accumulate.jl
@@ -247,7 +247,7 @@ accumulate!(op, dest, X, dim::Integer) = accumulate!(op, dest, uninitialized, X,
 
 function accumulate!(op, dest, v0, X, dim::Integer)
     dim > 0 || throw(ArgumentError("dim must be a positive integer"))
-    axes(A) == axes(B) || throw(DimensionMismatch("shape of source and destination must match"))
+    axes(dest) == axes(X) || throw(DimensionMismatch("shape of source and destination must match"))
     indH, indD, indT = split_dimensions(X,dim)
 
     sH = start(indH)

From c5f5d47eba00577bb31059a89d50bfbf20cbe784 Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Wed, 31 Jan 2018 10:02:01 -0800
Subject: [PATCH 06/15] Use destination type to determine output of cumsum! and
 cumprod!

---
 base/accumulate.jl | 34 ++++++++++++++++++++++++++--------
 base/reduce.jl     |  1 +
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/base/accumulate.jl b/base/accumulate.jl
index 20363362f73c7..afe11dd97913b 100644
--- a/base/accumulate.jl
+++ b/base/accumulate.jl
@@ -416,15 +416,33 @@ function _accumulate_pairwise_small!(op, dest::AbstractArray{T}, itr, accv, w, i
     end
 end
 
+"""
+    Base.ConvertOp{T}(op)(x,y)
+
+An operator which converts `x` and `y` to type `T` before performing the `op`.
+
+The main purpose is for use in [`cumsum!`](@ref) and [`cumprod!`](@ref), where `T` is determined by the output array.
+"""
+struct ConvertOp{T,O} <: Function
+    op::O
+end
+ConvertOp{T}(op::O) where {T,O} = ConvertOp{T,O}(op)
+(c::ConvertOp{T})(x,y) where {T} = c.op(convert(T,x),convert(T,y))
+reduce_first(c::ConvertOp{T},x) where {T} = reduce_first(c.op, convert(T,x))
+
+
 
 
-function cumsum!(out, v::AbstractVector{T}) where T
+function cumsum!(out::AbstractVector, v::AbstractVector{T}) where T
     # we dispatch on the possibility of numerical accuracy issues
     cumsum!(out, v, ArithmeticStyle(T))
 end
-cumsum!(out, v::AbstractVector, ::ArithmeticRounds)  = accumulate_pairwise!(+, out, v)
-cumsum!(out, v::AbstractVector, ::ArithmeticUnknown) = accumulate_pairwise!(+, out, v)
-cumsum!(out, v::AbstractVector, ::ArithmeticStyle)   = accumulate!(+, out, v)
+cumsum!(out::AbstractVector{T}, v::AbstractVector, ::ArithmeticRounds) where {T} =
+    accumulate_pairwise!(ConvertOp{T}(+), out, v)
+cumsum!(out::AbstractVector{T}, v::AbstractVector, ::ArithmeticUnknown) where {T} =
+    accumulate_pairwise!(ConvertOp{T}(+), out, v)
+cumsum!(out::AbstractVector{T}, v::AbstractVector, ::ArithmeticStyle) where {T} =
+    accumulate!(ConvertOp{T}(+), out, v)
 
 """
     cumsum(A, dim::Integer)
@@ -488,14 +506,14 @@ cumsum(v::AbstractVector, ::ArithmeticStyle)   = accumulate(add_sum, v)
 
 Cumulative sum of `A` along the dimension `dim`, storing the result in `B`. See also [`cumsum`](@ref).
 """
-cumsum!(dest, A, dim::Integer) = accumulate!(+, dest, A, dim)
+cumsum!(dest::AbstractArray{T}, A, dim::Integer) where {T} = accumulate!(ConvertOp{T}(+), dest, A, dim)
 
 """
     cumsum!(y::AbstractVector, x::AbstractVector)
 
 Cumulative sum of a vector `x`, storing the result in `y`. See also [`cumsum`](@ref).
 """
-cumsum!(dest, itr) = accumulate!(+, dest, src)
+cumsum!(dest::AbstractArray{T}, itr) where {T} = accumulate!(ConvertOp{T}(+), dest, src)
 
 """
     cumprod(A, dim::Integer)
@@ -555,7 +573,7 @@ cumprod(x::AbstractVector) = accumulate(mul_prod, x)
 Cumulative product of `A` along the dimension `dim`, storing the result in `B`.
 See also [`cumprod`](@ref).
 """
-cumprod!(dest, A, dim::Integer) = accumulate!(*, dest, A, dim)
+cumprod!(dest::AbstractArray{T}, A, dim::Integer) where {T} = accumulate!(ConvertOp{T}(*), dest, A, dim)
 
 """
     cumprod!(y::AbstractVector, x::AbstractVector)
@@ -563,4 +581,4 @@ cumprod!(dest, A, dim::Integer) = accumulate!(*, dest, A, dim)
 Cumulative product of a vector `x`, storing the result in `y`.
 See also [`cumprod`](@ref).
 """
-cumprod!(dest, itr) = accumulate!(*, dest, itr)
+cumprod!(dest::AbstractArray{T}, itr) where {T} = accumulate!(ConvertOp{T}(*), dest, itr)
diff --git a/base/reduce.jl b/base/reduce.jl
index cb5b8fdf870d6..77e515af5c295 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -28,6 +28,7 @@ add_sum(x::SmallSigned) = Int(x)
 add_sum(x::SmallUnsigned) = UInt(x)
 add_sum(X::AbstractArray) = broadcast(add_sum, X)
 
+
 """
     Base.mul_prod(x,y)
 

From 2ee7db99fed8595585752dd240bbbcbd55b5b26a Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Fri, 23 Feb 2018 17:17:17 -0800
Subject: [PATCH 07/15] rejig pairwise promotion

---
 base/accumulate.jl | 141 ++++++++++++++++++++-------------------------
 test/arrayops.jl   |  14 +++++
 2 files changed, 75 insertions(+), 80 deletions(-)

diff --git a/base/accumulate.jl b/base/accumulate.jl
index afe11dd97913b..4255789e5584d 100644
--- a/base/accumulate.jl
+++ b/base/accumulate.jl
@@ -320,99 +320,80 @@ accumulate_pairwise(op, itr) = accumulate_pairwise(op, uninitialized, itr)
 accumulate_pairwise(op, v0, itr) =  accumulate_pairwise(op, v0, itr, IteratorSize(itr))
 function accumulate_pairwise(op, v0, itr, ::Union{HasLength,HasShape{1}})
     i = start(itr)
-    if done(itr, i)
+    if done(itr,i)
         return collect(Accumulate(op, v0, itr))
     end
-    v1, i = next(itr, i)
-    accv = reduce_first(op, v0, v1)
-
-    dest = _similar_for(1:1, typeof(accv), itr, IteratorSize(itr))
-    ld = linearindices(dest)
-    j = first(ld)
-    dest[j] = accv
-    m = last(ld) - j
-    if done(itr, i)
-        return dest
+    v1,i = next(itr,i)
+    y = reduce_first(op,v0,v1)
+
+    Y = _similar_for(1:1, typeof(y), itr, IteratorSize(itr))
+    L = linearindices(Y)
+    n = length(L)
+    j = first(L)
+
+    while true
+        Y[j] = y
+        if done(itr,i)
+            return Y
+        end
+        y,j,i,wider = _accum!(op,Y,itr,y,j+1,i,last(L)-j,true)
+        if !wider
+            return Y
+        end
+        R = promote_typejoin(eltype(Y), typeof(y))
+        newY = similar(Y, R)
+        copyto!(newY,1,Y,1,j)
+        Y = newY
     end
-    out, _ = _accumulate_pairwise!(op, dest, itr, accv, i, j, m, true)
-    return out
 end
 
 accumulate_pairwise!(op, dest, itr) = accumulate_pairwise!(op, dest, uninitialized, itr)
-function accumulate_pairwise!(op, dest, v0, itr)
-    linearindices(dest) == linearindices(itr) || throw(DimensionMismatch("length of source and destination must match"))
+function accumulate_pairwise!(op, Y, v0, itr)
+    L = linearindices(Y)
+    L == linearindices(itr) || throw(DimensionMismatch("indices of source and destination must match"))
+
+    n = length(L)
 
     i = start(itr)
-    if done(itr, i)
-        return dest
-    end
-    v1, i = next(itr, i)
-    accv = reduce_first(op, v0, v1)
-
-    ld = linearindices(dest)
-    j = first(ld)
-    dest[j] = accv
-    m = last(ld) - j
-    if done(itr, i)
-        return dest
-    end
-    out, _ = _accumulate_pairwise!(op, dest, itr, accv, i, j, m, false)
-    return out
+    v1,i = next(itr,i)
+
+    j = first(L)
+    Y[j] = y = reduce_first(op,v0,v1)
+    _accum!(op,Y,itr,y,j+1,i,n-1,false)
+    return Y
 end
 
-# TODO figure out promotion machinery
-# collect_pairwise
-function _accumulate_pairwise!(op, dest::AbstractArray{T}, itr, accv, i, j, m, widen) where {T}
-    if m < 128
-        # m >= 1
-        w, i = next(itr, i)
-        y = op(accv, w)
-        S = typeof(y)
-        if !widen || S === T || S<:T
-            @inbounds dest[j+=1] = y
-            return _accumulate_pairwise_small!(op, dest, itr, accv, w, i, j, m-1, widen)
-        else
-            R = promote_typejoin(T, S)
-            new = similar(dest, R)
-            copyto!(new,1,dest,1,j)
-            @inbounds new[j+=1] = y
-            return _accumulate_pairwise_small!(op, new, itr, accv, w, i, j, m-1, widen)
+function _accumulate_pairwise!(op,Y,X,y0,j,i,m,widen)
+    if m < 128 # m >= 1
+        @inbounds begin
+            x,i = next(X,i)
+            y = op(y0, x)
+            if widen && !isa(y, eltype(Y))
+                return y, j, i, true
+            end
+            Y[j] = y
+
+            # to keep type stability, could also unroll loop?
+            w = reduce_first(op, x)
+            for k = 1:m-1
+                j += 1
+                x,i = next(X,i)
+                w = op(w,  x)
+                y = op(y0, w)
+                if widen && !isa(y, eltype(Y))
+                    return y, j, i, true
+                end
+                Y[j] = y
+            end
+            return w, j+1, i, false
         end
     else
         m1 = m >> 1
-        m2 = m - m1
-        dest, taccv1, i, j = _accumulate_pairwise!(op, dest, itr, accv, i, j, m1, widen)
-        dest, taccv2, i, j = _accumulate_pairwise!(op, dest, itr, op(accv, taccv1), i, j, m2, widen)
-        taccv = op(taccv1, taccv2)
-    end
-    return dest, taccv, i, j
-end
-
-function _accumulate_pairwise_small!(op, dest::AbstractArray{T}, itr, accv, w, i, j, m, widen) where T
-    if m == 0
-        return dest, reduce_first(op, w), i, j
-    end
-    v, i = next(itr, i)
-    x = op(w, v)
-    while true
-        y = op(accv, x)
-        S = typeof(y)
-        if !widen || S === T || S<:T
-            @inbounds dest[j+=1] = y
-            m -= 1
-        else
-            R = promote_typejoin(T, S)
-            new = similar(dest, R)
-            copyto!(new,1,dest,1,j)
-            @inbounds new[j+=1] = y
-            return _accumulate_pairwise_small!(op, new, itr, accv, x, i, j, m-1, widen)
-        end
-        if m == 0
-            return dest, x, i, j
-        end
-
-        v, i = next(itr, i)
-        x = op(x, v)
+        v1, j, i, wider = _accum!(op,Y,X,y0,j,i,m1,widen)
+        wider && return v1, j, i, wider
+        v2, j, i, wider = _accum!(op,Y,X,op(y0,v1),j,i,m-m1,widen)
+        wider && return v1, j, i, wider
+        return op(v1, v2), j, i, false
     end
 end
 
diff --git a/test/arrayops.jl b/test/arrayops.jl
index c10b3c49ca7ed..763b7bbfbda28 100644
--- a/test/arrayops.jl
+++ b/test/arrayops.jl
@@ -2309,3 +2309,17 @@ end
     @test accumulate(*, ['a' 'b'; 'c' 'd'], 1) == ["a" "b"; "ac" "bd"]
     @test accumulate(*, ['a' 'b'; 'c' 'd'], 2) == ["a" "ab"; "c" "cd"]
 end
+
+@testset "accumulate promotion" begin
+    U = cumsum(Any[1,1//1,1f0,1.0,big(1.0)])
+    @test U[1] == 1
+    @test U[1] isa Int
+    @test U[2] == 2
+    @test U[2] isa Rational{Int}
+    @test U[3] == 3
+    @test U[3] isa Float32
+    @test U[4] == 4
+    @test U[4] isa Float64
+    @test U[5] == 5
+    @test U[5] isa BigFloat
+end

From 3594346f1a7e4b75227e752c2cb31e35b800c549 Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Sun, 25 Feb 2018 14:48:33 -0800
Subject: [PATCH 08/15] fix up dims

---
 base/accumulate.jl | 337 ++++++++++++++++++++++-----------------------
 test/arrayops.jl   |   4 +-
 2 files changed, 164 insertions(+), 177 deletions(-)

diff --git a/base/accumulate.jl b/base/accumulate.jl
index 4255789e5584d..627c1b7df22d1 100644
--- a/base/accumulate.jl
+++ b/base/accumulate.jl
@@ -45,10 +45,10 @@ end
 
 
 """
-    accumulate(op[, v0], itr)
+    accumulate(op[, v0], itr; dims::Integer)
 
-Cumulative apply binary operation `op` on an iterable `itr`. If an initial `v0` value is
-provided, then the first value of the iterator will be
+Cumulative apply binary operation `op` on an iterable `itr`, along the dimension
+`dims`. If an initial `v0` value is provided, then the first value of the iterator will be
 
     op(v0, first(iter))
 
@@ -56,8 +56,11 @@ Otherwise it will use
 
     Base.reduce_first(op, first(iter))
 
-See also: [`accumulate!`](@ref) to use a preallocated output array, and [`cumsum`](@ref),
-[`cumprod`](@ref) for specialized versions.
+The keyword argument `dims` is optional for 1-dimensional iterators.
+
+See also:
+ - [`accumulate!`](@ref) to use a preallocated output array,
+ - [`cumsum`](@ref) and [`cumprod`](@ref) for specialized versions.
 
 # Examples
 ```jldoctest
@@ -84,19 +87,37 @@ julia> accumulate(min, 0, [1,2,-1])
   0
   0
  -1
+
+julia> accumulate(+, fill(1, 3, 3); dims=1)
+3×3 Array{Int64,2}:
+ 1  1  1
+ 2  2  2
+ 3  3  3
+
+julia> accumulate(+, fill(1, 3, 3); dims=2)
+3×3 Array{Int64,2}:
+ 1  2  3
+ 1  2  3
+ 1  2  3
 ```
 """
-accumulate(op, itr) = accumulate(op, uninitialized, itr)
-accumulate(op, v0, itr) = accumulate(op, v0, itr, IteratorSize(itr))
-accumulate(op, v0, itr, ::Union{SizeUnknown,HasLength,IsInfinite,HasShape{1}}) =
+accumulate(op, itr; dims::Union{Nothing,Integer}=nothing) = accumulate(op, uninitialized, itr; dims=dims)
+accumulate(op, v0, itr; dims::Union{Nothing,Integer}=nothing) = _accumulate(op, v0, itr, IteratorSize(itr), dims)
+
+function _accumulate(op, v0, itr, ::Union{SizeUnknown,HasLength,IsInfinite,HasShape{1}}, ::Nothing)
     collect(Accumulate(op, v0, itr))
+end
 
 
 """
-    accumulate!(op, dest,[, v0] itr)
+    accumulate!(op, dest[, v0], itr; dims::Integer)
+
+Cumulative operation `op` on an iterator `itr`, storing the result in `dest`, along the
+dimension `dims` (optional for 1-dimensional iterators).
 
-Cumulative operation `op` on an iterator `itr`, storing the result in `dest`.
-See also [`accumulate`](@ref).
+See also:
+ - [`accumulate`](@ref) for the non-inplace version
+ - [`cumsum!`](@ref) and [`cumprod!`](@ref)
 
 # Examples
 ``jldoctest
@@ -113,10 +134,30 @@ julia> y
  3
  3
  6
+
+julia> A = [1 2; 3 4];
+
+julia> B = [0 0; 0 0];
+
+julia> accumulate!(B, -, A; dims=1);
+
+julia> B
+2×2 Array{Int64,2}:
+  1   2
+ -2  -2
+
+julia> accumulate!(-, B, A; dims=2);
+
+julia> B
+2×2 Array{Int64,2}:
+ 1  -1
+ 3  -1
 ```
 """
-accumulate!(op, dest, x) = accumulate!(op, dest, uninitialized, x)
-function accumulate!(op, dest, v0, x)
+accumulate!(op, dest, itr; dims::Union{Nothing,Integer}=nothing) = accumulate!(op, dest, uninitialized, itr; dims=dims)
+accumulate!(op, dest, v0, itr; dims::Union{Nothing,Integer}=nothing) = _accumulate!(op, dest, uninitialized, itr, IteratorSize(itr), dims)
+
+function _accumulate!(op, dest, v0, x, ::Union{SizeUnknown,HasLength,IsInfinite,HasShape{1}}, ::Nothing)
     src = Accumulate(op, v0, x)
 
     # this is essentially `copyto!`, but unrolled to deal with potential type instability in first state
@@ -148,31 +189,7 @@ function accumulate!(op, dest, v0, x)
 end
 
 
-"""
-    accumulate(op[, v0], A, dim::Integer)
-
-Cumulative operation `op` along the dimension `dim`. See also
-[`accumulate!`](@ref) to use a preallocated output array. For common operations
-there are specialized variants of `accumulate`, see:
-[`cumsum`](@ref), [`cumprod`](@ref)
-
-# Examples
-```jldoctest
-julia> accumulate(+, fill(1, 3, 3), 1)
-3×3 Array{Int64,2}:
- 1  1  1
- 2  2  2
- 3  3  3
-
-julia> accumulate(+, fill(1, 3, 3), 2)
-3×3 Array{Int64,2}:
- 1  2  3
- 1  2  3
- 1  2  3
-```
-"""
-accumulate(op, X, dim::Int) = accumulate(op, uninitialized, X, dim)
-
+# by dimension accumulate/accumulate!
 # split into (head, slice, tail) dimension iterators
 @inline function split_dimensions(X,dim::Integer)
     inds = axes(X)
@@ -183,7 +200,7 @@ accumulate(op, X, dim::Int) = accumulate(op, uninitialized, X, dim)
     end
 end
 
-function accumulate(op, v0, X, dim::Integer)
+function _accumulate(op, v0, X, ::IteratorSize, dim::Integer)
     dim > 0 || throw(ArgumentError("dim must be a positive integer"))
     if length(X) == 0
         # fallback on collect machinery
@@ -215,37 +232,7 @@ function accumulate(op, v0, X, dim::Integer)
     return _accumulate!(op, dest, i, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD)
 end
 
-
-"""
-    accumulate!(op, dest, A, dim::Integer)
-
-Cumulative operation `op` on `A` along the dimension `dim`, storing the result in `B`.
-See also [`accumulate`](@ref).
-
-# Examples
-```jldoctest
-julia> A = [1 2; 3 4];
-
-julia> B = [0 0; 0 0];
-
-julia> accumulate!(B, -, A, 1);
-
-julia> B
-2×2 Array{Int64,2}:
-  1   2
- -2  -2
-
-julia> accumulate!(-, B, A, 2);
-
-julia> B
-2×2 Array{Int64,2}:
- 1  -1
- 3  -1
-```
-"""
-accumulate!(op, dest, X, dim::Integer) = accumulate!(op, dest, uninitialized, X, dim::Integer)
-
-function accumulate!(op, dest, v0, X, dim::Integer)
+function _accumulate!(op, dest, v0, X, ::IteratorSize, dim::Integer)
     dim > 0 || throw(ArgumentError("dim must be a positive integer"))
     axes(dest) == axes(X) || throw(DimensionMismatch("shape of source and destination must match"))
     indH, indD, indT = split_dimensions(X,dim)
@@ -306,6 +293,8 @@ function _accumulate!(op, dest::AbstractArray{T}, i, v0, X, indH, indD, indT, sH
     end
 end
 
+
+
 """
     accumulate_pairwise(op[, v0], itr)
 
@@ -336,7 +325,7 @@ function accumulate_pairwise(op, v0, itr, ::Union{HasLength,HasShape{1}})
         if done(itr,i)
             return Y
         end
-        y,j,i,wider = _accum!(op,Y,itr,y,j+1,i,last(L)-j,true)
+        y,j,i,wider = _accumulate_pairwise!(op,Y,itr,y,j+1,i,last(L)-j,true)
         if !wider
             return Y
         end
@@ -359,7 +348,7 @@ function accumulate_pairwise!(op, Y, v0, itr)
 
     j = first(L)
     Y[j] = y = reduce_first(op,v0,v1)
-    _accum!(op,Y,itr,y,j+1,i,n-1,false)
+    _accumulate_pairwise!(op,Y,itr,y,j+1,i,n-1,false)
     return Y
 end
 
@@ -389,145 +378,125 @@ function _accumulate_pairwise!(op,Y,X,y0,j,i,m,widen)
         end
     else
         m1 = m >> 1
-        v1, j, i, wider = _accum!(op,Y,X,y0,j,i,m1,widen)
+        v1, j, i, wider = _accumulate_pairwise!(op,Y,X,y0,j,i,m1,widen)
         wider && return v1, j, i, wider
-        v2, j, i, wider = _accum!(op,Y,X,op(y0,v1),j,i,m-m1,widen)
+        v2, j, i, wider = _accumulate_pairwise!(op,Y,X,op(y0,v1),j,i,m-m1,widen)
         wider && return v1, j, i, wider
         return op(v1, v2), j, i, false
     end
 end
 
-"""
-    Base.ConvertOp{T}(op)(x,y)
 
-An operator which converts `x` and `y` to type `T` before performing the `op`.
 
-The main purpose is for use in [`cumsum!`](@ref) and [`cumprod!`](@ref), where `T` is determined by the output array.
 """
-struct ConvertOp{T,O} <: Function
-    op::O
-end
-ConvertOp{T}(op::O) where {T,O} = ConvertOp{T,O}(op)
-(c::ConvertOp{T})(x,y) where {T} = c.op(convert(T,x),convert(T,y))
-reduce_first(c::ConvertOp{T},x) where {T} = reduce_first(c.op, convert(T,x))
-
-
-
-
-function cumsum!(out::AbstractVector, v::AbstractVector{T}) where T
-    # we dispatch on the possibility of numerical accuracy issues
-    cumsum!(out, v, ArithmeticStyle(T))
-end
-cumsum!(out::AbstractVector{T}, v::AbstractVector, ::ArithmeticRounds) where {T} =
-    accumulate_pairwise!(ConvertOp{T}(+), out, v)
-cumsum!(out::AbstractVector{T}, v::AbstractVector, ::ArithmeticUnknown) where {T} =
-    accumulate_pairwise!(ConvertOp{T}(+), out, v)
-cumsum!(out::AbstractVector{T}, v::AbstractVector, ::ArithmeticStyle) where {T} =
-    accumulate!(ConvertOp{T}(+), out, v)
+    cumsum(A; dims::Integer)
 
-"""
-    cumsum(A, dim::Integer)
+Cumulative sum `A` along the dimension `dims` (`dims` is option for 1-dimensional
+objects). This has the same promotion behaviour as [`sum`](@ref), namely that lower-width
+integer types are promoted to `Int`/`UInt`.
 
-Cumulative sum along the dimension `dim`. See also [`cumsum!`](@ref)
-to use a preallocated output array, both for performance and to control the precision of the
-output (e.g. to avoid overflow).
+See also:
+ - [`cumsum!`](@ref) to use a preallocated output array, both for performance and
+to control the precision of the output (e.g. to avoid overflow).
+ - [`accumulate`](@ref) for accumulation using other operators.
 
 # Examples
 ```jldoctest
+julia> cumsum([1, 1, 1])
+3-element Array{Int64,1}:
+ 1
+ 2
+ 3
+
+julia> cumsum([fill(1, 2) for i in 1:3])
+3-element Array{Array{Int64,1},1}:
+ [1, 1]
+ [2, 2]
+ [3, 3]
+
 julia> a = [1 2 3; 4 5 6]
 2×3 Array{Int64,2}:
  1  2  3
  4  5  6
 
-julia> cumsum(a,1)
+julia> cumsum(a; dims=1)
 2×3 Array{Int64,2}:
  1  2  3
  5  7  9
 
-julia> cumsum(a,2)
+julia> cumsum(a; dims=2)
 2×3 Array{Int64,2}:
  1  3   6
  4  9  15
 ```
 """
-cumsum(A, dim::Integer) = accumulate(add_sum, A, dim)
-
-"""
-    cumsum(x::AbstractVector)
-
-Cumulative sum a vector. See also [`cumsum!`](@ref)
-to use a preallocated output array, both for performance and to control the precision of the
-output (e.g. to avoid overflow).
-
-# Examples
-```jldoctest
-julia> cumsum([1, 1, 1])
-3-element Array{Int64,1}:
- 1
- 2
- 3
+function cumsum(A; dims::Union{Nothing,Integer}=nothing)
+    if A isa AbstractArray && ndims(A) > 1 && dims === nothing
+        depwarn("`cumsum(A::AbstractArray)` is deprecated, use `cumsum(A; dims=1)` instead.", :cumsum)
+        dims = 1
+    end
+    accumulate(add_sum, A; dims=dims)
+end
 
-julia> cumsum([fill(1, 2) for i in 1:3])
-3-element Array{Array{Int64,1},1}:
- [1, 1]
- [2, 2]
- [3, 3]
-```
-"""
 function cumsum(v::AbstractVector{T}) where T
     # we dispatch on the possibility of numerical accuracy issues
-    cumsum(v, ArithmeticStyle(T))
+    _cumsum(v, ArithmeticStyle(T))
 end
-cumsum(v::AbstractVector, ::ArithmeticRounds)  = accumulate_pairwise(add_sum, v)
-cumsum(v::AbstractVector, ::ArithmeticUnknown) = accumulate_pairwise(add_sum, v)
-cumsum(v::AbstractVector, ::ArithmeticStyle)   = accumulate(add_sum, v)
+_cumsum(v::AbstractVector, ::ArithmeticRounds)  = accumulate_pairwise(add_sum, v)
+_cumsum(v::AbstractVector, ::ArithmeticUnknown) = accumulate_pairwise(add_sum, v)
+_cumsum(v::AbstractVector, ::ArithmeticStyle)   = accumulate(add_sum, v)
 
-"""
-    cumsum!(B, A, dim::Integer)
 
-Cumulative sum of `A` along the dimension `dim`, storing the result in `B`. See also [`cumsum`](@ref).
 """
-cumsum!(dest::AbstractArray{T}, A, dim::Integer) where {T} = accumulate!(ConvertOp{T}(+), dest, A, dim)
+    Base.ConvertOp{T}(op)(x,y)
 
-"""
-    cumsum!(y::AbstractVector, x::AbstractVector)
+An operator which converts `x` and `y` to type `T` before performing the `op`.
 
-Cumulative sum of a vector `x`, storing the result in `y`. See also [`cumsum`](@ref).
+The main purpose is for use in [`cumsum!`](@ref) and [`cumprod!`](@ref), where `T` is determined by the output array.
 """
-cumsum!(dest::AbstractArray{T}, itr) where {T} = accumulate!(ConvertOp{T}(+), dest, src)
+struct ConvertOp{T,O} <: Function
+    op::O
+end
+ConvertOp{T}(op::O) where {T,O} = ConvertOp{T,O}(op)
+(c::ConvertOp{T})(x,y) where {T} = c.op(convert(T,x),convert(T,y))
+reduce_first(c::ConvertOp{T},x) where {T} = reduce_first(c.op, convert(T,x))
 
-"""
-    cumprod(A, dim::Integer)
 
-Cumulative product along the dimension `dim`. See also
-[`cumprod!`](@ref) to use a preallocated output array, both for performance and
-to control the precision of the output (e.g. to avoid overflow).
 
-# Examples
-```jldoctest
-julia> a = [1 2 3; 4 5 6]
-2×3 Array{Int64,2}:
- 1  2  3
- 4  5  6
 
-julia> cumprod(a,1)
-2×3 Array{Int64,2}:
- 1   2   3
- 4  10  18
+"""
+    cumsum!(B, A; dims::Integer)
 
-julia> cumprod(a,2)
-2×3 Array{Int64,2}:
- 1   2    6
- 4  20  120
-```
+Cumulative sum of `A` along the dimension `dims`, storing the result in `B`. `dims` is
+optional for 1-dimensional objects.
+
+Elements of `A` are first converted to the element type of `B` before addition, so as to
+improve accuracy and reduce occurences of arithemtic overflow.
+
+See also [`cumsum`](@ref).
 """
-cumprod(A, dim::Integer) = accumulate(mul_prod, A, dim)
+cumsum!(dest::AbstractArray{T}, A; dims::Union{Nothing,Integer}=nothing) where {T} =
+    accumulate!(ConvertOp{T}(+), dest, A; dims=dims)
+
+function cumsum!(out::AbstractVector, v::AbstractVector{T}) where T
+    # we dispatch on the possibility of numerical accuracy issues
+    cumsum!(out, v, ArithmeticStyle(T))
+end
+cumsum!(out::AbstractVector{T}, v::AbstractVector, ::ArithmeticRounds) where {T} =
+    accumulate_pairwise!(ConvertOp{T}(+), out, v)
+cumsum!(out::AbstractVector{T}, v::AbstractVector, ::ArithmeticUnknown) where {T} =
+    accumulate_pairwise!(ConvertOp{T}(+), out, v)
+cumsum!(out::AbstractVector{T}, v::AbstractVector, ::ArithmeticStyle) where {T} =
+    accumulate!(ConvertOp{T}(+), out, v)
+
 
 """
-    cumprod(x::AbstractVector)
+    cumprod(A; dims::Integer)
+
+Cumulative product along the dimension `dims` (`dims` is optional for 1-dimensional objects).
 
-Cumulative product of a vector. See also
-[`cumprod!`](@ref) to use a preallocated output array, both for performance and
+See also:
+ - [`cumprod!`](@ref) to use a preallocated output array, both for performance and
 to control the precision of the output (e.g. to avoid overflow).
 
 # Examples
@@ -543,23 +512,41 @@ julia> cumprod([fill(1//3, 2, 2) for i in 1:3])
  Rational{Int64}[1//3 1//3; 1//3 1//3]
  Rational{Int64}[2//9 2//9; 2//9 2//9]
  Rational{Int64}[4//27 4//27; 4//27 4//27]
+
+julia> a = [1 2 3; 4 5 6]
+2×3 Array{Int64,2}:
+ 1  2  3
+ 4  5  6
+
+julia> cumprod(a; dims=1)
+2×3 Array{Int64,2}:
+ 1   2   3
+ 4  10  18
+
+julia> cumprod(a; dims=2)
+2×3 Array{Int64,2}:
+ 1   2    6
+ 4  20  120
 ```
 """
-cumprod(x) = accumulate(mul_prod, x)
-cumprod(x::AbstractVector) = accumulate(mul_prod, x)
+function cumprod(A; dims::Union{Nothing,Integer}=nothing)
+    if A isa AbstractArray && ndims(A) > 1 && dims === nothing
+        depwarn("`cumprod(A::AbstractArray)` is deprecated, use `cumprod(A; dims=1)` instead.", :cumprod)
+        dims = 1
+    end
+    accumulate(mul_prod, A; dims=dims)
+end
 
 """
-    cumprod!(B, A, dim::Integer)
+    cumprod!(B, A; dims::Integer)
 
-Cumulative product of `A` along the dimension `dim`, storing the result in `B`.
-See also [`cumprod`](@ref).
-"""
-cumprod!(dest::AbstractArray{T}, A, dim::Integer) where {T} = accumulate!(ConvertOp{T}(*), dest, A, dim)
+Cumulative product of `A` along the dimension `dims`, storing the result in `B`. `dims` is
+optional for 1-dimensional objects.
 
-"""
-    cumprod!(y::AbstractVector, x::AbstractVector)
+Elements of `A` are first converted to the element type of `B` before multiplication, so
+as to improve accuracy and reduce occurences of arithemtic overflow.
 
-Cumulative product of a vector `x`, storing the result in `y`.
 See also [`cumprod`](@ref).
 """
-cumprod!(dest::AbstractArray{T}, itr) where {T} = accumulate!(ConvertOp{T}(*), dest, itr)
+cumprod!(dest::AbstractArray{T}, A; dims::Union{Nothing,Integer}=nothing) where {T} =
+    accumulate!(ConvertOp{T}(*), dest, A; dims=dims)
diff --git a/test/arrayops.jl b/test/arrayops.jl
index 4b0653fc791bd..fc8a6805751d1 100644
--- a/test/arrayops.jl
+++ b/test/arrayops.jl
@@ -2306,8 +2306,8 @@ end
     @test accumulate((acc, x) -> acc+x[1], 0, [(1,2), (3,4), (5,6)]) == [1, 4, 9]
     @test accumulate(*, ['a', 'b']) == ["a", "ab"]
     @inferred accumulate(*, String[])
-    @test accumulate(*, ['a' 'b'; 'c' 'd'], 1) == ["a" "b"; "ac" "bd"]
-    @test accumulate(*, ['a' 'b'; 'c' 'd'], 2) == ["a" "ab"; "c" "cd"]
+    @test accumulate(*, ['a' 'b'; 'c' 'd'], dims=1) == ["a" "b"; "ac" "bd"]
+    @test accumulate(*, ['a' 'b'; 'c' 'd'], dims=2) == ["a" "ab"; "c" "cd"]
 end
 
 @testset "accumulate promotion" begin

From cd5d3ae31fbbcb0ae9eeb23b809897c274c8abe5 Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Tue, 27 Feb 2018 11:59:12 -0800
Subject: [PATCH 09/15] fix inplace accumulate

---
 base/accumulate.jl  | 8 ++++++--
 test/offsetarray.jl | 2 ++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/base/accumulate.jl b/base/accumulate.jl
index 627c1b7df22d1..5a1337d6dd84a 100644
--- a/base/accumulate.jl
+++ b/base/accumulate.jl
@@ -202,7 +202,7 @@ end
 
 function _accumulate(op, v0, X, ::IteratorSize, dim::Integer)
     dim > 0 || throw(ArgumentError("dim must be a positive integer"))
-    if length(X) == 0
+    if isempty(X)
         # fallback on collect machinery
         return collect(Accumulate(op, v0, X))
     end
@@ -250,7 +250,11 @@ function _accumulate!(op, dest, v0, X, ::IteratorSize, dim::Integer)
     @assert !done(indT, sT)
     iT,sT = next(indT, sT)
 
-    return _accumulate!(op, dest, first(linearindices(dest))-1, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD, false)
+    accv = reduce_first(op, v0, X[iH, iD, iT])
+    i = first(linearindices(dest))
+    dest[i] = accv
+
+    return _accumulate!(op, dest, i, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD, false)
 end
 
 function _accumulate!(op, dest::AbstractArray{T}, i, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD, widen=true) where {T}
diff --git a/test/offsetarray.jl b/test/offsetarray.jl
index 42e05f1d4c252..ac5f9f90e7a67 100644
--- a/test/offsetarray.jl
+++ b/test/offsetarray.jl
@@ -345,8 +345,10 @@ C = similar(A)
 cumsum!(C, A, dims=1)
 @test parent(C) == cumsum(parent(A), dims=1)
 @test parent(cumsum(A, dims=1)) == cumsum(parent(A), dims=1)
+@test C == cumsum(A, dims=1)
 cumsum!(C, A, dims=2)
 @test parent(C) == cumsum(parent(A), dims=2)
+@test C == cumsum(A, dims=2)
 R = similar(A, (1:1, 6:9))
 maximum!(R, A)
 @test parent(R) == maximum(parent(A), dims=1)

From 5a84b98104356ea499ad755796e40cee16ed5d85 Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Tue, 27 Feb 2018 22:53:12 -0800
Subject: [PATCH 10/15] fix SparseArrays promotion bug

---
 stdlib/SparseArrays/src/sparsematrix.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stdlib/SparseArrays/src/sparsematrix.jl b/stdlib/SparseArrays/src/sparsematrix.jl
index f4b8a8a549e34..4f47d0e2a82b0 100644
--- a/stdlib/SparseArrays/src/sparsematrix.jl
+++ b/stdlib/SparseArrays/src/sparsematrix.jl
@@ -2285,7 +2285,7 @@ function getindex(A::SparseMatrixCSC{Tv,Ti}, I::AbstractArray) where {Tv,Ti}
             end
         end
     end
-    colptrB = cumsum(colptrB)
+    colptrB = accumulate(+,colptrB) # cumsum would promote incorrectly
     if n > (idxB-1)
         deleteat!(nzvalB, idxB:n)
         deleteat!(rowvalB, idxB:n)

From 50ace75b2a18e71a91a7e6c77ac31b0a9e32ac81 Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Fri, 16 Mar 2018 10:23:17 -0700
Subject: [PATCH 11/15] fix deprecation warning

---
 base/accumulate.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/base/accumulate.jl b/base/accumulate.jl
index dc7a2c350a6ae..c467cccdbc431 100644
--- a/base/accumulate.jl
+++ b/base/accumulate.jl
@@ -194,9 +194,9 @@ end
 @inline function split_dimensions(X,dim::Integer)
     inds = axes(X)
     if dim > length(inds)
-        (CartesianIndices(inds), CartesianIndices(), CartesianIndices())
+        (CartesianIndices(inds), CartesianIndices(()), CartesianIndices(()))
     else
-        (CartesianIndices(inds[1:dim-1]), inds[dim], CartesianIndices(inds[dim+1:end]))
+        (CartesianIndices(inds[1:dim-1]), CartesianIndices((inds[dim],)), CartesianIndices(inds[dim+1:end]))
     end
 end
 

From e47a713f83b280c6b4fbb220d79abd5dea44b62c Mon Sep 17 00:00:00 2001
From: Martin Holters <martin.holters@hsu-hh.de>
Date: Wed, 21 Mar 2018 09:47:13 +0100
Subject: [PATCH 12/15] Force specialization on `op` (or inlining) in
 `accumulate` machinery

---
 base/accumulate.jl | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/base/accumulate.jl b/base/accumulate.jl
index c467cccdbc431..4aa4b8122ea21 100644
--- a/base/accumulate.jl
+++ b/base/accumulate.jl
@@ -101,10 +101,10 @@ julia> accumulate(+, fill(1, 3, 3); dims=2)
  1  2  3
 ```
 """
-accumulate(op, itr; dims::Union{Nothing,Integer}=nothing) = accumulate(op, undef, itr; dims=dims)
-accumulate(op, v0, itr; dims::Union{Nothing,Integer}=nothing) = _accumulate(op, v0, itr, IteratorSize(itr), dims)
+@inline accumulate(op, itr; dims::Union{Nothing,Integer}=nothing) = accumulate(op, undef, itr; dims=dims)
+@inline accumulate(op, v0, itr; dims::Union{Nothing,Integer}=nothing) = _accumulate(op, v0, itr, IteratorSize(itr), dims)
 
-function _accumulate(op, v0, itr, ::Union{SizeUnknown,HasLength,IsInfinite,HasShape{1}}, ::Nothing)
+@inline function _accumulate(op, v0, itr, ::Union{SizeUnknown,HasLength,IsInfinite,HasShape{1}}, ::Nothing)
     collect(Accumulate(op, v0, itr))
 end
 
@@ -154,10 +154,10 @@ julia> B
  3  -1
 ```
 """
-accumulate!(op, dest, itr; dims::Union{Nothing,Integer}=nothing) = accumulate!(op, dest, undef, itr; dims=dims)
-accumulate!(op, dest, v0, itr; dims::Union{Nothing,Integer}=nothing) = _accumulate!(op, dest, undef, itr, IteratorSize(itr), dims)
+@inline accumulate!(op, dest, itr; dims::Union{Nothing,Integer}=nothing) = accumulate!(op, dest, undef, itr; dims=dims)
+@inline accumulate!(op, dest, v0, itr; dims::Union{Nothing,Integer}=nothing) = _accumulate!(op, dest, undef, itr, IteratorSize(itr), dims)
 
-function _accumulate!(op, dest, v0, x, ::Union{SizeUnknown,HasLength,IsInfinite,HasShape{1}}, ::Nothing)
+function _accumulate!(op::F, dest, v0, x, ::Union{SizeUnknown,HasLength,IsInfinite,HasShape{1}}, ::Nothing) where F
     src = Accumulate(op, v0, x)
 
     # this is essentially `copyto!`, but unrolled to deal with potential type instability in first state
@@ -200,7 +200,7 @@ end
     end
 end
 
-function _accumulate(op, v0, X, ::IteratorSize, dim::Integer)
+function _accumulate(op::F, v0, X, ::IteratorSize, dim::Integer) where F
     dim > 0 || throw(ArgumentError("dim must be a positive integer"))
     if isempty(X)
         # fallback on collect machinery
@@ -232,7 +232,7 @@ function _accumulate(op, v0, X, ::IteratorSize, dim::Integer)
     return _accumulate!(op, dest, i, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD)
 end
 
-function _accumulate!(op, dest, v0, X, ::IteratorSize, dim::Integer)
+function _accumulate!(op::F, dest, v0, X, ::IteratorSize, dim::Integer) where F
     dim > 0 || throw(ArgumentError("dim must be a positive integer"))
     axes(dest) == axes(X) || throw(DimensionMismatch("shape of source and destination must match"))
     indH, indD, indT = split_dimensions(X,dim)
@@ -257,7 +257,7 @@ function _accumulate!(op, dest, v0, X, ::IteratorSize, dim::Integer)
     return _accumulate!(op, dest, i, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD, false)
 end
 
-function _accumulate!(op, dest::AbstractArray{T}, i, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD, widen=true) where {T}
+function _accumulate!(op::F, dest::AbstractArray{T}, i, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD, widen=true) where {F,T}
     while true
         if done(indH,sH)
             if done(indD, sD)
@@ -309,9 +309,9 @@ approach, which can be more numerically accurate for certain operations, such as
 It involves roughly double the number of `op` calls, but for cheap operations like `+`
 this does not have much impact (approximately 20%).
 """
-accumulate_pairwise(op, itr) = accumulate_pairwise(op, undef, itr)
-accumulate_pairwise(op, v0, itr) =  accumulate_pairwise(op, v0, itr, IteratorSize(itr))
-function accumulate_pairwise(op, v0, itr, ::Union{HasLength,HasShape{1}})
+@inline accumulate_pairwise(op, itr) = accumulate_pairwise(op, undef, itr)
+@inline accumulate_pairwise(op, v0, itr) =  accumulate_pairwise(op, v0, itr, IteratorSize(itr))
+function accumulate_pairwise(op::F, v0, itr, ::Union{HasLength,HasShape{1}}) where F
     i = start(itr)
     if done(itr,i)
         return collect(Accumulate(op, v0, itr))
@@ -340,8 +340,8 @@ function accumulate_pairwise(op, v0, itr, ::Union{HasLength,HasShape{1}})
     end
 end
 
-accumulate_pairwise!(op, dest, itr) = accumulate_pairwise!(op, dest, undef, itr)
-function accumulate_pairwise!(op, Y, v0, itr)
+@inline accumulate_pairwise!(op, dest, itr) = accumulate_pairwise!(op, dest, undef, itr)
+function accumulate_pairwise!(op::F, Y, v0, itr) where F
     L = linearindices(Y)
     L == linearindices(itr) || throw(DimensionMismatch("indices of source and destination must match"))
 
@@ -356,7 +356,7 @@ function accumulate_pairwise!(op, Y, v0, itr)
     return Y
 end
 
-function _accumulate_pairwise!(op,Y,X,y0,j,i,m,widen)
+function _accumulate_pairwise!(op::F,Y,X,y0,j,i,m,widen) where F
     if m < 128 # m >= 1
         @inbounds begin
             x,i = next(X,i)

From 9b7b8fbcb781a1b3a2a351fd259cc2b85461fac6 Mon Sep 17 00:00:00 2001
From: Martin Holters <martin.holters@hsu-hh.de>
Date: Thu, 22 Mar 2018 11:35:12 +0100
Subject: [PATCH 13/15] Make `accumulate(..., dims=dim)` type-stable (if
 possible)

---
 base/accumulate.jl | 121 +++++++++++----------------------------------
 1 file changed, 30 insertions(+), 91 deletions(-)

diff --git a/base/accumulate.jl b/base/accumulate.jl
index 4aa4b8122ea21..5867cda14e23f 100644
--- a/base/accumulate.jl
+++ b/base/accumulate.jl
@@ -188,113 +188,52 @@ function _accumulate!(op::F, dest, v0, x, ::Union{SizeUnknown,HasLength,IsInfini
     end
 end
 
-
-# by dimension accumulate/accumulate!
-# split into (head, slice, tail) dimension iterators
-@inline function split_dimensions(X,dim::Integer)
-    inds = axes(X)
-    if dim > length(inds)
-        (CartesianIndices(inds), CartesianIndices(()), CartesianIndices(()))
-    else
-        (CartesianIndices(inds[1:dim-1]), CartesianIndices((inds[dim],)), CartesianIndices(inds[dim+1:end]))
-    end
-end
-
-function _accumulate(op::F, v0, X, ::IteratorSize, dim::Integer) where F
+function _accumulate(op::F, v0, X, ::HasShape{N}, dim::Integer) where {F,N}
     dim > 0 || throw(ArgumentError("dim must be a positive integer"))
     if isempty(X)
         # fallback on collect machinery
         return collect(Accumulate(op, v0, X))
     end
-    indH, indD, indT = split_dimensions(X,dim)
-
-    # TODO: this could be much nicer with the new iteration protocol
-    # unroll loops to get first element
-    sH = start(indH)
-    @assert !done(indH, sH)
-    iH,sH = next(indH, sH)
-
-    sD = start(indD)
-    @assert !done(indD, sD)
-    iD,sD = next(indD, sD)
-    pD = iD
-
-    sT = start(indT)
-    @assert !done(indT, sT)
-    iT,sT = next(indT, sT)
-
-    # first element
-    accv = reduce_first(op, v0, X[iH, iD, iT])
-    dest = similar(X, typeof(accv))
-    i = first(linearindices(dest))
-    dest[i] = accv
-
-    return _accumulate!(op, dest, i, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD)
+    inds = CartesianIndices(axes(X))
+    first_in_dim = first(axes(X, dim))
+    dim_delta = CartesianIndex(ntuple(d -> d==dim ? 1 : 0, Val{N}()))
+    st = start(inds)
+    i, st = next(inds, st)
+    v = reduce_first(op, v0, X[i])
+    dest = similar(X, typeof(v))
+    dest[i] = v
+    return _accumulate!(op, dest, v0, X, dim, inds, st, first_in_dim, dim_delta, true)
 end
 
-function _accumulate!(op::F, dest, v0, X, ::IteratorSize, dim::Integer) where F
+function _accumulate!(op::F, dest, v0, X, ::HasShape{N}, dim::Integer) where {F,N}
     dim > 0 || throw(ArgumentError("dim must be a positive integer"))
     axes(dest) == axes(X) || throw(DimensionMismatch("shape of source and destination must match"))
-    indH, indD, indT = split_dimensions(X,dim)
-
-    sH = start(indH)
-    @assert !done(indH, sH)
-    iH,sH = next(indH, sH)
-
-    sD = start(indD)
-    @assert !done(indD, sD)
-    iD,sD = next(indD, sD)
-    pD = iD
-
-    sT = start(indT)
-    @assert !done(indT, sT)
-    iT,sT = next(indT, sT)
-
-    accv = reduce_first(op, v0, X[iH, iD, iT])
-    i = first(linearindices(dest))
-    dest[i] = accv
-
-    return _accumulate!(op, dest, i, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD, false)
+    inds = CartesianIndices(axes(X))
+    first_in_dim = first(axes(X, dim))
+    dim_delta = CartesianIndex(ntuple(d -> d==dim ? 1 : 0, Val{N}()))
+    st = start(inds)
+    return _accumulate!(op, dest, v0, X, dim, inds, st, first_in_dim, dim_delta, false)
 end
 
-function _accumulate!(op::F, dest::AbstractArray{T}, i, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD, widen=true) where {F,T}
-    while true
-        if done(indH,sH)
-            if done(indD, sD)
-                if done(indT,sT)
-                    return dest
-                else
-                    iT, sT = next(indT, sT)
-                end
-                iD, sD = next(indD, start(indD))
-                pD = iD
-            else
-                pD = iD
-                iD, sD = next(indD, sD)
-            end
-            iH, sH = next(indH, start(indH))
-        else
-            iH, sH = next(indH, sH)
-        end
-
-        if iD == pD
-            accv = reduce_first(op, v0, X[iH, iD, iT])
+function _accumulate!(op::F, dest::AbstractArray{T}, v0, X, dim, inds, st, first_in_dim, dim_delta, widen) where {F,T}
+    while !done(inds, st)
+        i, st = next(inds, st)
+        if dim > length(i) || i[dim] == first_in_dim
+            v = reduce_first(op, v0, @inbounds X[i])
         else
-            paccv = isempty(indH) ? accv : @inbounds dest[iH,pD,iT]
-            accv = op(paccv, X[iH,iD,iT])
+            v = op(@inbounds(dest[i - dim_delta]), @inbounds(X[i]))
         end
-
-        S = typeof(accv)
-        if !widen || S === T || S <: T
-            @inbounds dest[i+=1] = accv
+        if !widen || v isa T
+            @inbounds dest[i] = v
         else
-            R = promote_typejoin(T, S)
-            new = similar(dest, R)
-            copyto!(new,1,dest,1,i)
-            @inbounds new[i+=1] = accv
-            return _accumulate!(op, dest, i, v0, X, indH, indD, indT, sH, sD, sT, iH, iD, iT, pD)
+            R = promote_typejoin(T, typeof(v))
+            dest´ = similar(dest, R)
+            copyto!(dest´, 1, dest, 1, linearindices(dest)[i]-1)
+            @inbounds dest´[i] = v
+            return _accumulate!(op, dest´, v0, X, dim, inds, st, first_in_dim, dim_delta, widen)
         end
     end
+    return dest
 end
 
 

From 57ed1b66a2bbb6c376c0bb124a5b63b6ec8a24e3 Mon Sep 17 00:00:00 2001
From: Martin Holters <martin.holters@hsu-hh.de>
Date: Thu, 22 Mar 2018 12:07:32 +0100
Subject: [PATCH 14/15] Handle `dim > ndims(A)` seperately in `accumulate(op,
 A, dims=dim)`

---
 base/accumulate.jl | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/base/accumulate.jl b/base/accumulate.jl
index 5867cda14e23f..e640804d2f876 100644
--- a/base/accumulate.jl
+++ b/base/accumulate.jl
@@ -202,6 +202,9 @@ function _accumulate(op::F, v0, X, ::HasShape{N}, dim::Integer) where {F,N}
     v = reduce_first(op, v0, X[i])
     dest = similar(X, typeof(v))
     dest[i] = v
+    if dim > N
+        return _accum_only_v0!(op, dest, v0, X, inds, st, true)
+    end
     return _accumulate!(op, dest, v0, X, dim, inds, st, first_in_dim, dim_delta, true)
 end
 
@@ -212,13 +215,16 @@ function _accumulate!(op::F, dest, v0, X, ::HasShape{N}, dim::Integer) where {F,
     first_in_dim = first(axes(X, dim))
     dim_delta = CartesianIndex(ntuple(d -> d==dim ? 1 : 0, Val{N}()))
     st = start(inds)
+    if dim > N
+        return _accum_only_v0!(op, dest, v0, X, inds, st, false)
+    end
     return _accumulate!(op, dest, v0, X, dim, inds, st, first_in_dim, dim_delta, false)
 end
 
 function _accumulate!(op::F, dest::AbstractArray{T}, v0, X, dim, inds, st, first_in_dim, dim_delta, widen) where {F,T}
     while !done(inds, st)
         i, st = next(inds, st)
-        if dim > length(i) || i[dim] == first_in_dim
+        if @inbounds i[dim] == first_in_dim
             v = reduce_first(op, v0, @inbounds X[i])
         else
             v = op(@inbounds(dest[i - dim_delta]), @inbounds(X[i]))
@@ -236,6 +242,22 @@ function _accumulate!(op::F, dest::AbstractArray{T}, v0, X, dim, inds, st, first
     return dest
 end
 
+function _accum_only_v0!(op::F, dest::AbstractArray{T}, v0, X, inds, st, widen) where {F,T}
+    while !done(inds, st)
+        i, st = next(inds, st)
+        v = reduce_first(op, v0, @inbounds X[i])
+        if !widen || v isa T
+            @inbounds dest[i] = v
+        else
+            R = promote_typejoin(T, typeof(v))
+            dest´ = similar(dest, R)
+            copyto!(dest´, 1, dest, 1, linearindices(dest)[i]-1)
+            @inbounds dest´[i] = v
+            return _accum_only_v0!(op, dest´, v0, X, inds, st, widen)
+        end
+    end
+    return dest
+end
 
 
 """

From f0bb59dea5e81a40ade507511856bfe122c3c37c Mon Sep 17 00:00:00 2001
From: Simon Byrne <simonbyrne@gmail.com>
Date: Mon, 26 Mar 2018 19:17:45 -0400
Subject: [PATCH 15/15] change name of state variable in collect

---
 base/array.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/base/array.jl b/base/array.jl
index 0f3297f0143e6..8dfb5d47b8ec3 100644
--- a/base/array.jl
+++ b/base/array.jl
@@ -562,8 +562,9 @@ function _collect(c, itr, ::EltypeUnknown, isz::Union{HasLength,HasShape})
     if done(itr,st)
         return _similar_for(c, @default_eltype(itr), itr, isz)
     end
-    v1, st = next(itr, st)
-    collect_to_with_first!(_similar_for(c, typeof(v1), itr, isz), v1, itr, st)
+    v1, st2 = next(itr, st)
+    # change name of state variable so that Accumulate is type stable
+    collect_to_with_first!(_similar_for(c, typeof(v1), itr, isz), v1, itr, st2)
 end
 
 function collect_to_with_first!(dest::AbstractArray, v1, itr, st)