From 5ba1c33b49175f9963305f088cd44645f829fe28 Mon Sep 17 00:00:00 2001 From: Sacha Verweij Date: Sun, 27 Nov 2016 20:51:13 -0800 Subject: [PATCH] Implement map/map! for sparse matrices and reimplement broadcast/broadcast! over a single sparse matrix in terms of map/map!. --- base/sparse/sparse.jl | 4 +- base/sparse/sparsematrix.jl | 324 +++++++++++++++++++++++++++++++----- test/sparse/sparse.jl | 64 +++++++ 3 files changed, 349 insertions(+), 43 deletions(-) diff --git a/base/sparse/sparse.jl b/base/sparse/sparse.jl index 8cd969f4c90952..25bc8906a947fd 100644 --- a/base/sparse/sparse.jl +++ b/base/sparse/sparse.jl @@ -2,7 +2,7 @@ module SparseArrays -using Base: ReshapedArray, promote_op, setindex_shape_check, to_shape +using Base: ReshapedArray, promote_op, setindex_shape_check, to_shape, tail using Base.Sort: Forward using Base.LinAlg: AbstractTriangular, PosDefException @@ -24,7 +24,7 @@ import Base: @get!, acos, acosd, acot, acotd, acsch, asech, asin, asind, asinh, vcat, hcat, hvcat, cat, imag, indmax, ishermitian, kron, length, log, log1p, max, min, maximum, minimum, norm, one, promote_eltype, real, reinterpret, reshape, rot180, rotl90, rotr90, round, scale!, setindex!, similar, size, transpose, tril, - triu, vec, permute! + triu, vec, permute!, map, map! import Base.Broadcast: broadcast_indices diff --git a/base/sparse/sparsematrix.jl b/base/sparse/sparsematrix.jl index efb9ade2209d3a..2afdb57711291f 100644 --- a/base/sparse/sparsematrix.jl +++ b/base/sparse/sparsematrix.jl @@ -1398,58 +1398,300 @@ end sparse(S::UniformScaling, m::Integer, n::Integer=m) = speye_scaled(S.λ, m, n) - -## Broadcast operations involving a single sparse matrix and possibly broadcast scalars - -function broadcast{Tf}(f::Tf, A::SparseMatrixCSC) - fofzero = f(zero(eltype(A))) - fpreszero = fofzero == zero(fofzero) - return fpreszero ? _broadcast_zeropres(f, A) : _broadcast_notzeropres(f, fofzero, A) -end -"Returns a `SparseMatrixCSC` storing only the nonzero entries of `broadcast(f, Matrix(A))`." -function _broadcast_zeropres{Tf}(f::Tf, A::SparseMatrixCSC) - Bcolptr = similar(A.colptr, A.n + 1) - Browval = similar(A.rowval, nnz(A)) - Bnzval = similar(A.nzval, Base.Broadcast.promote_eltype_op(f, A), nnz(A)) - Bk = 1 - @inbounds for j in 1:A.n - Bcolptr[j] = Bk +## map/map! over sparse matrices + +# map/map! entry points +function map!{Tf,N}(f::Tf, C::SparseMatrixCSC, A::SparseMatrixCSC, Bs::Vararg{SparseMatrixCSC,N}) + _checksameshape(C, A, Bs...) + fofzeros = f(_zeros_eltypes(A, Bs...)...) + fpreszeros = fofzeros == zero(fofzeros) + return fpreszeros ? _map_zeropres!(f, C, A, Bs...) : + _map_notzeropres!(f, fofzeros, C, A, Bs...) +end +function map{Tf,N}(f::Tf, A::SparseMatrixCSC, Bs::Vararg{SparseMatrixCSC,N}) + _checksameshape(A, Bs...) + fofzeros = f(_zeros_eltypes(A, Bs...)...) + fpreszeros = fofzeros == zero(fofzeros) + maxnnzC = fpreszeros ? _sumnnzs(A, Bs...) : length(A) + entrytypeC = Base.Broadcast.promote_eltype_op(f, A, Bs...) + indextypeC = _promote_indtype(A, Bs...) + Ccolptr = Vector{indextypeC}(A.n + 1) + Crowval = Vector{indextypeC}(maxnnzC) + Cnzval = Vector{entrytypeC}(maxnnzC) + C = SparseMatrixCSC(A.m, A.n, Ccolptr, Crowval, Cnzval) + return fpreszeros ? _map_zeropres!(f, C, A, Bs...) : + _map_notzeropres!(f, fofzeros, C, A, Bs...) +end +# map/map! entry point helper functions +@inline _sumnnzs(A) = nnz(A) +@inline _sumnnzs(A, Bs...) = nnz(A) + _sumnnzs(Bs...) +@inline _zeros_eltypes(A) = (zero(eltype(A)),) +@inline _zeros_eltypes(A, Bs...) = (zero(eltype(A)), _zeros_eltypes(Bs...)...) 
+@inline _promote_indtype(A) = eltype(A.rowval) +@inline _promote_indtype(A, Bs...) = promote_type(eltype(A.rowval), _promote_indtype(Bs...)) +@inline _aresameshape(A) = true +@inline _aresameshape(A, B) = size(A) == size(B) +@inline _aresameshape(A, B, Cs...) = _aresameshape(A, B) ? _aresameshape(B, Cs...) : false +@inline _checksameshape(As...) = _aresameshape(As...) || throw(DimensionMismatch("argument shapes must match")) + +# _map_zeropres!/_map_notzeropres! specialized for a single sparse matrix +"Stores only the nonzero entries of `map(f, Matrix(A))` in `C`." +function _map_zeropres!{Tf}(f::Tf, C::SparseMatrixCSC, A::SparseMatrixCSC) + spaceC::Int = min(length(C.rowval), length(C.nzval)) + Ck = 1 + @inbounds for j in 1:C.n + C.colptr[j] = Ck for Ak in nzrange(A, j) - x = f(A.nzval[Ak]) - if x != 0 - Browval[Bk] = A.rowval[Ak] - Bnzval[Bk] = x - Bk += 1 + Cx = f(A.nzval[Ak]) + if Cx != zero(eltype(C)) + if Ck > spaceC + spaceC = maxnnzC = Ck + nnz(A) - (Ak - 1) + length(C.rowval) < maxnnzC && resize!(C.rowval, maxnnzC) + length(C.nzval) < maxnnzC && resize!(C.nzval, maxnnzC) + end + C.rowval[Ck] = A.rowval[Ak] + C.nzval[Ck] = Cx + Ck += 1 end end end - Bcolptr[A.n + 1] = Bk - resize!(Browval, Bk - 1) - resize!(Bnzval, Bk - 1) - return SparseMatrixCSC(A.m, A.n, Bcolptr, Browval, Bnzval) + C.colptr[C.n + 1] = Ck + return C end """ -Returns a (dense) `SparseMatrixCSC` with `fillvalue` stored in place of each unstored -entry in `A` and `f(A[i,j])` stored in place of each stored entry `A[i,j]` in `A`. +Densifies `C`, storing `fillvalue` in place of each unstored entry in `A` and +`f(A[i,j])` in place of each stored entry `A[i,j]` in `A`. """ -function _broadcast_notzeropres{Tf}(f::Tf, fillvalue, A::SparseMatrixCSC) - nnzB = A.m * A.n - # Build structure - Bcolptr = similar(A.colptr, A.n + 1) - copy!(Bcolptr, 1:A.m:(nnzB + 1)) - Browval = similar(A.rowval, nnzB) - for k in 1:A.m:(nnzB - A.m + 1) - copy!(Browval, k, 1:A.m) +function _map_notzeropres!{Tf}(f::Tf, fillvalue, C::SparseMatrixCSC, A::SparseMatrixCSC) + nnzC = C.m * C.n + # Expand C's storage if necessary + length(C.rowval) < nnzC && resize!(C.rowval, nnzC) + length(C.nzval) < nnzC && resize!(C.nzval, nnzC) + # Build C's structure + copy!(C.colptr, 1:C.m:(nnzC + 1)) + for k in 1:C.m:(nnzC - C.m + 1) + copy!(C.rowval, k, 1:C.m) + end + # Populate values + fill!(C.nzval, fillvalue) + @inbounds for (j, jo) in zip(1:C.n, 0:C.m:(nnzC - 1)), Ak in nzrange(A, j) + Cx = f(A.nzval[Ak]) + Cx != fillvalue && (C.nzval[jo + A.rowval[Ak]] = Cx) + end + # NOTE: Combining the fill! above into the loop above to avoid multiple sweeps over / + # nonsequential access of C.nzval does not appear to improve performance. + return C +end + +# _map_zeropres!/_map_notzeropres! specialized for a pair of sparse matrices +function _map_zeropres!{Tf}(f::Tf, C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC) + spaceC::Int = min(length(C.rowval), length(C.nzval)) + rowsentinelA = convert(eltype(A.rowval), C.m + 1) + rowsentinelB = convert(eltype(B.rowval), C.m + 1) + Ck = 1 + @inbounds for j in 1:C.n + C.colptr[j] = Ck + Ak, stopAk = A.colptr[j], A.colptr[j + 1] + Bk, stopBk = B.colptr[j], B.colptr[j + 1] + Ai = Ak < stopAk ? A.rowval[Ak] : rowsentinelA + Bi = Bk < stopBk ? B.rowval[Bk] : rowsentinelB + while true + if Ai == Bi + Ai == rowsentinelA && break # column complete + Cx, Ci::eltype(C.rowval) = f(A.nzval[Ak], B.nzval[Bk]), Ai + Ak += one(Ak); Ai = Ak < stopAk ? A.rowval[Ak] : rowsentinelA + Bk += one(Bk); Bi = Bk < stopBk ? 
B.rowval[Bk] : rowsentinelB + elseif Ai < Bi + Cx, Ci = f(A.nzval[Ak], zero(eltype(B))), Ai + Ak += one(Ak); Ai = Ak < stopAk ? A.rowval[Ak] : rowsentinelA + else # Bi < Ai + Cx, Ci = f(zero(eltype(A)), B.nzval[Bk]), Bi + Bk += one(Bk); Bi = Bk < stopBk ? B.rowval[Bk] : rowsentinelB + end + # NOTE: The ordering of the conditional chain above impacts which matrices this + # method performs best for. The above provides good performance all around. + if Cx != zero(eltype(C)) + if Ck > spaceC + spaceC = maxnnzC = Ck + (nnz(A) - (Ak - 1)) + (nnz(B) - (Bk - 1)) + length(C.rowval) < maxnnzC && resize!(C.rowval, maxnnzC) + length(C.nzval) < maxnnzC && resize!(C.nzval, maxnnzC) + end + C.rowval[Ck] = Ci + C.nzval[Ck] = Cx + Ck += 1 + end + end + end + C.colptr[C.n + 1] = Ck + return C +end +function _map_notzeropres!{Tf}(f::Tf, fillvalue, C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC) + nnzC = C.m * C.n + # Expand C's storage if necessary + length(C.rowval) < nnzC && resize!(C.rowval, nnzC) + length(C.nzval) < nnzC && resize!(C.nzval, nnzC) + # Build C's structure + copy!(C.colptr, 1:C.m:(nnzC + 1)) + for k in 1:C.m:(nnzC - C.m + 1) + copy!(C.rowval, k, 1:C.m) + end + # Populate values + fill!(C.nzval, fillvalue) + # NOTE: Combining this fill! into the loop below to avoid multiple sweeps over / + # nonsequential access of C.nzval does not appear to improve performance. + rowsentinelA = convert(eltype(A.rowval), C.m + 1) + rowsentinelB = convert(eltype(B.rowval), C.m + 1) + @inbounds for (j, jo) in zip(1:C.n, 0:C.m:(nnzC - 1)) + Ak, stopAk = A.colptr[j], A.colptr[j + 1] + Bk, stopBk = B.colptr[j], B.colptr[j + 1] + Ai = Ak < stopAk ? A.rowval[Ak] : rowsentinelA + Bi = Bk < stopBk ? B.rowval[Bk] : rowsentinelB + while true + if Ai == Bi + Ai == rowsentinelA && break # column complete + Cx, Ci::eltype(C.rowval) = f(A.nzval[Ak], B.nzval[Bk]), Ai + Ak += one(Ak); Ai = Ak < stopAk ? A.rowval[Ak] : rowsentinelA + Bk += one(Bk); Bi = Bk < stopBk ? B.rowval[Bk] : rowsentinelB + elseif Ai < Bi + Cx, Ci = f(A.nzval[Ak], zero(eltype(B))), Ai + Ak += one(Ak); Ai = Ak < stopAk ? A.rowval[Ak] : rowsentinelA + else # Bi < Ai + Cx, Ci = f(zero(eltype(A)), B.nzval[Bk]), Bi + Bk += one(Bk); Bi = Bk < stopBk ? B.rowval[Bk] : rowsentinelB + end + Cx != fillvalue && (C.nzval[jo + Ci] = Cx) + end + end + return C +end + +# _map_zeropres!/_map_notzeropres! for more than two sparse matrices +function _map_zeropres!{Tf,N}(f::Tf, C::SparseMatrixCSC, As::Vararg{SparseMatrixCSC,N}) + spaceC::Int = min(length(C.rowval), length(C.nzval)) + rowsentinel = C.m + 1 + Ck = 1 + stopks = _indforcol_all(1, As) + @inbounds for j in 1:C.n + C.colptr[j] = Ck + ks = stopks + stopks = _indforcol_all(j + 1, As) + rows = _rowforind_all(rowsentinel, ks, stopks, As) + activerow = min(rows...) + while activerow < rowsentinel + # activerows = _isactiverow_all(activerow, rows) + # Cx = f(_gatherargs(activerows, ks, As)...) + # ks = _updateind_all(activerows, ks) + # rows = _updaterow_all(rowsentinel, activerows, rows, ks, stopks, As) + vals, ks, rows = _fusedupdate_all(rowsentinel, activerow, rows, ks, stopks, As) + Cx = f(vals...) + if Cx != zero(eltype(C)) + if Ck > spaceC + spaceC = maxnnzC = Ck + _sumnnzs(As...) - (sum(ks) - N) + length(C.rowval) < maxnnzC && resize!(C.rowval, maxnnzC) + length(C.nzval) < maxnnzC && resize!(C.nzval, maxnnzC) + end + C.rowval[Ck] = activerow + C.nzval[Ck] = Cx + Ck += 1 + end + activerow = min(rows...) 
+ end + end + C.colptr[C.n + 1] = Ck + return C +end +function _map_notzeropres!{Tf,N}(f::Tf, fillvalue, C::SparseMatrixCSC, As::Vararg{SparseMatrixCSC,N}) + nnzC = C.m * C.n + # Expand C's storage if necessary + length(C.rowval) < nnzC && resize!(C.rowval, nnzC) + length(C.nzval) < nnzC && resize!(C.nzval, nnzC) + # Build C's structure + copy!(C.colptr, 1:C.m:(nnzC + 1)) + for k in 1:C.m:(nnzC - C.m + 1) + copy!(C.rowval, k, 1:C.m) end # Populate values - Bnzval = fill(fillvalue, nnzB) - @inbounds for (j, jo) in zip(1:A.n, 0:A.m:(nnzB - 1)), k in nzrange(A, j) - Bnzval[jo + A.rowval[k]] = f(A.nzval[k]) + fill!(C.nzval, fillvalue) + # NOTE: Combining this fill! into the loop below to avoid multiple sweeps over / + # nonsequential access of C.nzval does not appear to improve performance. + rowsentinel = C.m + 1 + stopks = _indforcol_all(1, As) + @inbounds for (j, jo) in zip(1:C.n, 0:C.m:(nnzC - 1)) + ks = stopks + stopks = _indforcol_all(j + 1, As) + rows = _rowforind_all(rowsentinel, ks, stopks, As) + activerow = min(rows...) + while activerow < rowsentinel + # activerows = _isactiverow_all(activerow, rows) + # Cx = f(_gatherargs(activerows, ks, As)...) + # ks = _updateind_all(activerows, ks) + # rows = _updaterow_all(rowsentinel, activerows, rows, ks, stopks, As) + vals, ks, rows = _fusedupdate_all(rowsentinel, activerow, rows, ks, stopks, As) + Cx = f(vals...) + Cx != fillvalue && (C.nzval[jo + activerow] = Cx) + activerow = min(rows...) + end + end + return C +end +# helper methods +@inline _indforcol(j, A) = A.colptr[j] +@inline _indforcol_all(j, ::Tuple{}) = () +@inline _indforcol_all(j, As) = ( + _indforcol(j, first(As)), + _indforcol_all(j, tail(As))...) +@inline _rowforind(rowsentinel, k, stopk, A) = + k < stopk ? A.rowval[k] : convert(eltype(A.rowval), rowsentinel) +@inline _rowforind_all(rowsentinel, ::Tuple{}, ::Tuple{}, ::Tuple{}) = () +@inline _rowforind_all(rowsentinel, ks, stopks, As) = ( + _rowforind(rowsentinel, first(ks), first(stopks), first(As)), + _rowforind_all(rowsentinel, tail(ks), tail(stopks), tail(As))...) +# fusing the following defs. avoids a few branches, yielding 5-30% runtime reduction +# @inline _isactiverow(activerow, row) = row == activerow +# @inline _isactiverow_all(activerow, ::Tuple{}) = () +# @inline _isactiverow_all(activerow, rows) = ( +# _isactiverow(activerow, first(rows)), +# _isactiverow_all(activerow, tail(rows))...) +# @inline _gatherarg(isactiverow, k, A) = isactiverow ? A.nzval[k] : zero(eltype(A)) +# @inline _gatherargs(::Tuple{}, ::Tuple{}, ::Tuple{}) = () +# @inline _gatherargs(activerows, ks, As) = ( +# _gatherarg(first(activerows), first(ks), first(As)), +# _gatherargs(tail(activerows), tail(ks), tail(As))...) +# @inline _updateind(isactiverow, k) = isactiverow ? (k + one(k)) : k +# @inline _updateind_all(::Tuple{}, ::Tuple{}) = () +# @inline _updateind_all(activerows, ks) = ( +# _updateind(first(activerows), first(ks)), +# _updateind_all(tail(activerows), tail(ks))...) +# @inline _updaterow(rowsentinel, isrowactive, presrow, k, stopk, A) = +# isrowactive ? (k < stopk ? A.rowval[k] : oftype(presrow, rowsentinel)) : presrow +# @inline _updaterow_all(rowsentinel, ::Tuple{}, ::Tuple{}, ::Tuple{}, ::Tuple{}, ::Tuple{}) = () +# @inline _updaterow_all(rowsentinel, activerows, rows, ks, stopks, As) = ( +# _updaterow(rowsentinel, first(activerows), first(rows), first(ks), first(stopks), first(As)), +# _updaterow_all(rowsentinel, tail(activerows), tail(rows), tail(ks), tail(stopks), tail(As))...) 
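+# The _fusedupdate/_fusedupdate_all methods below fuse the four commented-out helpers
+# above into a single recursive pass over the argument tuples: for each input matrix they
+# produce the value to pass to f (the stored entry if that matrix's current row is the
+# active row, zero otherwise) together with that matrix's updated index and row,
+# accumulating the results in tuples.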
+@inline function _fusedupdate(rowsentinel, activerow, row, k, stopk, A) + # returns (val, nextk, nextrow) + if row == activerow + nextk = k + one(k) + (A.nzval[k], nextk, (nextk < stopk ? A.rowval[nextk] : oftype(row, rowsentinel))) + else + (zero(eltype(A)), k, row) end - # NOTE: Combining the fill call into the loop above to avoid multiple sweeps over / - # nonsequential access of Bnzval does not appear to improve performance - return SparseMatrixCSC(A.m, A.n, Bcolptr, Browval, Bnzval) end +@inline _fusedupdate_all(rowsentinel, activerow, rows, ks, stopks, As) = + _fusedupdate_all((#=vals=#), (#=nextks=#), (#=nextrows=#), rowsentinel, activerow, rows, ks, stopks, As) +@inline _fusedupdate_all(vals, nextks, nextrows, rowsent, activerow, ::Tuple{}, ::Tuple{}, ::Tuple{}, ::Tuple{}) = + (vals, nextks, nextrows) +@inline function _fusedupdate_all(vals, nextks, nextrows, rowsentinel, activerow, rows, ks, stopks, As) + val, nextk, nextrow = _fusedupdate(rowsentinel, activerow, first(rows), first(ks), first(stopks), first(As)) + return _fusedupdate_all((vals..., val), (nextks..., nextk), (nextrows..., nextrow), + rowsentinel, activerow, tail(rows), tail(ks), tail(stopks), tail(As)) +end + + +## Broadcast operations involving a single sparse matrix and possibly broadcast scalars + +broadcast{Tf}(f::Tf, A::SparseMatrixCSC) = map(f, A) +broadcast!{Tf}(f::Tf, C::SparseMatrixCSC, A::SparseMatrixCSC) = map!(f, C, A) # Cover common broadcast operations involving a single sparse matrix and one or more # broadcast scalars. diff --git a/test/sparse/sparse.jl b/test/sparse/sparse.jl index 61596370f8bcfc..4c92969d6b550d 100644 --- a/test/sparse/sparse.jl +++ b/test/sparse/sparse.jl @@ -1659,3 +1659,67 @@ let X = sparse([1 -1; -1 1]) @test Y / 1 == Y end end + +# test map/map! over sparse matrices +let + N, M = 10, 12 + # test map/map! implementation specialized for a single (input) sparse matrix + # (also tested through broadcast/broadcast! over a single (input) sparse matrix) + # --> test map entry point + A = sprand(N, M, 0.4) + fA = Array(A) + @test map(sin, A) == sparse(map(sin, fA)) + @test map(cos, A) == sparse(map(cos, fA)) + # --> test map! entry point + fX = copy(fA); X = sparse(fX) + map!(sin, X, A); X = sparse(fX) # warmup for @allocated + @test (@allocated map!(sin, X, A)) == 0 + @test map!(sin, X, A) == sparse(map!(sin, fX, fA)) + @test map!(cos, X, A) == sparse(map!(cos, fX, fA)) + @test_throws DimensionMismatch map!(sin, X, spzeros(N, M - 1)) + # test map/map! implementation specialized for a pair of (input) sparse matrices + f(x, y) = x + y + 1 + A = sprand(N, M, 0.3) + B = convert(SparseMatrixCSC{Float32,Int32}, sprand(N, M, 0.3)) + # use different types to check internal type stability via allocation tests below + fA, fB = map(Array, (A, B)) + # --> test map entry point + @test map(+, A, B) == sparse(map(+, fA, fB)) + @test map(*, A, B) == sparse(map(*, fA, fB)) + @test map(f, A, B) == sparse(map(f, fA, fB)) + @test_throws DimensionMismatch map(+, A, spzeros(N, M - 1)) + # --> test map! 
entry point + fX = fA .+ fB; X = sparse(fX) + map!(+, X, A, B); X = sparse(fX) # warmup for @allocated + @test (@allocated map!(+, X, A, B)) == 0 + @test map!(+, X, A, B) == sparse(map!(+, fX, fA, fB)) + fX = fA .* fB; X = sparse(fX) + map!(*, X, A, B); X = sparse(fX) # warmup for @allocated + @test (@allocated map!(*, X, A, B)) == 0 + @test map!(*, X, A, B) == sparse(map!(*, fX, fA, fB)) + @test map!(f, X, A, B) == sparse(map!(f, fX, fA, fB)) + @test_throws DimensionMismatch map!(f, X, A, spzeros(N, M - 1)) + # test map/map! implementation for an arbitrary number of (input) sparse matrices + f(x, y, z) = x + y + z + 1 + A = sprand(N, M, 0.2) + B = sprand(N, M, 0.2) + C = convert(SparseMatrixCSC{Float32,Int32}, sprand(N, M, 0.2)) + # use different types to check internal type stability via allocation tests below + fA, fB, fC = map(Array, (A, B, C)) + # --> test map entry point + @test map(+, A, B, C) == sparse(map(+, fA, fB, fC)) + @test map(*, A, B, C) == sparse(map(*, fA, fB, fC)) + @test map(f, A, B, C) == sparse(map(f, fA, fB, fC)) + @test_throws DimensionMismatch map(+, A, B, spzeros(N, M - 1)) + # --> test map! entry point + fX = fA .+ fB .+ fC; X = sparse(fX) + map!(+, X, A, B, C); X = sparse(fX) # warmup for @allocated + @test (@allocated map!(+, X, A, B, C)) == 0 + @test map!(+, X, A, B, C) == sparse(map!(+, fX, fA, fB, fC)) + fX = fA .* fB .* fC; X = sparse(fX) + map!(*, X, A, B, C); X = sparse(fX) # warmup for @allocated + @test (@allocated map!(*, X, A, B, C)) == 0 + @test map!(*, X, A, B, C) == sparse(map!(*, fX, fA, fB, fC)) + @test map!(f, X, A, B, C) == sparse(map!(f, fX, fA, fB, fC)) + @test_throws DimensionMismatch map!(f, X, A, B, spzeros(N, M - 1)) +end
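
A brief usage sketch of the semantics this patch introduces (illustrative values only, not part of the patch): when `f` maps zero to zero, `map` preserves the inputs' sparsity and allocates at most the sum of their stored entries; otherwise the result is stored densely, with `f` of all-zero arguments filling the unstored positions. `map!` writes into `C`, expanding `C`'s `rowval`/`nzval` buffers as needed, and throws a `DimensionMismatch` when shapes differ. `broadcast`/`broadcast!` over a single sparse matrix now simply forward to `map`/`map!`.

    julia> A = sparse([1, 2], [1, 3], [1.0, 2.0], 2, 3);  # 2x3 with two stored entries

    julia> nnz(map(x -> 2x, A))       # f(0) == 0, so sparsity is preserved
    2

    julia> nnz(map(x -> x + 1, A))    # f(0) != 0, so the result is stored densely
    6

    julia> B = sparse([2], [3], [3.0], 2, 3);

    julia> map(+, A, B) == sparse(map(+, Array(A), Array(B)))
    true

    julia> C = similar(A); map!(+, C, A, B) == map(+, A, B)   # in place, reusing C's storage
    true

    julia> broadcast(sin, A) == map(sin, A)   # single-matrix broadcast now routes through map
    true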