From 5ba1c33b49175f9963305f088cd44645f829fe28 Mon Sep 17 00:00:00 2001 From: Sacha Verweij Date: Sun, 27 Nov 2016 20:51:13 -0800 Subject: [PATCH] Implement map/map! for sparse matrices and reimplement broadcast/broadcast! over a single sparse matrix in terms of map/map!. --- base/sparse/sparse.jl | 4 +- base/sparse/sparsematrix.jl | 324 +++++++++++++++++++++++++++++++----- test/sparse/sparse.jl | 64 +++++++ 3 files changed, 349 insertions(+), 43 deletions(-) diff --git a/base/sparse/sparse.jl b/base/sparse/sparse.jl index 8cd969f4c90952..25bc8906a947fd 100644 --- a/base/sparse/sparse.jl +++ b/base/sparse/sparse.jl @@ -2,7 +2,7 @@ module SparseArrays -using Base: ReshapedArray, promote_op, setindex_shape_check, to_shape +using Base: ReshapedArray, promote_op, setindex_shape_check, to_shape, tail using Base.Sort: Forward using Base.LinAlg: AbstractTriangular, PosDefException @@ -24,7 +24,7 @@ import Base: @get!, acos, acosd, acot, acotd, acsch, asech, asin, asind, asinh, vcat, hcat, hvcat, cat, imag, indmax, ishermitian, kron, length, log, log1p, max, min, maximum, minimum, norm, one, promote_eltype, real, reinterpret, reshape, rot180, rotl90, rotr90, round, scale!, setindex!, similar, size, transpose, tril, - triu, vec, permute! + triu, vec, permute!, map, map! import Base.Broadcast: broadcast_indices diff --git a/base/sparse/sparsematrix.jl b/base/sparse/sparsematrix.jl index efb9ade2209d3a..2afdb57711291f 100644 --- a/base/sparse/sparsematrix.jl +++ b/base/sparse/sparsematrix.jl @@ -1398,58 +1398,300 @@ end sparse(S::UniformScaling, m::Integer, n::Integer=m) = speye_scaled(S.λ, m, n) - -## Broadcast operations involving a single sparse matrix and possibly broadcast scalars - -function broadcast{Tf}(f::Tf, A::SparseMatrixCSC) - fofzero = f(zero(eltype(A))) - fpreszero = fofzero == zero(fofzero) - return fpreszero ? _broadcast_zeropres(f, A) : _broadcast_notzeropres(f, fofzero, A) -end -"Returns a `SparseMatrixCSC` storing only the nonzero entries of `broadcast(f, Matrix(A))`." -function _broadcast_zeropres{Tf}(f::Tf, A::SparseMatrixCSC) - Bcolptr = similar(A.colptr, A.n + 1) - Browval = similar(A.rowval, nnz(A)) - Bnzval = similar(A.nzval, Base.Broadcast.promote_eltype_op(f, A), nnz(A)) - Bk = 1 - @inbounds for j in 1:A.n - Bcolptr[j] = Bk +## map/map! over sparse matrices + +# map/map! entry points +function map!{Tf,N}(f::Tf, C::SparseMatrixCSC, A::SparseMatrixCSC, Bs::Vararg{SparseMatrixCSC,N}) + _checksameshape(C, A, Bs...) + fofzeros = f(_zeros_eltypes(A, Bs...)...) + fpreszeros = fofzeros == zero(fofzeros) + return fpreszeros ? _map_zeropres!(f, C, A, Bs...) : + _map_notzeropres!(f, fofzeros, C, A, Bs...) +end +function map{Tf,N}(f::Tf, A::SparseMatrixCSC, Bs::Vararg{SparseMatrixCSC,N}) + _checksameshape(A, Bs...) + fofzeros = f(_zeros_eltypes(A, Bs...)...) + fpreszeros = fofzeros == zero(fofzeros) + maxnnzC = fpreszeros ? _sumnnzs(A, Bs...) : length(A) + entrytypeC = Base.Broadcast.promote_eltype_op(f, A, Bs...) + indextypeC = _promote_indtype(A, Bs...) + Ccolptr = Vector{indextypeC}(A.n + 1) + Crowval = Vector{indextypeC}(maxnnzC) + Cnzval = Vector{entrytypeC}(maxnnzC) + C = SparseMatrixCSC(A.m, A.n, Ccolptr, Crowval, Cnzval) + return fpreszeros ? _map_zeropres!(f, C, A, Bs...) : + _map_notzeropres!(f, fofzeros, C, A, Bs...) +end +# map/map! entry point helper functions +@inline _sumnnzs(A) = nnz(A) +@inline _sumnnzs(A, Bs...) = nnz(A) + _sumnnzs(Bs...) +@inline _zeros_eltypes(A) = (zero(eltype(A)),) +@inline _zeros_eltypes(A, Bs...) = (zero(eltype(A)), _zeros_eltypes(Bs...)...) 
+@inline _promote_indtype(A) = eltype(A.rowval) +@inline _promote_indtype(A, Bs...) = promote_type(eltype(A.rowval), _promote_indtype(Bs...)) +@inline _aresameshape(A) = true +@inline _aresameshape(A, B) = size(A) == size(B) +@inline _aresameshape(A, B, Cs...) = _aresameshape(A, B) ? _aresameshape(B, Cs...) : false +@inline _checksameshape(As...) = _aresameshape(As...) || throw(DimensionMismatch("argument shapes must match")) + +# _map_zeropres!/_map_notzeropres! specialized for a single sparse matrix +"Stores only the nonzero entries of `map(f, Matrix(A))` in `C`." +function _map_zeropres!{Tf}(f::Tf, C::SparseMatrixCSC, A::SparseMatrixCSC) + spaceC::Int = min(length(C.rowval), length(C.nzval)) + Ck = 1 + @inbounds for j in 1:C.n + C.colptr[j] = Ck for Ak in nzrange(A, j) - x = f(A.nzval[Ak]) - if x != 0 - Browval[Bk] = A.rowval[Ak] - Bnzval[Bk] = x - Bk += 1 + Cx = f(A.nzval[Ak]) + if Cx != zero(eltype(C)) + if Ck > spaceC + spaceC = maxnnzC = Ck + nnz(A) - (Ak - 1) + length(C.rowval) < maxnnzC && resize!(C.rowval, maxnnzC) + length(C.nzval) < maxnnzC && resize!(C.nzval, maxnnzC) + end + C.rowval[Ck] = A.rowval[Ak] + C.nzval[Ck] = Cx + Ck += 1 end end end - Bcolptr[A.n + 1] = Bk - resize!(Browval, Bk - 1) - resize!(Bnzval, Bk - 1) - return SparseMatrixCSC(A.m, A.n, Bcolptr, Browval, Bnzval) + C.colptr[C.n + 1] = Ck + return C end """ -Returns a (dense) `SparseMatrixCSC` with `fillvalue` stored in place of each unstored -entry in `A` and `f(A[i,j])` stored in place of each stored entry `A[i,j]` in `A`. +Densifies `C`, storing `fillvalue` in place of each unstored entry in `A` and +`f(A[i,j])` in place of each stored entry `A[i,j]` in `A`. """ -function _broadcast_notzeropres{Tf}(f::Tf, fillvalue, A::SparseMatrixCSC) - nnzB = A.m * A.n - # Build structure - Bcolptr = similar(A.colptr, A.n + 1) - copy!(Bcolptr, 1:A.m:(nnzB + 1)) - Browval = similar(A.rowval, nnzB) - for k in 1:A.m:(nnzB - A.m + 1) - copy!(Browval, k, 1:A.m) +function _map_notzeropres!{Tf}(f::Tf, fillvalue, C::SparseMatrixCSC, A::SparseMatrixCSC) + nnzC = C.m * C.n + # Expand C's storage if necessary + length(C.rowval) < nnzC && resize!(C.rowval, nnzC) + length(C.nzval) < nnzC && resize!(C.nzval, nnzC) + # Build C's structure + copy!(C.colptr, 1:C.m:(nnzC + 1)) + for k in 1:C.m:(nnzC - C.m + 1) + copy!(C.rowval, k, 1:C.m) + end + # Populate values + fill!(C.nzval, fillvalue) + @inbounds for (j, jo) in zip(1:C.n, 0:C.m:(nnzC - 1)), Ak in nzrange(A, j) + Cx = f(A.nzval[Ak]) + Cx != fillvalue && (C.nzval[jo + A.rowval[Ak]] = Cx) + end + # NOTE: Combining the fill! above into the loop above to avoid multiple sweeps over / + # nonsequential access of C.nzval does not appear to improve performance. + return C +end + +# _map_zeropres!/_map_notzeropres! specialized for a pair of sparse matrices +function _map_zeropres!{Tf}(f::Tf, C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC) + spaceC::Int = min(length(C.rowval), length(C.nzval)) + rowsentinelA = convert(eltype(A.rowval), C.m + 1) + rowsentinelB = convert(eltype(B.rowval), C.m + 1) + Ck = 1 + @inbounds for j in 1:C.n + C.colptr[j] = Ck + Ak, stopAk = A.colptr[j], A.colptr[j + 1] + Bk, stopBk = B.colptr[j], B.colptr[j + 1] + Ai = Ak < stopAk ? A.rowval[Ak] : rowsentinelA + Bi = Bk < stopBk ? B.rowval[Bk] : rowsentinelB + while true + if Ai == Bi + Ai == rowsentinelA && break # column complete + Cx, Ci::eltype(C.rowval) = f(A.nzval[Ak], B.nzval[Bk]), Ai + Ak += one(Ak); Ai = Ak < stopAk ? A.rowval[Ak] : rowsentinelA + Bk += one(Bk); Bi = Bk < stopBk ? 
B.rowval[Bk] : rowsentinelB + elseif Ai < Bi + Cx, Ci = f(A.nzval[Ak], zero(eltype(B))), Ai + Ak += one(Ak); Ai = Ak < stopAk ? A.rowval[Ak] : rowsentinelA + else # Bi < Ai + Cx, Ci = f(zero(eltype(A)), B.nzval[Bk]), Bi + Bk += one(Bk); Bi = Bk < stopBk ? B.rowval[Bk] : rowsentinelB + end + # NOTE: The ordering of the conditional chain above impacts which matrices this + # method performs best for. The above provides good performance all around. + if Cx != zero(eltype(C)) + if Ck > spaceC + spaceC = maxnnzC = Ck + (nnz(A) - (Ak - 1)) + (nnz(B) - (Bk - 1)) + length(C.rowval) < maxnnzC && resize!(C.rowval, maxnnzC) + length(C.nzval) < maxnnzC && resize!(C.nzval, maxnnzC) + end + C.rowval[Ck] = Ci + C.nzval[Ck] = Cx + Ck += 1 + end + end + end + C.colptr[C.n + 1] = Ck + return C +end +function _map_notzeropres!{Tf}(f::Tf, fillvalue, C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC) + nnzC = C.m * C.n + # Expand C's storage if necessary + length(C.rowval) < nnzC && resize!(C.rowval, nnzC) + length(C.nzval) < nnzC && resize!(C.nzval, nnzC) + # Build C's structure + copy!(C.colptr, 1:C.m:(nnzC + 1)) + for k in 1:C.m:(nnzC - C.m + 1) + copy!(C.rowval, k, 1:C.m) + end + # Populate values + fill!(C.nzval, fillvalue) + # NOTE: Combining this fill! into the loop below to avoid multiple sweeps over / + # nonsequential access of C.nzval does not appear to improve performance. + rowsentinelA = convert(eltype(A.rowval), C.m + 1) + rowsentinelB = convert(eltype(B.rowval), C.m + 1) + @inbounds for (j, jo) in zip(1:C.n, 0:C.m:(nnzC - 1)) + Ak, stopAk = A.colptr[j], A.colptr[j + 1] + Bk, stopBk = B.colptr[j], B.colptr[j + 1] + Ai = Ak < stopAk ? A.rowval[Ak] : rowsentinelA + Bi = Bk < stopBk ? B.rowval[Bk] : rowsentinelB + while true + if Ai == Bi + Ai == rowsentinelA && break # column complete + Cx, Ci::eltype(C.rowval) = f(A.nzval[Ak], B.nzval[Bk]), Ai + Ak += one(Ak); Ai = Ak < stopAk ? A.rowval[Ak] : rowsentinelA + Bk += one(Bk); Bi = Bk < stopBk ? B.rowval[Bk] : rowsentinelB + elseif Ai < Bi + Cx, Ci = f(A.nzval[Ak], zero(eltype(B))), Ai + Ak += one(Ak); Ai = Ak < stopAk ? A.rowval[Ak] : rowsentinelA + else # Bi < Ai + Cx, Ci = f(zero(eltype(A)), B.nzval[Bk]), Bi + Bk += one(Bk); Bi = Bk < stopBk ? B.rowval[Bk] : rowsentinelB + end + Cx != fillvalue && (C.nzval[jo + Ci] = Cx) + end + end + return C +end + +# _map_zeropres!/_map_notzeropres! for more than two sparse matrices +function _map_zeropres!{Tf,N}(f::Tf, C::SparseMatrixCSC, As::Vararg{SparseMatrixCSC,N}) + spaceC::Int = min(length(C.rowval), length(C.nzval)) + rowsentinel = C.m + 1 + Ck = 1 + stopks = _indforcol_all(1, As) + @inbounds for j in 1:C.n + C.colptr[j] = Ck + ks = stopks + stopks = _indforcol_all(j + 1, As) + rows = _rowforind_all(rowsentinel, ks, stopks, As) + activerow = min(rows...) + while activerow < rowsentinel + # activerows = _isactiverow_all(activerow, rows) + # Cx = f(_gatherargs(activerows, ks, As)...) + # ks = _updateind_all(activerows, ks) + # rows = _updaterow_all(rowsentinel, activerows, rows, ks, stopks, As) + vals, ks, rows = _fusedupdate_all(rowsentinel, activerow, rows, ks, stopks, As) + Cx = f(vals...) + if Cx != zero(eltype(C)) + if Ck > spaceC + spaceC = maxnnzC = Ck + _sumnnzs(As...) - (sum(ks) - N) + length(C.rowval) < maxnnzC && resize!(C.rowval, maxnnzC) + length(C.nzval) < maxnnzC && resize!(C.nzval, maxnnzC) + end + C.rowval[Ck] = activerow + C.nzval[Ck] = Cx + Ck += 1 + end + activerow = min(rows...) 
+ end + end + C.colptr[C.n + 1] = Ck + return C +end +function _map_notzeropres!{Tf,N}(f::Tf, fillvalue, C::SparseMatrixCSC, As::Vararg{SparseMatrixCSC,N}) + nnzC = C.m * C.n + # Expand C's storage if necessary + length(C.rowval) < nnzC && resize!(C.rowval, nnzC) + length(C.nzval) < nnzC && resize!(C.nzval, nnzC) + # Build C's structure + copy!(C.colptr, 1:C.m:(nnzC + 1)) + for k in 1:C.m:(nnzC - C.m + 1) + copy!(C.rowval, k, 1:C.m) end # Populate values - Bnzval = fill(fillvalue, nnzB) - @inbounds for (j, jo) in zip(1:A.n, 0:A.m:(nnzB - 1)), k in nzrange(A, j) - Bnzval[jo + A.rowval[k]] = f(A.nzval[k]) + fill!(C.nzval, fillvalue) + # NOTE: Combining this fill! into the loop below to avoid multiple sweeps over / + # nonsequential access of C.nzval does not appear to improve performance. + rowsentinel = C.m + 1 + stopks = _indforcol_all(1, As) + @inbounds for (j, jo) in zip(1:C.n, 0:C.m:(nnzC - 1)) + ks = stopks + stopks = _indforcol_all(j + 1, As) + rows = _rowforind_all(rowsentinel, ks, stopks, As) + activerow = min(rows...) + while activerow < rowsentinel + # activerows = _isactiverow_all(activerow, rows) + # Cx = f(_gatherargs(activerows, ks, As)...) + # ks = _updateind_all(activerows, ks) + # rows = _updaterow_all(rowsentinel, activerows, rows, ks, stopks, As) + vals, ks, rows = _fusedupdate_all(rowsentinel, activerow, rows, ks, stopks, As) + Cx = f(vals...) + Cx != fillvalue && (C.nzval[jo + activerow] = Cx) + activerow = min(rows...) + end + end + return C +end +# helper methods +@inline _indforcol(j, A) = A.colptr[j] +@inline _indforcol_all(j, ::Tuple{}) = () +@inline _indforcol_all(j, As) = ( + _indforcol(j, first(As)), + _indforcol_all(j, tail(As))...) +@inline _rowforind(rowsentinel, k, stopk, A) = + k < stopk ? A.rowval[k] : convert(eltype(A.rowval), rowsentinel) +@inline _rowforind_all(rowsentinel, ::Tuple{}, ::Tuple{}, ::Tuple{}) = () +@inline _rowforind_all(rowsentinel, ks, stopks, As) = ( + _rowforind(rowsentinel, first(ks), first(stopks), first(As)), + _rowforind_all(rowsentinel, tail(ks), tail(stopks), tail(As))...) +# fusing the following defs. avoids a few branches, yielding 5-30% runtime reduction +# @inline _isactiverow(activerow, row) = row == activerow +# @inline _isactiverow_all(activerow, ::Tuple{}) = () +# @inline _isactiverow_all(activerow, rows) = ( +# _isactiverow(activerow, first(rows)), +# _isactiverow_all(activerow, tail(rows))...) +# @inline _gatherarg(isactiverow, k, A) = isactiverow ? A.nzval[k] : zero(eltype(A)) +# @inline _gatherargs(::Tuple{}, ::Tuple{}, ::Tuple{}) = () +# @inline _gatherargs(activerows, ks, As) = ( +# _gatherarg(first(activerows), first(ks), first(As)), +# _gatherargs(tail(activerows), tail(ks), tail(As))...) +# @inline _updateind(isactiverow, k) = isactiverow ? (k + one(k)) : k +# @inline _updateind_all(::Tuple{}, ::Tuple{}) = () +# @inline _updateind_all(activerows, ks) = ( +# _updateind(first(activerows), first(ks)), +# _updateind_all(tail(activerows), tail(ks))...) +# @inline _updaterow(rowsentinel, isrowactive, presrow, k, stopk, A) = +# isrowactive ? (k < stopk ? A.rowval[k] : oftype(presrow, rowsentinel)) : presrow +# @inline _updaterow_all(rowsentinel, ::Tuple{}, ::Tuple{}, ::Tuple{}, ::Tuple{}, ::Tuple{}) = () +# @inline _updaterow_all(rowsentinel, activerows, rows, ks, stopks, As) = ( +# _updaterow(rowsentinel, first(activerows), first(rows), first(ks), first(stopks), first(As)), +# _updaterow_all(rowsentinel, tail(activerows), tail(rows), tail(ks), tail(stopks), tail(As))...) 
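+# The _fusedupdate/_fusedupdate_all methods below fuse the four commented-out helpers
+# above into a single recursive pass over the argument tuples: for each input matrix they
+# produce the value to pass to f (the stored entry if that matrix's current row is the
+# active row, zero otherwise) together with that matrix's updated index and row,
+# accumulating the results in tuples.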
+@inline function _fusedupdate(rowsentinel, activerow, row, k, stopk, A) + # returns (val, nextk, nextrow) + if row == activerow + nextk = k + one(k) + (A.nzval[k], nextk, (nextk < stopk ? A.rowval[nextk] : oftype(row, rowsentinel))) + else + (zero(eltype(A)), k, row) end - # NOTE: Combining the fill call into the loop above to avoid multiple sweeps over / - # nonsequential access of Bnzval does not appear to improve performance - return SparseMatrixCSC(A.m, A.n, Bcolptr, Browval, Bnzval) end +@inline _fusedupdate_all(rowsentinel, activerow, rows, ks, stopks, As) = + _fusedupdate_all((#=vals=#), (#=nextks=#), (#=nextrows=#), rowsentinel, activerow, rows, ks, stopks, As) +@inline _fusedupdate_all(vals, nextks, nextrows, rowsent, activerow, ::Tuple{}, ::Tuple{}, ::Tuple{}, ::Tuple{}) = + (vals, nextks, nextrows) +@inline function _fusedupdate_all(vals, nextks, nextrows, rowsentinel, activerow, rows, ks, stopks, As) + val, nextk, nextrow = _fusedupdate(rowsentinel, activerow, first(rows), first(ks), first(stopks), first(As)) + return _fusedupdate_all((vals..., val), (nextks..., nextk), (nextrows..., nextrow), + rowsentinel, activerow, tail(rows), tail(ks), tail(stopks), tail(As)) +end + + +## Broadcast operations involving a single sparse matrix and possibly broadcast scalars + +broadcast{Tf}(f::Tf, A::SparseMatrixCSC) = map(f, A) +broadcast!{Tf}(f::Tf, C::SparseMatrixCSC, A::SparseMatrixCSC) = map!(f, C, A) # Cover common broadcast operations involving a single sparse matrix and one or more # broadcast scalars. diff --git a/test/sparse/sparse.jl b/test/sparse/sparse.jl index 61596370f8bcfc..4c92969d6b550d 100644 --- a/test/sparse/sparse.jl +++ b/test/sparse/sparse.jl @@ -1659,3 +1659,67 @@ let X = sparse([1 -1; -1 1]) @test Y / 1 == Y end end + +# test map/map! over sparse matrices +let + N, M = 10, 12 + # test map/map! implementation specialized for a single (input) sparse matrix + # (also tested through broadcast/broadcast! over a single (input) sparse matrix) + # --> test map entry point + A = sprand(N, M, 0.4) + fA = Array(A) + @test map(sin, A) == sparse(map(sin, fA)) + @test map(cos, A) == sparse(map(cos, fA)) + # --> test map! entry point + fX = copy(fA); X = sparse(fX) + map!(sin, X, A); X = sparse(fX) # warmup for @allocated + @test (@allocated map!(sin, X, A)) == 0 + @test map!(sin, X, A) == sparse(map!(sin, fX, fA)) + @test map!(cos, X, A) == sparse(map!(cos, fX, fA)) + @test_throws DimensionMismatch map!(sin, X, spzeros(N, M - 1)) + # test map/map! implementation specialized for a pair of (input) sparse matrices + f(x, y) = x + y + 1 + A = sprand(N, M, 0.3) + B = convert(SparseMatrixCSC{Float32,Int32}, sprand(N, M, 0.3)) + # use different types to check internal type stability via allocation tests below + fA, fB = map(Array, (A, B)) + # --> test map entry point + @test map(+, A, B) == sparse(map(+, fA, fB)) + @test map(*, A, B) == sparse(map(*, fA, fB)) + @test map(f, A, B) == sparse(map(f, fA, fB)) + @test_throws DimensionMismatch map(+, A, spzeros(N, M - 1)) + # --> test map! 
entry point + fX = fA .+ fB; X = sparse(fX) + map!(+, X, A, B); X = sparse(fX) # warmup for @allocated + @test (@allocated map!(+, X, A, B)) == 0 + @test map!(+, X, A, B) == sparse(map!(+, fX, fA, fB)) + fX = fA .* fB; X = sparse(fX) + map!(*, X, A, B); X = sparse(fX) # warmup for @allocated + @test (@allocated map!(*, X, A, B)) == 0 + @test map!(*, X, A, B) == sparse(map!(*, fX, fA, fB)) + @test map!(f, X, A, B) == sparse(map!(f, fX, fA, fB)) + @test_throws DimensionMismatch map!(f, X, A, spzeros(N, M - 1)) + # test map/map! implementation for an arbitrary number of (input) sparse matrices + f(x, y, z) = x + y + z + 1 + A = sprand(N, M, 0.2) + B = sprand(N, M, 0.2) + C = convert(SparseMatrixCSC{Float32,Int32}, sprand(N, M, 0.2)) + # use different types to check internal type stability via allocation tests below + fA, fB, fC = map(Array, (A, B, C)) + # --> test map entry point + @test map(+, A, B, C) == sparse(map(+, fA, fB, fC)) + @test map(*, A, B, C) == sparse(map(*, fA, fB, fC)) + @test map(f, A, B, C) == sparse(map(f, fA, fB, fC)) + @test_throws DimensionMismatch map(+, A, B, spzeros(N, M - 1)) + # --> test map! entry point + fX = fA .+ fB .+ fC; X = sparse(fX) + map!(+, X, A, B, C); X = sparse(fX) # warmup for @allocated + @test (@allocated map!(+, X, A, B, C)) == 0 + @test map!(+, X, A, B, C) == sparse(map!(+, fX, fA, fB, fC)) + fX = fA .* fB .* fC; X = sparse(fX) + map!(*, X, A, B, C); X = sparse(fX) # warmup for @allocated + @test (@allocated map!(*, X, A, B, C)) == 0 + @test map!(*, X, A, B, C) == sparse(map!(*, fX, fA, fB, fC)) + @test map!(f, X, A, B, C) == sparse(map!(f, fX, fA, fB, fC)) + @test_throws DimensionMismatch map!(f, X, A, B, spzeros(N, M - 1)) +end
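
A brief usage sketch of the semantics this patch introduces (illustrative values only, not part of the patch): when `f` maps zero to zero, `map` preserves the inputs' sparsity and allocates at most the sum of their stored entries; otherwise the result is stored densely, with `f` of all-zero arguments filling the unstored positions. `map!` writes into `C`, expanding `C`'s `rowval`/`nzval` buffers as needed, and throws a `DimensionMismatch` when shapes differ. `broadcast`/`broadcast!` over a single sparse matrix now simply forward to `map`/`map!`.

    julia> A = sparse([1, 2], [1, 3], [1.0, 2.0], 2, 3);  # 2x3 with two stored entries

    julia> nnz(map(x -> 2x, A))       # f(0) == 0, so sparsity is preserved
    2

    julia> nnz(map(x -> x + 1, A))    # f(0) != 0, so the result is stored densely
    6

    julia> B = sparse([2], [3], [3.0], 2, 3);

    julia> map(+, A, B) == sparse(map(+, Array(A), Array(B)))
    true

    julia> C = similar(A); map!(+, C, A, B) == map(+, A, B)   # in place, reusing C's storage
    true

    julia> broadcast(sin, A) == map(sin, A)   # single-matrix broadcast now routes through map
    true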