JuliaLang · tkelman · Mar 9, 2017 · Mar 7, 2017 · Mar 7, 2017
diff --git a/base/sparse/higherorderfns.jl b/base/sparse/higherorderfns.jl
@@ -515,24 +515,23 @@ function _broadcast_zeropres!{Tf}(f::Tf, C::SparseVecOrMat, A::SparseVecOrMat, B
     spaceC::Int = min(length(storedinds(C)), length(storedvals(C)))
     rowsentinelA = convert(indtype(A), numrows(C) + 1)
     rowsentinelB = convert(indtype(B), numrows(C) + 1)
-    # A and B cannot have the same shape, as we directed that case to map in broadcast's
-    # entry point; here we need efficiently handle only heterogeneous combinations of matrices
-    # with no singleton dimensions ("matrices" hereafter), one singleton dimension ("columns"
-    # and "rows"), and two singleton dimensions ("scalars"). Cases involving scalars should
-    # be rare and optimizing that case complicates the code appreciably, so we largely
-    # ignore that case's performance below.
+    # C, A, and B cannot all have the same shape, as we directed that case to map in broadcast's
+    # entry point; here we need efficiently handle only heterogeneous combinations of mats/vecs
+    # with no singleton dimensions, one singleton dimension, and two singleton dimensions.
+    # Cases involving objects with two singleton dimensions should be rare and optimizing
+    # that case complicates the code appreciably, so we largely ignore that case's
+    # performance below.
     #
-    # We first divide the cases into two groups: those in which neither argument expands
-    # vertically (matrix-column combinations) and those in which an argument expands
-    # vertically (matrix-row and column-row combinations).
+    # We first divide the cases into two groups: those in which neither input argument
+    # expands vertically, and those in which at least one argument expands vertically.
     #
     # NOTE: Placing the loops over columns outside the conditional chain segregating
     # argument shape combinations eliminates some code replication but unfortunately
     # hurts performance appreciably in some cases.
     #
     # Cases without vertical expansion
     Ck = 1
-    if numrows(A) == numrows(B)
+    if numrows(A) == numrows(B) == numrows(C)
         @inbounds for j in columns(C)
             setcolptr!(C, j, Ck)
             Ak, stopAk = numcols(A) == 1 ? (colstartind(A, 1), colboundind(A, 1)) : (colstartind(A, j), colboundind(A, j))
@@ -575,7 +574,24 @@ function _broadcast_zeropres!{Tf}(f::Tf, C::SparseVecOrMat, A::SparseVecOrMat, B
             end
         end
     # Cases with vertical expansion
-    elseif numrows(A) == 1 # && numrows(B) != 1, vertically expand first argument
+    elseif numrows(A) == numrows(B) == 1 # && numrows(C) != 1, vertically expand both A and B
+        @inbounds for j in columns(C)
+            setcolptr!(C, j, Ck)
+            Ak, stopAk = numcols(A) == 1 ? (colstartind(A, 1), colboundind(A, 1)) : (colstartind(A, j), colboundind(A, j))
+            Bk, stopBk = numcols(B) == 1 ? (colstartind(B, 1), colboundind(B, 1)) : (colstartind(B, j), colboundind(B, j))
+            Ax = Ak < stopAk ? storedvals(A)[Ak] : zero(eltype(A))
+            Bx = Bk < stopBk ? storedvals(B)[Bk] : zero(eltype(B))
+            Cx = f(Ax, Bx)
+            if !_iszero(Cx)
+                for Ci::indtype(C) in 1:numrows(C)
+                    Ck > spaceC && (spaceC = expandstorage!(C, _unchecked_maxnnzbcres(size(C), A, B)))
+                    storedinds(C)[Ck] = Ci
+                    storedvals(C)[Ck] = Cx
+                    Ck += 1
+                end
+            end
+        end
+    elseif numrows(A) == 1 # && numrows(B) == numrows(C) != 1 , vertically expand only A
         @inbounds for j in columns(C)
             setcolptr!(C, j, Ck)
             Ak, stopAk = numcols(A) == 1 ? (colstartind(A, 1), colboundind(A, 1)) : (colstartind(A, j), colboundind(A, j))
@@ -616,7 +632,7 @@ function _broadcast_zeropres!{Tf}(f::Tf, C::SparseVecOrMat, A::SparseVecOrMat, B
                 end
             end
         end
-    elseif numrows(B) == 1 # && numrows(A) != 1, vertically expand second argument
+    else # numrows(B) == 1 && numrows(A) == numrows(C) != 1, vertically expand only B
         @inbounds for j in columns(C)
             setcolptr!(C, j, Ck)
             Ak, stopAk = numcols(A) == 1 ? (colstartind(A, 1), colboundind(A, 1)) : (colstartind(A, j), colboundind(A, j))
@@ -671,7 +687,7 @@ function _broadcast_notzeropres!{Tf}(f::Tf, fillvalue, C::SparseVecOrMat, A::Spa
     rowsentinelA = convert(indtype(A), numrows(C) + 1)
     rowsentinelB = convert(indtype(B), numrows(C) + 1)
     # Cases without vertical expansion
-    if numrows(A) == numrows(B)
+    if numrows(A) == numrows(B) == numrows(C)
         @inbounds for (j, jo) in zip(columns(C), _densecoloffsets(C))
             Ak, stopAk = numcols(A) == 1 ? (colstartind(A, 1), colboundind(A, 1)) : (colstartind(A, j), colboundind(A, j))
             Bk, stopBk = numcols(B) == 1 ? (colstartind(B, 1), colboundind(B, 1)) : (colstartind(B, j), colboundind(B, j))
@@ -695,7 +711,20 @@ function _broadcast_notzeropres!{Tf}(f::Tf, fillvalue, C::SparseVecOrMat, A::Spa
             end
         end
     # Cases with vertical expansion
-    elseif numrows(A) == 1 # && numrows(B) != 1, vertically expand first argument
+    elseif numrows(A) == numrows(B) == 1 # && numrows(C) != 1, vertically expand both A and B
+        @inbounds for (j, jo) in zip(columns(C), _densecoloffsets(C))
+            Ak, stopAk = numcols(A) == 1 ? (colstartind(A, 1), colboundind(A, 1)) : (colstartind(A, j), colboundind(A, j))
+            Bk, stopBk = numcols(B) == 1 ? (colstartind(B, 1), colboundind(B, 1)) : (colstartind(B, j), colboundind(B, j))
+            Ax = Ak < stopAk ? storedvals(A)[Ak] : zero(eltype(A))
+            Bx = Bk < stopBk ? storedvals(B)[Bk] : zero(eltype(B))
+            Cx = f(Ax, Bx)
+            if Cx != fillvalue
+                for Ck::Int in (jo + 1):(jo + numrows(C))
+                    storedvals(C)[Ck] = Cx
+                end
+            end
+        end
+    elseif numrows(A) == 1 # && numrows(B) == numrows(C) != 1, vertically expand only A
         @inbounds for (j, jo) in zip(columns(C), _densecoloffsets(C))
             Ak, stopAk = numcols(A) == 1 ? (colstartind(A, 1), colboundind(A, 1)) : (colstartind(A, j), colboundind(A, j))
             Bk, stopBk = numcols(B) == 1 ? (colstartind(B, 1), colboundind(B, 1)) : (colstartind(B, j), colboundind(B, j))
@@ -720,7 +749,7 @@ function _broadcast_notzeropres!{Tf}(f::Tf, fillvalue, C::SparseVecOrMat, A::Spa
                 end
             end
         end
-    elseif numrows(B) == 1 # && numrows(A) != 1, vertically expand second argument
+    else # numrows(B) == 1 && numrows(A) == numrows(C) != 1, vertically expand only B
         @inbounds for (j, jo) in zip(columns(C), _densecoloffsets(C))
             Ak, stopAk = numcols(A) == 1 ? (colstartind(A, 1), colboundind(A, 1)) : (colstartind(A, j), colboundind(A, j))
             Bk, stopBk = numcols(B) == 1 ? (colstartind(B, 1), colboundind(B, 1)) : (colstartind(B, j), colboundind(B, j))

diff --git a/test/sparse/higherorderfns.jl b/test/sparse/higherorderfns.jl
@@ -164,6 +164,7 @@ end
     mats = (sprand(N, M, p), sprand(N, 1, p), sprand(1, M, p), sprand(1, 1, 1.0), spzeros(1, 1))
     vecs = (sprand(N, p), sprand(1, 1.0), spzeros(1))
     tens = (mats..., vecs...)
+    fZ = Array(first(mats))
     for Xo in tens
         X = ndims(Xo) == 1 ? SparseVector{Float32,Int32}(Xo) : SparseMatrixCSC{Float32,Int32}(Xo)
         # use different types to check internal type stability via allocation tests below
@@ -182,17 +183,17 @@ end
                 @test_throws DimensionMismatch broadcast(+, spzeros((shapeX .- 1)...), Y)
             end
             # --> test broadcast! entry point / +-like zero-preserving op
-            fZ = broadcast(+, fX, fY); Z = sparse(fZ)
+            broadcast!(+, fZ, fX, fY); Z = sparse(fZ)
             broadcast!(+, Z, X, Y); Z = sparse(fZ) # warmup for @allocated
             @test (@allocated broadcast!(+, Z, X, Y)) == 0
             @test broadcast!(+, Z, X, Y) == sparse(broadcast!(+, fZ, fX, fY))
             # --> test broadcast! entry point / *-like zero-preserving op
-            fZ = broadcast(*, fX, fY); Z = sparse(fZ)
+            broadcast!(*, fZ, fX, fY); Z = sparse(fZ)
             broadcast!(*, Z, X, Y); Z = sparse(fZ) # warmup for @allocated
             @test (@allocated broadcast!(*, Z, X, Y)) == 0
             @test broadcast!(*, Z, X, Y) == sparse(broadcast!(*, fZ, fX, fY))
             # --> test broadcast! entry point / not zero-preserving op
-            fZ = broadcast(f, fX, fY); Z = sparse(fZ)
+            broadcast!(f, fZ, fX, fY); Z = sparse(fZ)
             broadcast!(f, Z, X, Y); Z = sparse(fZ) # warmup for @allocated
             @test (@allocated broadcast!(f, Z, X, Y)) == 0
             @test broadcast!(f, Z, X, Y) == sparse(broadcast!(f, fZ, fX, fY))