From 37f34d49574cc107812f06678a8bbe04d84f33b1 Mon Sep 17 00:00:00 2001 From: Sacha Verweij Date: Sun, 8 Oct 2017 12:24:10 -0700 Subject: [PATCH] Improved sparse-dense multiplication kernels. --- base/sparse/linalg.jl | 132 +++++++++++++++++++++++++++++++++++++++--- test/sparse/sparse.jl | 55 +++++++++++++++++- 2 files changed, 178 insertions(+), 9 deletions(-) diff --git a/base/sparse/linalg.jl b/base/sparse/linalg.jl index 7ee6eebca020f..9dd1d3ea12465 100644 --- a/base/sparse/linalg.jl +++ b/base/sparse/linalg.jl @@ -42,9 +42,7 @@ function sppromote(A::SparseMatrixCSC{TvA,TiA}, B::SparseMatrixCSC{TvB,TiB}) whe A, B end - -### sparse-dense matrix multiplication: A[c|t]_mul_B[c|t][!]([dense,] sparse, dense) - +### gemm-like sparse-dense matrix multipliation: A[c|t]_mul_B[!](α, sparseA, denseB, β, denseC) # In matrix-vector multiplication, the correct orientation of the vector is assumed. for (f, op, transp) in ((:A_mul_B, :identity, false), (:Ac_mul_B, :adjoint, true), @@ -94,11 +92,129 @@ for (f, op, transp) in ((:A_mul_B, :identity, false), end end -# For compatibility with dense multiplication API. Should be deleted when dense multiplication -# API is updated to follow BLAS API. -A_mul_B!(C::StridedVecOrMat, A::SparseMatrixCSC, B::StridedVecOrMat) = A_mul_B!(one(eltype(B)), A, B, zero(eltype(C)), C) -Ac_mul_B!(C::StridedVecOrMat, A::SparseMatrixCSC, B::StridedVecOrMat) = Ac_mul_B!(one(eltype(B)), A, B, zero(eltype(C)), C) -At_mul_B!(C::StridedVecOrMat, A::SparseMatrixCSC, B::StridedVecOrMat) = At_mul_B!(one(eltype(B)), A, B, zero(eltype(C)), C) + +### sparse-dense matrix multiplication: A[c|t]_mul_B[c|t][!]([dense,] sparse, dense) + +# */A_mul_B[!]([dense,] sparse, dense) +function *(A::SparseMatrixCSC, B::StridedMatrix) + @boundscheck size(A, 2) == size(B, 1) || throw(DimensionMismatch()) + C = zeros(promote_type(eltype(A), eltype(B)), size(A, 1), size(B, 2)) + return unchecked_A_mul_B!(C, A, B) +end +function A_mul_B!(C::StridedMatrix, A::SparseMatrixCSC, B::StridedMatrix) + @boundscheck size(A, 2) == size(B, 1) || throw(DimensionMismatch()) + @boundscheck size(C, 1) == size(A, 1) || throw(DimensionMismatch()) + @boundscheck size(C, 2) == size(B, 2) || throw(DimensionMismatch()) + return unchecked_A_mul_B!(C, A, B) +end +function unchecked_A_mul_B!(C::StridedMatrix, A::SparseMatrixCSC, B::StridedMatrix) + nB = size(B, 2) + mB = size(B, 1) + fill!(C, zero(eltype(C))) + @inbounds for jB in 1:nB + for iB in 1:mB # iB == jA + xB = B[iB, jB] + @simd for kA in A.colptr[iB]:(A.colptr[iB+1] - 1) + iA = A.rowval[kA] + xA = A.nzval[kA] + C[iA, jB] = muladd(xA, xB, C[iA, jB]) + end + end + end + return C +end + +# A_mul_B(c|t)[!]([dense,] sparse, dense) +@propagate_inbounds A_mul_Bt(A::SparseMatrixCSC, B::StridedMatrix) = _A_mul_Bq(A, B, identity) +@propagate_inbounds A_mul_Bc(A::SparseMatrixCSC, B::StridedMatrix) = _A_mul_Bq(A, B, conj) +function _A_mul_Bq(A::SparseMatrixCSC, B::StridedMatrix, op::TF) where TF + @boundscheck size(A, 2) == size(B, 2) || throw(DimensionMismatch()) + C = zeros(promote_type(eltype(A), eltype(B)), size(A, 1), size(B, 1)) + return unchecked_A_mul_Bq!(C, A, B, op) +end +@propagate_inbounds A_mul_Bt!(C::StridedMatrix, A::SparseMatrixCSC, B::StridedMatrix) = _A_mul_Bq!(C, A, B, identity) +@propagate_inbounds A_mul_Bc!(C::StridedMatrix, A::SparseMatrixCSC, B::StridedMatrix) = _A_mul_Bq!(C, A, B, conj) +function _A_mul_Bq!(C::StridedMatrix, A::SparseMatrixCSC, B::StridedMatrix, op::TF) where TF + @boundscheck size(A, 2) == size(B, 2) || throw(DimensionMismatch()) + @boundscheck size(C, 1) == size(A, 1) || throw(DimensionMismatch()) + @boundscheck size(C, 2) == size(B, 1) || throw(DimensionMismatch()) + return unchecked_A_mul_Bq!(C, A, B, op) +end +function unchecked_A_mul_Bq!(C::StridedMatrix, A::SparseMatrixCSC, B::StridedMatrix, op::TF) where TF + # Without some additional storage into which to reorder data prior to performing multiplication, + # memory access patterns for this operation aren't particularly happy. For now, perform the + # operation in two steps --- an explicit adjoint/transpose followed by an efficient multiplication. + # For the future, test the various possible access patterns to determine whether any best this + # approach all around, and if so replace this implementation. + return twostep_A_mul_Bq!(C, A, B, op) +end +twostep_A_mul_Bq!(C::StridedMatrix, A::SparseMatrixCSC, B::StridedMatrix, ::typeof(identity)) = + unchecked_A_mul_B!(C, A, transpose(B)) +twostep_A_mul_Bq!(C::StridedMatrix, A::SparseMatrixCSC, B::StridedMatrix, ::typeof(conj)) = + unchecked_A_mul_B!(C, A, adjoint(B)) + +# A(c|t)_mul_B[!]([dense,] sparse, dense) +@propagate_inbounds At_mul_B(A::SparseMatrixCSC, B::StridedMatrix) = _Aq_mul_B(A, B, identity) +@propagate_inbounds Ac_mul_B(A::SparseMatrixCSC, B::StridedMatrix) = _Aq_mul_B(A, B, conj) +function _Aq_mul_B(A::SparseMatrixCSC, B::StridedMatrix, op::TF) where TF + @boundscheck size(A, 1) == size(B, 1) || throw(DimensionMismatch()) + C = zeros(promote_type(eltype(A), eltype(B)), size(A, 2), size(B, 2)) + return unchecked_Aq_mul_B!(C, A, B, op) +end +@propagate_inbounds At_mul_B!(C::StridedMatrix, A::SparseMatrixCSC, B::StridedMatrix) = _Aq_mul_B!(C, A, B, identity) +@propagate_inbounds Ac_mul_B!(C::StridedMatrix, A::SparseMatrixCSC, B::StridedMatrix) = _Aq_mul_B!(C, A, B, conj) +function _Aq_mul_B!(C::StridedMatrix, A::SparseMatrixCSC, B::StridedMatrix, op::TF) where TF + @boundscheck size(A, 1) == size(B, 1) || throw(DimensionMismatch()) + @boundscheck size(C, 1) == size(A, 2) || throw(DimensionMismatch()) + @boundscheck size(C, 2) == size(B, 2) || throw(DimensionMismatch()) + return unchecked_Aq_mul_B!(C, A, B, op) +end +function unchecked_Aq_mul_B!(C::StridedMatrix, A::SparseMatrixCSC, B::StridedMatrix, op::TF) where TF + nB = size(B, 2) + nA = size(A, 2) + fill!(C, zero(eltype(C))) + @inbounds for jB in 1:nB + for jA in 1:nA + CjAjB = zero(eltype(C)) + for kA in A.colptr[jA]:(A.colptr[jA+1] - 1) + iA = A.rowval[kA] + qxA = op(A.nzval[kA]) + CjAjB += qxA * B[iA, jB] + end + C[jA, jB] = CjAjB + end + end + return C +end + +# A(t|c)_mul_B(t|c)[!]([dense,] sparse, dense) +@propagate_inbounds At_mul_Bt(A::SparseMatrixCSC, B::StridedMatrix) = _Aq_mul_Bq(A, B, identity) +@propagate_inbounds Ac_mul_Bc(A::SparseMatrixCSC, B::StridedMatrix) = _Aq_mul_Bq(A, B, conj) +function _Aq_mul_Bq(A::SparseMatrixCSC, B::StridedMatrix, op::TF) where TF + @boundscheck size(A, 1) == size(B, 2) || throw(DimensionMismatch()) + C = zeros(promote_type(eltype(A), eltype(B)), size(A, 2), size(B, 1)) + return unchecked_Aq_mul_Bq!(C, A, B, op) +end +@propagate_inbounds At_mul_Bt!(C::StridedMatrix, A::SparseMatrixCSC, B::StridedMatrix) = _Aq_mul_Bq!(C, A, B, identity) +@propagate_inbounds Ac_mul_Bc!(C::StridedMatrix, A::SparseMatrixCSC, B::StridedMatrix) = _Aq_mul_Bq!(C, A, B, conj) +function _Aq_mul_Bq!(C::StridedMatrix, A::SparseMatrixCSC, B::StridedMatrix, op::TF) where TF + @boundscheck size(A, 1) == size(B, 2) || throw(DimensionMismatch()) + @boundscheck size(C, 1) == size(A, 2) || throw(DimensionMismatch()) + @boundscheck size(C, 2) == size(B, 1) || throw(DimensionMismatch()) + return unchecked_Aq_mul_Bq!(C, A, B, op) +end +function unchecked_Aq_mul_Bq!(C::StridedMatrix, A::SparseMatrixCSC, B::StridedMatrix, op::TF) where TF + # Without some additional storage into which to reorder data prior to performing multiplication, + # memory access patterns for this operation aren't particularly happy. For now, perform the + # operation in two steps --- a relatively efficient multiplication in reverse order + # followed by a transposition. For the future, test the various possible access patterns + # to determine whether any best this approach all around, and if so replace this implementation. + return twostep_Aq_mul_Bq!(C, A, B, op) +end +twostep_Aq_mul_Bq!(C::StridedMatrix, A::SparseMatrixCSC, B::StridedMatrix, ::typeof(identity)) = + transpose!(C, *(B, A)) +twostep_Aq_mul_Bq!(C::StridedMatrix, A::SparseMatrixCSC, B::StridedMatrix, ::typeof(conj)) = + adjoint!(C, *(B, A)) ### dense-sparse matrix multiplication: A[c|t]_mul_B[c|t][!]([dense,] dense, sparse) diff --git a/test/sparse/sparse.jl b/test/sparse/sparse.jl index 81d2149e99b3f..b4522888002d4 100644 --- a/test/sparse/sparse.jl +++ b/test/sparse/sparse.jl @@ -2093,4 +2093,57 @@ end @test_throws DimensionMismatch Ac_mul_B!(Cnn, Axn, Snn) @test_throws DimensionMismatch At_mul_Bt!(Cnn, Ann, Snx) @test_throws DimensionMismatch Ac_mul_Bc!(Cnn, Ann, Snx) -end \ No newline at end of file +end + +@testset "sparse-dense matrix multiplication" begin + using Base.LinAlg: *, A_mul_B!, + A_mul_Bt, A_mul_Bt!, A_mul_Bc, A_mul_Bc!, + At_mul_B, At_mul_B!, Ac_mul_B, Ac_mul_B!, + At_mul_Bt, At_mul_Bt!, Ac_mul_Bc, Ac_mul_Bc! + # out-of-place sparse-dense ops, i.e. A[t|c]_mul_B[t|c](sparse, dense) + # + # exercise kernels, which are shared with corresponding in-place ops + for (m, k, n) in ((5, 5, 5), (5, 10, 15), (15, 10, 5)) + sparsemat = sprand(Complex{Float64}, m, k, 0.4) + densemat = rand(Complex{Float64}, k, n) + tdensemat = transpose(densemat) + tsparsemat = transpose(sparsemat) + @test *(sparsemat, densemat) ≈ *(Matrix(sparsemat), densemat) + @test A_mul_Bt(sparsemat, tdensemat) ≈ A_mul_Bt(Matrix(sparsemat), tdensemat) + @test A_mul_Bc(sparsemat, tdensemat) ≈ A_mul_Bc(Matrix(sparsemat), tdensemat) + @test At_mul_B(tsparsemat, densemat) ≈ At_mul_B(Matrix(tsparsemat), densemat) + @test Ac_mul_B(tsparsemat, densemat) ≈ Ac_mul_B(Matrix(tsparsemat), densemat) + @test At_mul_Bt(tsparsemat, tdensemat) ≈ At_mul_Bt(Matrix(tsparsemat), tdensemat) + @test Ac_mul_Bc(tsparsemat, tdensemat) ≈ Ac_mul_Bc(Matrix(tsparsemat), tdensemat) + end + # exercise inner-dimensions-match checks + n, x = 3, 4 + Cnn, Cxn, Cnx = zeros(n, n), zeros(x, n), zeros(n, x) + Ann, Axn, Anx = zeros(n, n), zeros(x, n), spzeros(n, x) + Snn, Sxn, Snx = spzeros(n, n), spzeros(x, n), spzeros(n, x) + @test_throws DimensionMismatch (*)(Snn, Axn) + @test_throws DimensionMismatch A_mul_Bt(Snn, Anx) + @test_throws DimensionMismatch A_mul_Bc(Snn, Anx) + @test_throws DimensionMismatch At_mul_B(Sxn, Ann) + @test_throws DimensionMismatch Ac_mul_B(Sxn, Ann) + @test_throws DimensionMismatch At_mul_Bt(Snn, Anx) + @test_throws DimensionMismatch Ac_mul_Bc(Snn, Anx) + + # in-place sparse-dense ops, i.e. A[t|c]_mul_B[t|c]!(dense, sparse, dense) + # the kernels were exercised through the out-of-place calls above, + # so below exercise only the entry points (shape checks) + # + # exercise matmul outer-dimensions-match checks + for op! in (A_mul_B!, A_mul_Bt!, A_mul_Bc!, At_mul_B!, Ac_mul_B!, At_mul_Bt!, Ac_mul_Bc!) + @test_throws DimensionMismatch op!(Cxn, Snn, Ann) + @test_throws DimensionMismatch op!(Cnx, Snn, Ann) + end + # exercise matul inner-dimensions-match checks + @test_throws DimensionMismatch A_mul_B!(Cnn, Snn, Axn) + @test_throws DimensionMismatch A_mul_Bt!(Cnn, Snn, Anx) + @test_throws DimensionMismatch A_mul_Bc!(Cnn, Snn, Anx) + @test_throws DimensionMismatch At_mul_B!(Cnn, Sxn, Ann) + @test_throws DimensionMismatch Ac_mul_B!(Cnn, Sxn, Ann) + @test_throws DimensionMismatch At_mul_Bt!(Cnn, Snn, Anx) + @test_throws DimensionMismatch Ac_mul_Bc!(Cnn, Snn, Anx) +end