fix: disable threading for certain devices

LuxDL · Oct 25, 2024 · 583a6b4 · 583a6b4
1 parent a8c0f3b
commit 583a6b4
Showing 1 changed file with 38 additions and 5 deletions.
diff --git a/src/impl/batched_mul.jl b/src/impl/batched_mul.jl
@@ -68,17 +68,17 @@ end
 function batched_matmul_loopvec_impl! end
 
 function fallback_batched_matmul(
-        dev, x::AbstractArray{xT, 3}, y::AbstractArray{yT, 3}) where {xT, yT}
+        opmode, x::AbstractArray{xT, 3}, y::AbstractArray{yT, 3}) where {xT, yT}
     z = similar(x, promote_type(eltype(x), eltype(y)), size(x, 1),
         size(y, 2), max(size(x, 3), size(y, 3)))
-    fallback_batched_matmul!(z, dev, x, y)
+    fallback_batched_matmul!(z, opmode, x, y)
     return z
 end
 
 function fallback_batched_matmul!(
-        z::AbstractArray{zT, 3}, dev, x::AbstractArray{xT, 3},
+        z::AbstractArray{zT, 3}, opmode, x::AbstractArray{xT, 3},
         y::AbstractArray{yT, 3}) where {zT, xT, yT}
-    @warn "Using fallback Batched Matrix Multiply routine for $(dev) with A: size = \
+    @warn "Using fallback Batched Matrix Multiply routine for $(opmode) with A: size = \
            $(size(x)) eltype = $(xT) and B: size = $(size(y)) eltype = $(yT). This may be \
            slow." maxlog=1
 
@@ -87,6 +87,36 @@ function fallback_batched_matmul!(
         throw(DimensionMismatch(lazy"size(x) = $(size(x)), size(y) = $(size(y)) inconsistent for batched_matmul."))
     end
 
+    if use_threaded_batched_matmul(get_device_type(x))
+        unsafe_fallback_threaded_batched_matmul!(z, x, y)
+    else
+        unsafe_fallback_serial_batched_matmul!(z, x, y)
+    end
+
+    return
+end
+
+function unsafe_fallback_serial_batched_matmul!(
+        z::AbstractArray{zT, 3}, x::AbstractArray{xT, 3},
+        y::AbstractArray{yT, 3}) where {zT, xT, yT}
+    if size(x, 3) == size(y, 3)
+        for L in axes(z, 3)
+            mul!(batchview(z, L), batchview(x, L), batchview(y, L))
+        end
+    elseif size(x, 3) == 1
+        for L in axes(z, 3)
+            mul!(batchview(z, L), batchview(x, 1), batchview(y, L))
+        end
+    else # has to be size(y, 3) == 1
+        for L in axes(z, 3)
+            mul!(batchview(z, L), batchview(x, L), batchview(y, 1))
+        end
+    end
+end
+
+function unsafe_fallback_threaded_batched_matmul!(
+        z::AbstractArray{zT, 3}, x::AbstractArray{xT, 3},
+        y::AbstractArray{yT, 3}) where {zT, xT, yT}
     old_threads = maybe_reduce_BLAS_threads(z)
 
     if size(x, 3) == size(y, 3)
@@ -104,10 +134,13 @@ function fallback_batched_matmul!(
     end
 
     reset_BLAS_threads(old_threads)
-
     return
 end
 
+use_threaded_batched_matmul(::Type) = false
+use_threaded_batched_matmul(::Type{CUDADevice}) = true
+use_threaded_batched_matmul(::Type{CPUDevice}) = true
+
 function CRC.rrule(::typeof(batched_matmul), x::AbstractArray{xT, 3},
         y::AbstractArray{yT, 3}) where {xT, yT}
     ∇batched_matmul = @closure Δ_ -> begin