[draft] Fp8 fast accumulation #6388

Closed
wants to merge 3 commits
8 changes: 6 additions & 2 deletions xla/service/gpu/matmul_utils.cc
@@ -901,12 +901,16 @@ StatusOr<se::gpu::BlasLt::Epilogue> AsBlasLtEpilogue(
se::blas::ComputationType computation_type,
GetBlasComputationType(lhs_layout.dtype, output_layout.dtype,
config.compute_precision));

// For FP8 matmuls, fast accumulation is only turned on when both operands'
// precisions are DEFAULT.
bool fast_accum = (primitive_util::IsF8Type(lhs_layout.dtype) ||
primitive_util::IsF8Type(rhs_layout.dtype)) &&
config.compute_precision == 0;
TF_ASSIGN_OR_RETURN(
se::gpu::BlasLt::MatmulDesc op_desc,
se::gpu::BlasLt::MatmulDesc::Create(
computation_type, GetScaleType(output_dtype, computation_type),
trans_a, trans_b, epilogue));
trans_a, trans_b, epilogue, fast_accum));

TF_ASSIGN_OR_RETURN(se::gpu::BlasLt::MatrixLayout a_desc,
AsBlasLtMatrixLayout(lhs_layout));
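Read on its own, the new gating logic amounts to the sketch below; ShouldUseFp8FastAccum is a hypothetical helper (not added by this PR) whose parameter types mirror the MatrixLayout and GemmConfig values already in scope in matmul_utils.cc.

// Hypothetical helper restating the condition above: fast accumulation is
// requested only when at least one operand is an FP8 type and the config
// asks for DEFAULT precision (compute_precision == 0).
bool ShouldUseFp8FastAccum(const MatrixLayout& lhs_layout,
                           const MatrixLayout& rhs_layout,
                           const GemmConfig& config) {
  bool has_f8_operand = primitive_util::IsF8Type(lhs_layout.dtype) ||
                        primitive_util::IsF8Type(rhs_layout.dtype);
  return has_f8_operand && config.compute_precision == 0;
}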
2 changes: 2 additions & 0 deletions xla/stream_executor/cuda/cuda_blas_lt.cc
@@ -183,6 +183,8 @@ cudaDataType_t BlasLt::MatrixLayout::type() const {
AsCublasOperation(trans_b)));
TF_ASSIGN_OR_RETURN(cublasLtEpilogue_t epi, AsCublasLtEpilogue(epilogue));
TF_RETURN_IF_ERROR(SetAttr(cu_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, epi));
TF_RETURN_IF_ERROR(
SetAttr(cu_desc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, int8_t(fast_accum)));
return std::move(desc);
}

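For reference, the same attribute can be set against the raw cuBLASLt API (this PR goes through XLA's SetAttr wrapper instead); the helper name and the CUBLAS_COMPUTE_32F / CUDA_R_32F choices below are illustrative assumptions, not taken from the PR.

#include <cstdint>
#include <cublasLt.h>

// Hypothetical standalone sketch: create a matmul descriptor and enable the
// FP8 fast-accumulation mode via CUBLASLT_MATMUL_DESC_FAST_ACCUM.
cublasStatus_t MakeFp8FastAccumDesc(cublasLtMatmulDesc_t* desc) {
  cublasStatus_t status =
      cublasLtMatmulDescCreate(desc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) return status;
  // The attribute takes an int8_t flag; nonzero enables fast accumulation.
  const int8_t fast_accum = 1;
  return cublasLtMatmulDescSetAttribute(*desc,
                                        CUBLASLT_MATMUL_DESC_FAST_ACCUM,
                                        &fast_accum, sizeof(fast_accum));
}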
2 changes: 1 addition & 1 deletion xla/stream_executor/cuda/cuda_blas_lt.h
@@ -93,7 +93,7 @@ class BlasLt {
blas::ComputationType compute_type, blas::DataType scale_type,
blas::Transpose trans_a = blas::Transpose::kNoTranspose,
blas::Transpose trans_b = blas::Transpose::kNoTranspose,
Epilogue epilogue = Epilogue::kDefault,
Epilogue epilogue = Epilogue::kDefault, bool fast_accum = false,
PointerMode pointer_mode = PointerMode::kHost);

cublasComputeType_t compute_type() const;
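With the extra parameter in place, a call site that wants fast accumulation passes it just before pointer_mode; only the argument order below comes from this header, the concrete enum values are illustrative.

// Hypothetical call site: all arguments before fast_accum keep their
// existing defaults, so only explicit callers opt in to fast accumulation.
TF_ASSIGN_OR_RETURN(
    se::gpu::BlasLt::MatmulDesc op_desc,
    se::gpu::BlasLt::MatmulDesc::Create(
        se::blas::ComputationType::kF32,      // compute_type
        se::blas::DataType::kFloat,           // scale_type
        se::blas::Transpose::kNoTranspose,    // trans_a
        se::blas::Transpose::kNoTranspose,    // trans_b
        se::gpu::BlasLt::Epilogue::kDefault,  // epilogue
        /*fast_accum=*/true));                // new flag from this PR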