Enable bf16 output in TBE CPU kernel for other input types (#1851)
Summary:
Pull Request resolved: #1851

Enable bf16 output support in the TBE CPU kernel when the input weight type is int8/fp8/fp16/fp32.

Differential Revision: D47028021

fbshipit-source-id: cc09c0c60cca6f4b72036d2e28e452f4f9334b5c
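Note: a bf16 output path means the kernel's fp32 accumulator must be narrowed to bf16 at store time, whatever the input weight type. A minimal standalone sketch of that conversion (round-to-nearest-even; NaN handling omitted) — an illustration of the technique, not FBGEMM's actual code:

#include <cstdint>
#include <cstring>

// Narrow fp32 to bf16 bits: bf16 is the high 16 bits of fp32, so
// round-to-nearest-even reduces to adding 0x7FFF plus the LSB of the
// retained mantissa before truncating.
static inline std::uint16_t float_to_bf16(float x) {
  std::uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits += 0x7FFF + ((bits >> 16) & 1);
  return static_cast<std::uint16_t>(bits >> 16);
}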
excelle08 authored and facebook-github-bot committed Jul 5, 2023
1 parent 8c3849e commit 6f779ef
Showing 7 changed files with 160 additions and 99 deletions.
27 changes: 17 additions & 10 deletions fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp
@@ -229,9 +229,6 @@ for (const auto t : c10::irange(T)) {
// default to 1 byte alignment for CPU TBE
const int32_t D_bytes = nbit::padded_row_size_in_bytes(D, weight_ty, row_alignment);

-// NOTE: currently we only support bf16 output when input is int4 or int2
-TORCH_CHECK(o_dtype != SparseType::BF16 || (o_dtype == SparseType::BF16 && (weight_ty == SparseType::INT4 || weight_ty == SparseType::INT2)));

int tt;
for (tt = t + 1; tt < T && weights_offsets_acc[tt] == weights_offsets_acc[t]; ++tt);
size_t num_rows = ((tt == T ? weight_tensor.numel() : weights_offsets_acc[tt]) - weights_offsets_acc[t]) / D_bytes;
@@ -268,10 +265,13 @@ for (const auto t : c10::irange(T)) {
{% endif %}
/*input_stride=*/D_bytes / sizeof(float),
{% if not nobag %}
-/*scale_bias_last=*/false);
+/*scale_bias_last=*/false,
+/*no_bag=*/false,
+/*is_bf16_out=*/output_is_bf16);
{% else %}
/*scale_bias_last=*/false,
-/*no_bag=*/true);
+/*no_bag=*/true,
+/*is_bf16_out=*/output_is_bf16);
{% endif %}
success = kernel(
{% if not nobag %}
@@ -301,10 +301,13 @@ for (const auto t : c10::irange(T)) {
{% endif %}
/*input_stride=*/D_bytes / sizeof(float16),
{% if not nobag %}
-/*scale_bias_last=*/false);
+/*scale_bias_last=*/false,
+/*no_bag=*/false,
+/*is_bf16_out=*/output_is_bf16);
{% else %}
/*scale_bias_last=*/false,
-/*no_bag=*/true);
+/*no_bag=*/true,
+/*is_bf16_out=*/output_is_bf16);
{% endif %}
success = kernel(
{% if not nobag %}
@@ -333,7 +336,8 @@ for (const auto t : c10::irange(T)) {
{% endif %}
/*input_stride=*/D_bytes / sizeof(uint8_t),
/*exponent_bits=*/fp8_exponent_bits,
/*exponent_bias=*/fp8_exponent_bias);
/*exponent_bias=*/fp8_exponent_bias,
/*is_bf16_out=*/output_is_bf16);
success = kernel(
B,
index_size,
@@ -358,10 +362,13 @@ for (const auto t : c10::irange(T)) {
{% endif %}
/*input_stride=*/D_bytes / sizeof(uint8_t),
{% if not nobag %}
-/*scale_bias_last=*/false);
+/*scale_bias_last=*/false,
+/*no_bag=*/false,
+/*is_bf16_out=*/output_is_bf16);
{% else %}
/*scale_bias_last=*/false,
-/*no_bag=*/true);
+/*no_bag=*/true,
+/*is_bf16_out=*/output_is_bf16);
{% endif %}
success = kernel(
{% if not nobag %}
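The template change threads a single output_is_bf16 flag (presumably o_dtype == SparseType::BF16) into every generated kernel instead of gating bf16 output on the weight type. A scalar stand-in for what the generated kernel does with that flag — fp32 accumulation is unchanged, and is_bf16_out only selects the store-time conversion; names and layout here are illustrative assumptions, not FBGEMM's code:

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <vector>

// Reference embedding-bag forward: pool rows in fp32, then store either
// fp32 or bf16 bits depending on is_bf16_out.
void embedding_bag_ref(
    const float* weights,        // [num_rows][D], already dequantized
    const std::int64_t* indices, // [index_size]
    const std::int64_t* offsets, // [B + 1]
    std::int64_t B,
    std::int64_t D,
    bool is_bf16_out,
    void* out) {                 // float* or bf16 bits (std::uint16_t*)
  auto to_bf16 = [](float x) {
    std::uint32_t v;
    std::memcpy(&v, &x, sizeof(v));
    v += 0x7FFF + ((v >> 16) & 1); // round to nearest even
    return static_cast<std::uint16_t>(v >> 16);
  };
  std::vector<float> acc(D);
  for (std::int64_t b = 0; b < B; ++b) {
    std::fill(acc.begin(), acc.end(), 0.0f);
    for (std::int64_t i = offsets[b]; i < offsets[b + 1]; ++i) {
      const float* row = weights + indices[i] * D;
      for (std::int64_t d = 0; d < D; ++d) {
        acc[d] += row[d];
      }
    }
    if (is_bf16_out) {
      auto* o = static_cast<std::uint16_t*>(out) + b * D;
      for (std::int64_t d = 0; d < D; ++d) o[d] = to_bf16(acc[d]);
    } else {
      std::memcpy(static_cast<float*>(out) + b * D, acc.data(), D * sizeof(float));
    }
  }
}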
2 changes: 1 addition & 1 deletion fbgemm_gpu/test/split_table_batched_embeddings_test.py
@@ -4357,7 +4357,7 @@ def test_nbit_forward_cpu(
)

@given(
-nbit_weights_ty=st.sampled_from([SparseType.INT4, SparseType.INT2]),
+nbit_weights_ty=get_nbit_weights_ty(),
use_array_for_index_remapping=st.booleans(),
do_pruning=st.booleans(),
)
9 changes: 6 additions & 3 deletions include/fbgemm/FbgemmEmbedding.h
@@ -80,7 +80,8 @@ GenerateEmbeddingSpMDM(
int prefetch = 16,
bool is_weight_positional = false,
bool use_offsets = true,
bool isbf16 = false);
bool is_bf16_out = false,
bool is_br16_in = false);

/**
* @param output_stride If -1, output_stride is same as block_size
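The new pair of flags splits the old isbf16: is_bf16_out covers the store side, and is_bf16_in presumably marks 16-bit input rows as bf16 rather than fp16 — a runtime flag is needed because a kernel instantiated for 16-bit elements cannot tell the two formats apart from the C++ type alone (that reading is inferred from the signatures, not stated in the diff). The load-side conversion is the mirror of the store; a minimal sketch:

#include <cstdint>
#include <cstring>

// Widen bf16 bits to fp32: the payload becomes the high half of the word.
static inline float bf16_to_float(std::uint16_t x) {
  std::uint32_t bits = static_cast<std::uint32_t>(x) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}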
@@ -112,7 +113,8 @@ GenerateEmbeddingSpMDMWithStrides(
std::int64_t input_stride = -1,
bool scale_bias_last = true,
bool no_bag = false,
bool isbf16 = false);
bool is_bf16_out = false,
bool is_bf16_in = false);

/**
* @tparam IndexType can be int32_t or int64_t
@@ -195,7 +197,8 @@ GenerateEmbeddingSpMDMFP8WithStrides(
std::int64_t output_stride = -1,
std::int64_t input_stride = -1,
int exponent_bits = 4,
-int exponent_bias = 7);
+int exponent_bias = 7,
+bool is_bf16_out = false);

template <
typename InType,
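The FP8 generator keeps its configurable format (exponent_bits, exponent_bias) and only gains is_bf16_out, so the fp8 decode feeding the fp32 accumulator is untouched. For reference, a generic 8-bit-float decode consistent with those two parameters — an illustrative sketch that ignores NaN/inf encodings and may not match FBGEMM's exact hfp8 semantics:

#include <cmath>
#include <cstdint>

// Decode an 8-bit float with 1 sign bit, `ebits` exponent bits (bias
// `bias`) and 7 - ebits mantissa bits to fp32. ebits = 4 and bias = 7
// match the defaults in the signature above.
float fp8_to_float(std::uint8_t v, int ebits, int bias) {
  const int mbits = 7 - ebits;
  const int sign = v >> 7;
  const int exp = (v >> mbits) & ((1 << ebits) - 1);
  const int man = v & ((1 << mbits) - 1);
  // exp == 0 encodes subnormals (no implicit leading one).
  const float mag = (exp == 0)
      ? std::ldexp(static_cast<float>(man), 1 - bias - mbits)
      : std::ldexp(static_cast<float>(man + (1 << mbits)), exp - bias - mbits);
  return sign ? -mag : mag;
}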
(Diffs for the remaining 4 changed files are not shown.)
