Ifu 2022 07 28 #12

Merged: 15 commits, Jul 28, 2022
2 changes: 1 addition & 1 deletion .github/workflows/fbgemmci.yml
@@ -267,7 +267,7 @@ jobs:
wget https://repo.radeon.com/amdgpu-install/22.10.1/ubuntu/focal/amdgpu-install_22.10.1.50101-1_all.deb
export DEBIAN_FRONTEND=noninteractive
sudo apt install -y ./amdgpu-install_22.10.1.50101-1_all.deb
- amdgpu-install -y --usecase=hiplibsdk,rocm
+ amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms
sudo rm amdgpu-install_22.10.1.50101-1_all.deb

- name: Install dependencies
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -297,6 +297,10 @@ endif()

if(FBGEMM_BUILD_BENCHMARKS)
add_subdirectory(bench)
  # Suppress unused-variable warnings so the benchmark builds with Clang 14
  set_source_files_properties(
    bench/GEMMsBenchmark.cc
    PROPERTIES COMPILE_FLAGS "-Wno-unused-variable")
endif()

if(FBGEMM_BUILD_DOCS)
7 changes: 7 additions & 0 deletions fbgemm_gpu/bench/bench_utils.py
@@ -244,8 +244,15 @@ def benchmark_requests(
func: Callable[[Tensor, Tensor, Optional[Tensor]], Tensor],
flush_gpu_cache_size_mb: int = 0,
check_median: bool = False,
num_warmups: int = 0,
) -> float:
times = []

if num_warmups > 0:
    indices, offsets, weights = requests[0]
    for _ in range(num_warmups):
        func(indices, offsets, weights)

if torch.cuda.is_available():
    torch.cuda.synchronize()
    start_event = torch.cuda.Event(enable_timing=True)
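For readers skimming the diff: the new `num_warmups` argument replays the first request a few times before the timed loop so that one-time costs (CUDA context creation, allocator warm-up, caches) are not billed to the measured iterations. Below is a minimal sketch of the same idea, assuming `requests` is a list of `(indices, offsets, weights)` tuples and `func` is the kernel under test; `timed_with_warmup` is a hypothetical helper, not part of this PR.

```python
import time
from typing import Callable, List, Optional, Tuple

import torch

def timed_with_warmup(
    func: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], torch.Tensor],
    requests: List[Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]],
    num_warmups: int = 5,
) -> float:
    # Warm up on the first request so lazy initialization does not skew timing.
    indices, offsets, weights = requests[0]
    for _ in range(num_warmups):
        func(indices, offsets, weights)
    if torch.cuda.is_available():
        torch.cuda.synchronize()

    start = time.perf_counter()
    for indices, offsets, weights in requests:
        func(indices, offsets, weights)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return (time.perf_counter() - start) / len(requests)
```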
36 changes: 33 additions & 3 deletions fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py
@@ -91,6 +91,7 @@ def cli() -> None:
@click.option("--reuse", default=0.0)
@click.option("--row-wise/--no-row-wise", default=True)
@click.option("--weighted", is_flag=True, default=False)
@click.option("--pooling", type=str, default="sum")
@click.option("--weighted-num-requires-grad", type=int, default=None)
@click.option("--bounds-check-mode", type=int, default=BoundsCheckMode.NONE.value)
@click.option("--flush-gpu-cache-size-mb", default=0)
@@ -113,6 +114,7 @@ def device( # noqa C901
reuse: float,
row_wise: bool,
weighted: bool,
pooling: str,
weighted_num_requires_grad: Optional[int],
bounds_check_mode: int,
flush_gpu_cache_size_mb: int,
@@ -161,6 +163,17 @@ def device( # noqa C901
else:
    managed_option = EmbeddingLocation.MANAGED

if pooling is None or pooling == "sum":
    pooling = "sum"
    pooling_mode = PoolingMode.SUM
    do_pooling = True
elif pooling == "mean":
    pooling_mode = PoolingMode.MEAN
    do_pooling = True
else:  # "none"
    pooling_mode = PoolingMode.NONE
    do_pooling = False

if dense:
emb = DenseTableBatchedEmbeddingBagsCodegen(
[
@@ -170,6 +183,7 @@ def device( # noqa C901
)
for d in Ds
],
pooling_mode=pooling_mode,
use_cpu=not torch.cuda.is_available(),
)
else:
@@ -191,6 +205,7 @@ def device( # noqa C901
weights_precision=weights_precision,
stochastic_rounding=stoc,
output_dtype=output_dtype,
pooling_mode=pooling_mode,
bounds_check_mode=BoundsCheckMode(bounds_check_mode),
)
emb = emb.to(get_device())
@@ -200,6 +215,18 @@ def device( # noqa C901

nparams = sum(w.numel() for w in emb.split_embedding_weights())
param_size_multiplier = weights_precision.bit_rate() / 8.0
output_size_multiplier = output_dtype.bit_rate() / 8.0
if do_pooling:
    read_write_bytes = (
        output_size_multiplier * B * sum(Ds)
        + param_size_multiplier * B * sum(Ds) * L
    )
else:
    read_write_bytes = (
        output_size_multiplier * B * sum(Ds) * L
        + param_size_multiplier * B * sum(Ds) * L
    )

logging.info(
f"Embedding parameters: {nparams / 1.0e9: .2f} GParam, "
f"{nparams * param_size_multiplier / 1.0e9: .2f} GB"
@@ -236,15 +263,18 @@ def device( # noqa C901
logging.info(
f"Forward, B: {B}, "
f"E: {E}, T: {T}, D: {D}, L: {L}, W: {weighted}, "
f"BW: {param_size_multiplier * B * sum(Ds) * L / time_per_iter / 1.0e9: .2f} GB/s, " # noqa: B950
f"BW: {read_write_bytes / time_per_iter / 1.0e9: .2f} GB/s, " # noqa: B950
f"T: {time_per_iter * 1.0e6:.0f}us"
)

if output_dtype == SparseType.INT8:
# backward bench not representative
return

- grad_output = torch.randn(B, sum(Ds)).to(get_device())
+ if do_pooling:
+     grad_output = torch.randn(B, sum(Ds)).to(get_device())
+ else:
+     grad_output = torch.randn(B * T * L, D).to(get_device())
# backward
time_per_iter = benchmark_requests(
requests,
@@ -258,7 +288,7 @@ def device( # noqa C901
)
logging.info(
f"ForwardBackward, B: {B}, E: {E}, T: {T}, D: {D}, L: {L}, "
f"BW: {3 * param_size_multiplier * B * sum(Ds) * L / time_per_iter / 1.0e9: .2f} GB/s, "
f"BW: {3 * read_write_bytes / time_per_iter / 1.0e9: .2f} GB/s, "
f"T: {time_per_iter * 1.0e6:.0f}us"
)

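As a rough sanity check of the new bandwidth accounting (an illustrative sketch with made-up numbers, not the benchmark's own code): the forward pass reads L embedding rows per (sample, table) at the weight precision, and writes either one pooled row per (sample, table) or, with pooling disabled, L unpooled rows at the output precision. The ForwardBackward log line above then scales the same quantity by 3 to approximate the combined forward-plus-backward traffic.

```python
# Hypothetical sizes, for illustration only.
B, T, D, L = 512, 32, 128, 20      # batch size, tables, embedding dim, pooling factor
Ds = [D] * T
param_size_multiplier = 2.0        # bytes per element, e.g. FP16 weights
output_size_multiplier = 2.0       # bytes per element, e.g. FP16 output

def forward_read_write_bytes(do_pooling: bool) -> float:
    # Reads: every sample gathers L rows of sum(Ds) elements at weight precision.
    reads = param_size_multiplier * B * sum(Ds) * L
    # Writes: one pooled row per (sample, table), or L rows when pooling is off.
    writes = output_size_multiplier * B * sum(Ds) * (1 if do_pooling else L)
    return reads + writes

time_per_iter = 2.5e-3             # seconds per iteration, made up
print(f"{forward_read_write_bytes(True) / time_per_iter / 1.0e9: .2f} GB/s")
```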
15 changes: 11 additions & 4 deletions fbgemm_gpu/codegen/embedding_backward_dense_host.cpp
@@ -174,8 +174,12 @@ class SplitLookupFunction_Dense_Op
using torch::autograd::Variable;

auto grad_output = grad_outputs[0];
- if (reinterpret_cast<uint64_t>(grad_output.data_ptr()) % 16 != 0 ||
-     grad_output.stride(1) != 1 || grad_output.stride(0) % 4 != 0) {
+ // FIXME: to support aligned memory access in Vec4T load/store function
+ // 16 for FP32 and 8 for FP16
+ if (reinterpret_cast<uint64_t>(grad_output.data_ptr()) % 16 != 0) {
+   grad_output = at::empty_like(grad_output).copy_(grad_output);
+ } else if (!grad_output.is_contiguous()) {
    grad_output = grad_output.contiguous();
  }

@@ -324,8 +328,11 @@ class SplitNoBagLookupFunction_Dense_Op
using torch::autograd::Variable;

auto grad_output = grad_outputs[0];
- if (reinterpret_cast<uint64_t>(grad_output.data_ptr()) % 16 != 0 ||
-     grad_output.stride(1) != 1 || grad_output.stride(0) % 4 != 0) {
+ // FIXME: to support aligned memory access in Vec4T load/store function
+ // 16 for FP32 and 8 for FP16
+ if (reinterpret_cast<uint64_t>(grad_output.data_ptr()) % 16 != 0) {
+   grad_output = at::empty_like(grad_output).copy_(grad_output);
+ } else if (!grad_output.is_contiguous()) {
    grad_output = grad_output.contiguous();
}

8 changes: 5 additions & 3 deletions fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp
@@ -274,9 +274,11 @@ class Split{{ "NoBag" if nobag else "" }}LookupFunction_{{ optimizer }}_Op :
using torch::autograd::Variable;

auto grad_output = gradient_clipping ? clamp(grad_outputs[0], -max_gradient, max_gradient) : grad_outputs[0];
- if (reinterpret_cast<uint64_t>(grad_output.data_ptr()) % 16 != 0 ||
-     grad_output.stride(1) != 1 ||
-     grad_output.stride(0) % 4 != 0) {
+ // FIXME: to support aligned memory access in Vec4T load/store function
+ // 16 for FP32 and 8 for FP16
+ if (reinterpret_cast<uint64_t>(grad_output.data_ptr()) % 16 != 0) {
+   grad_output = at::empty_like(grad_output).copy_(grad_output);
+ } else if (!grad_output.is_contiguous()) {
    grad_output = grad_output.contiguous();
}

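Background on why both host files drop the stride checks (my reading of the FIXME comment, not text taken from the PR): `Tensor::contiguous()` returns the tensor unchanged when it is already contiguous, so it cannot repair a buffer that is contiguous but not 16-byte aligned, whereas `at::empty_like(...).copy_(...)` always allocates a fresh, well-aligned buffer. A small PyTorch sketch of the distinction, illustrative only:

```python
import torch

base = torch.randn(1025, dtype=torch.float32)
# Slicing one element off the front keeps the view contiguous but shifts the
# data pointer by 4 bytes, so it is no longer 16-byte aligned.
misaligned = base[1:]
print(misaligned.is_contiguous())              # True
print(misaligned.data_ptr() % 16)              # 4 -> unsuitable for Vec4T-style loads

# .contiguous() is a no-op here: same storage, same misaligned pointer.
print(misaligned.contiguous().data_ptr() % 16) # still 4

# empty_like().copy_() allocates a new buffer (aligned by the allocator)
# and copies the values into it.
fixed = torch.empty_like(misaligned).copy_(misaligned)
print(fixed.data_ptr() % 16)                   # 0
```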
110 changes: 110 additions & 0 deletions fbgemm_gpu/fbgemm_gpu/quantize_comm.py
@@ -0,0 +1,110 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


import logging
from typing import Optional

import torch

from fbgemm_gpu.quantize_utils import (
    bf16_to_fp32,
    fp16_to_fp32,
    fp32_to_bf16_with_clamp,
    fp32_to_fp16_with_clamp,
    fp32_to_hfp8_with_clamp,
    hfp8_to_fp32,
)
from fbgemm_gpu.split_embedding_configs import SparseType
from torch.autograd.profiler import record_function

logger: logging.Logger = logging.getLogger()


def _quantize_tensor(
    input_tensor: torch.Tensor,
    comm_precision: SparseType,
) -> torch.Tensor:
    if comm_precision == SparseType.FP32:
        return input_tensor
    elif comm_precision == SparseType.FP16:
        return fp32_to_fp16_with_clamp(input_tensor)
    elif comm_precision == SparseType.BF16:
        return fp32_to_bf16_with_clamp(input_tensor)
    elif comm_precision == SparseType.FP8:
        return fp32_to_hfp8_with_clamp(input_tensor)
    else:
        raise ValueError(f"comm_precision={comm_precision} is not supported")


def _dequantize_tensor(
    quantized_tensor: torch.Tensor,
    comm_precision: SparseType,
) -> torch.Tensor:
    if comm_precision == SparseType.FP32:
        assert quantized_tensor.dtype == torch.float
        return quantized_tensor
    elif comm_precision == SparseType.FP16:
        assert quantized_tensor.dtype == torch.half
        return fp16_to_fp32(quantized_tensor)
    elif comm_precision == SparseType.BF16:
        assert quantized_tensor.dtype == torch.bfloat16
        return bf16_to_fp32(quantized_tensor)
    elif comm_precision == SparseType.FP8:
        assert quantized_tensor.dtype == torch.uint8
        return hfp8_to_fp32(quantized_tensor)
    else:
        raise ValueError(f"comm_precision={comm_precision} is not supported")


class QuantizedCommCodec:
    def __init__(
        self,
        comm_precision: SparseType,
        loss_scale: Optional[float] = None,
    ) -> None:

        if loss_scale is not None:
            if comm_precision not in [SparseType.FP16, SparseType.BF16]:
                logger.warning(
                    f"Setting loss scale for comm_precision={comm_precision} is not supported. Overriding to None"
                )
                loss_scale = None

        logger.info(
            f"Creating QuantizedCommsCodec comm_precision:{comm_precision}, loss_scale:{loss_scale}"
        )

        self._comm_precision = comm_precision
        self._loss_scale = loss_scale

    def encode(self, input_tensor: torch.Tensor) -> torch.Tensor:
        if self._loss_scale is not None:
            input_tensor = self._loss_scale * input_tensor
        with record_function(
            f"## encoder {self._comm_precision} {self._loss_scale} ##"
        ):
            return _quantize_tensor(input_tensor, self._comm_precision)

    def decode(self, input_grad: torch.Tensor) -> torch.Tensor:
        if self._loss_scale is not None:
            input_grad = input_grad / self._loss_scale
        with record_function(
            f"## decoder {self._comm_precision} {self._loss_scale} ##"
        ):
            dequantized_tensor = _dequantize_tensor(input_grad, self._comm_precision)
        return dequantized_tensor

    @property
    def quantized_dtype(self) -> torch.dtype:
        if self._comm_precision == SparseType.FP16:
            return torch.half
        elif self._comm_precision == SparseType.BF16:
            return torch.bfloat16
        elif self._comm_precision == SparseType.FP8:
            return torch.uint8
        return torch.float
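A quick usage sketch for the new codec (assumes `fbgemm_gpu` is installed; the FP16 round trip is lossy, so values only match to within half-precision error):

```python
import torch

from fbgemm_gpu.quantize_comm import QuantizedCommCodec
from fbgemm_gpu.split_embedding_configs import SparseType

codec = QuantizedCommCodec(SparseType.FP16, loss_scale=128.0)

grad = torch.randn(1024, dtype=torch.float32)
wire = codec.encode(grad)             # scaled by loss_scale, cast to torch.half
assert wire.dtype == codec.quantized_dtype

restored = codec.decode(wire)         # unscaled and converted back to FP32
print(torch.max(torch.abs(restored - grad)))  # small quantization error
```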