Ifu 2022 07 28 #12

Merged: 15 commits, Jul 28, 2022
2 changes: 1 addition & 1 deletion .github/workflows/fbgemmci.yml
@@ -267,7 +267,7 @@ jobs:
wget https://repo.radeon.com/amdgpu-install/22.10.1/ubuntu/focal/amdgpu-install_22.10.1.50101-1_all.deb
export DEBIAN_FRONTEND=noninteractive
sudo apt install -y ./amdgpu-install_22.10.1.50101-1_all.deb
- amdgpu-install -y --usecase=hiplibsdk,rocm
+ amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms
sudo rm amdgpu-install_22.10.1.50101-1_all.deb

- name: Install dependencies
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -297,6 +297,10 @@ endif()

if(FBGEMM_BUILD_BENCHMARKS)
add_subdirectory(bench)
  # Suppress unused-variable warnings so the benchmark builds with Clang 14
  set_source_files_properties(
    bench/GEMMsBenchmark.cc
    PROPERTIES COMPILE_FLAGS "-Wno-unused-variable")
endif()

if(FBGEMM_BUILD_DOCS)
7 changes: 7 additions & 0 deletions fbgemm_gpu/bench/bench_utils.py
@@ -244,8 +244,15 @@ def benchmark_requests(
func: Callable[[Tensor, Tensor, Optional[Tensor]], Tensor],
flush_gpu_cache_size_mb: int = 0,
check_median: bool = False,
num_warmups: int = 0,
) -> float:
times = []

if num_warmups > 0:
    indices, offsets, weights = requests[0]
    for _ in range(num_warmups):
        func(indices, offsets, weights)

if torch.cuda.is_available():
    torch.cuda.synchronize()
    start_event = torch.cuda.Event(enable_timing=True)
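For readers skimming the diff: the new `num_warmups` argument replays the first request a few times before the timed loop so that one-time costs (CUDA context creation, allocator warm-up, caches) are not billed to the measured iterations. Below is a minimal sketch of the same idea, assuming `requests` is a list of `(indices, offsets, weights)` tuples and `func` is the kernel under test; `timed_with_warmup` is a hypothetical helper, not part of this PR.

```python
import time
from typing import Callable, List, Optional, Tuple

import torch

def timed_with_warmup(
    func: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], torch.Tensor],
    requests: List[Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]],
    num_warmups: int = 5,
) -> float:
    # Warm up on the first request so lazy initialization does not skew timing.
    indices, offsets, weights = requests[0]
    for _ in range(num_warmups):
        func(indices, offsets, weights)
    if torch.cuda.is_available():
        torch.cuda.synchronize()

    start = time.perf_counter()
    for indices, offsets, weights in requests:
        func(indices, offsets, weights)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return (time.perf_counter() - start) / len(requests)
```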
36 changes: 33 additions & 3 deletions fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py
@@ -91,6 +91,7 @@ def cli() -> None:
@click.option("--reuse", default=0.0)
@click.option("--row-wise/--no-row-wise", default=True)
@click.option("--weighted", is_flag=True, default=False)
@click.option("--pooling", type=str, default="sum")
@click.option("--weighted-num-requires-grad", type=int, default=None)
@click.option("--bounds-check-mode", type=int, default=BoundsCheckMode.NONE.value)
@click.option("--flush-gpu-cache-size-mb", default=0)
@@ -113,6 +114,7 @@ def device( # noqa C901
reuse: float,
row_wise: bool,
weighted: bool,
pooling: str,
weighted_num_requires_grad: Optional[int],
bounds_check_mode: int,
flush_gpu_cache_size_mb: int,
@@ -161,6 +163,17 @@ def device( # noqa C901
else:
    managed_option = EmbeddingLocation.MANAGED

if pooling is None or pooling == "sum":
    pooling = "sum"
    pooling_mode = PoolingMode.SUM
    do_pooling = True
elif pooling == "mean":
    pooling_mode = PoolingMode.MEAN
    do_pooling = True
else:  # "none"
    pooling_mode = PoolingMode.NONE
    do_pooling = False

if dense:
emb = DenseTableBatchedEmbeddingBagsCodegen(
[
@@ -170,6 +183,7 @@ def device( # noqa C901
)
for d in Ds
],
pooling_mode=pooling_mode,
use_cpu=not torch.cuda.is_available(),
)
else:
@@ -191,6 +205,7 @@ def device( # noqa C901
weights_precision=weights_precision,
stochastic_rounding=stoc,
output_dtype=output_dtype,
pooling_mode=pooling_mode,
bounds_check_mode=BoundsCheckMode(bounds_check_mode),
)
emb = emb.to(get_device())
@@ -200,6 +215,18 @@ def device( # noqa C901

nparams = sum(w.numel() for w in emb.split_embedding_weights())
param_size_multiplier = weights_precision.bit_rate() / 8.0
output_size_multiplier = output_dtype.bit_rate() / 8.0
if do_pooling:
    read_write_bytes = (
        output_size_multiplier * B * sum(Ds)
        + param_size_multiplier * B * sum(Ds) * L
    )
else:
    read_write_bytes = (
        output_size_multiplier * B * sum(Ds) * L
        + param_size_multiplier * B * sum(Ds) * L
    )

logging.info(
f"Embedding parameters: {nparams / 1.0e9: .2f} GParam, "
f"{nparams * param_size_multiplier / 1.0e9: .2f} GB"
@@ -236,15 +263,18 @@ def device( # noqa C901
logging.info(
f"Forward, B: {B}, "
f"E: {E}, T: {T}, D: {D}, L: {L}, W: {weighted}, "
f"BW: {param_size_multiplier * B * sum(Ds) * L / time_per_iter / 1.0e9: .2f} GB/s, " # noqa: B950
f"BW: {read_write_bytes / time_per_iter / 1.0e9: .2f} GB/s, " # noqa: B950
f"T: {time_per_iter * 1.0e6:.0f}us"
)

if output_dtype == SparseType.INT8:
# backward bench not representative
return

- grad_output = torch.randn(B, sum(Ds)).to(get_device())
+ if do_pooling:
+     grad_output = torch.randn(B, sum(Ds)).to(get_device())
+ else:
+     grad_output = torch.randn(B * T * L, D).to(get_device())
# backward
time_per_iter = benchmark_requests(
requests,
@@ -258,7 +288,7 @@ def device( # noqa C901
)
logging.info(
f"ForwardBackward, B: {B}, E: {E}, T: {T}, D: {D}, L: {L}, "
f"BW: {3 * param_size_multiplier * B * sum(Ds) * L / time_per_iter / 1.0e9: .2f} GB/s, "
f"BW: {3 * read_write_bytes / time_per_iter / 1.0e9: .2f} GB/s, "
f"T: {time_per_iter * 1.0e6:.0f}us"
)

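As a rough sanity check of the new bandwidth accounting (an illustrative sketch with made-up numbers, not the benchmark's own code): the forward pass reads L embedding rows per (sample, table) at the weight precision, and writes either one pooled row per (sample, table) or, with pooling disabled, L unpooled rows at the output precision. The ForwardBackward log line above then scales the same quantity by 3 to approximate the combined forward-plus-backward traffic.

```python
# Hypothetical sizes, for illustration only.
B, T, D, L = 512, 32, 128, 20      # batch size, tables, embedding dim, pooling factor
Ds = [D] * T
param_size_multiplier = 2.0        # bytes per element, e.g. FP16 weights
output_size_multiplier = 2.0       # bytes per element, e.g. FP16 output

def forward_read_write_bytes(do_pooling: bool) -> float:
    # Reads: every sample gathers L rows of sum(Ds) elements at weight precision.
    reads = param_size_multiplier * B * sum(Ds) * L
    # Writes: one pooled row per (sample, table), or L rows when pooling is off.
    writes = output_size_multiplier * B * sum(Ds) * (1 if do_pooling else L)
    return reads + writes

time_per_iter = 2.5e-3             # seconds per iteration, made up
print(f"{forward_read_write_bytes(True) / time_per_iter / 1.0e9: .2f} GB/s")
```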
15 changes: 11 additions & 4 deletions fbgemm_gpu/codegen/embedding_backward_dense_host.cpp
@@ -174,8 +174,12 @@ class SplitLookupFunction_Dense_Op
using torch::autograd::Variable;

auto grad_output = grad_outputs[0];
- if (reinterpret_cast<uint64_t>(grad_output.data_ptr()) % 16 != 0 ||
-     grad_output.stride(1) != 1 || grad_output.stride(0) % 4 != 0) {
+ // FIXME: to support aligned memory access in Vec4T load/store function
+ // 16 for FP32 and 8 for FP16
+ if (reinterpret_cast<uint64_t>(grad_output.data_ptr()) % 16 != 0) {
+   grad_output = at::empty_like(grad_output).copy_(grad_output);
+ } else if (!grad_output.is_contiguous()) {
    grad_output = grad_output.contiguous();
  }

@@ -324,8 +328,11 @@ class SplitNoBagLookupFunction_Dense_Op
using torch::autograd::Variable;

auto grad_output = grad_outputs[0];
- if (reinterpret_cast<uint64_t>(grad_output.data_ptr()) % 16 != 0 ||
-     grad_output.stride(1) != 1 || grad_output.stride(0) % 4 != 0) {
+ // FIXME: to support aligned memory access in Vec4T load/store function
+ // 16 for FP32 and 8 for FP16
+ if (reinterpret_cast<uint64_t>(grad_output.data_ptr()) % 16 != 0) {
+   grad_output = at::empty_like(grad_output).copy_(grad_output);
+ } else if (!grad_output.is_contiguous()) {
    grad_output = grad_output.contiguous();
}

8 changes: 5 additions & 3 deletions fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp
@@ -274,9 +274,11 @@ class Split{{ "NoBag" if nobag else "" }}LookupFunction_{{ optimizer }}_Op :
using torch::autograd::Variable;

auto grad_output = gradient_clipping ? clamp(grad_outputs[0], -max_gradient, max_gradient) : grad_outputs[0];
- if (reinterpret_cast<uint64_t>(grad_output.data_ptr()) % 16 != 0 ||
-     grad_output.stride(1) != 1 ||
-     grad_output.stride(0) % 4 != 0) {
+ // FIXME: to support aligned memory access in Vec4T load/store function
+ // 16 for FP32 and 8 for FP16
+ if (reinterpret_cast<uint64_t>(grad_output.data_ptr()) % 16 != 0) {
+   grad_output = at::empty_like(grad_output).copy_(grad_output);
+ } else if (!grad_output.is_contiguous()) {
    grad_output = grad_output.contiguous();
}

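Background on why both host files drop the stride checks (my reading of the FIXME comment, not text taken from the PR): `Tensor::contiguous()` returns the tensor unchanged when it is already contiguous, so it cannot repair a buffer that is contiguous but not 16-byte aligned, whereas `at::empty_like(...).copy_(...)` always allocates a fresh, well-aligned buffer. A small PyTorch sketch of the distinction, illustrative only:

```python
import torch

base = torch.randn(1025, dtype=torch.float32)
# Slicing one element off the front keeps the view contiguous but shifts the
# data pointer by 4 bytes, so it is no longer 16-byte aligned.
misaligned = base[1:]
print(misaligned.is_contiguous())              # True
print(misaligned.data_ptr() % 16)              # 4 -> unsuitable for Vec4T-style loads

# .contiguous() is a no-op here: same storage, same misaligned pointer.
print(misaligned.contiguous().data_ptr() % 16) # still 4

# empty_like().copy_() allocates a new buffer (aligned by the allocator)
# and copies the values into it.
fixed = torch.empty_like(misaligned).copy_(misaligned)
print(fixed.data_ptr() % 16)                   # 0
```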
110 changes: 110 additions & 0 deletions fbgemm_gpu/fbgemm_gpu/quantize_comm.py
@@ -0,0 +1,110 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


import logging
from typing import Optional

import torch

from fbgemm_gpu.quantize_utils import (
    bf16_to_fp32,
    fp16_to_fp32,
    fp32_to_bf16_with_clamp,
    fp32_to_fp16_with_clamp,
    fp32_to_hfp8_with_clamp,
    hfp8_to_fp32,
)
from fbgemm_gpu.split_embedding_configs import SparseType
from torch.autograd.profiler import record_function

logger: logging.Logger = logging.getLogger()


def _quantize_tensor(
    input_tensor: torch.Tensor,
    comm_precision: SparseType,
) -> torch.Tensor:
    if comm_precision == SparseType.FP32:
        return input_tensor
    elif comm_precision == SparseType.FP16:
        return fp32_to_fp16_with_clamp(input_tensor)
    elif comm_precision == SparseType.BF16:
        return fp32_to_bf16_with_clamp(input_tensor)
    elif comm_precision == SparseType.FP8:
        return fp32_to_hfp8_with_clamp(input_tensor)
    else:
        raise ValueError(f"comm_precision={comm_precision} is not supported")


def _dequantize_tensor(
    quantized_tensor: torch.Tensor,
    comm_precision: SparseType,
) -> torch.Tensor:
    if comm_precision == SparseType.FP32:
        assert quantized_tensor.dtype == torch.float
        return quantized_tensor
    elif comm_precision == SparseType.FP16:
        assert quantized_tensor.dtype == torch.half
        return fp16_to_fp32(quantized_tensor)
    elif comm_precision == SparseType.BF16:
        assert quantized_tensor.dtype == torch.bfloat16
        return bf16_to_fp32(quantized_tensor)
    elif comm_precision == SparseType.FP8:
        assert quantized_tensor.dtype == torch.uint8
        return hfp8_to_fp32(quantized_tensor)
    else:
        raise ValueError(f"comm_precision={comm_precision} is not supported")


class QuantizedCommCodec:
    def __init__(
        self,
        comm_precision: SparseType,
        loss_scale: Optional[float] = None,
    ) -> None:

        if loss_scale is not None:
            if comm_precision not in [SparseType.FP16, SparseType.BF16]:
                logger.warning(
                    f"Setting loss scale for comm_precision={comm_precision} is not supported. Overriding to None"
                )
                loss_scale = None

        logger.info(
            f"Creating QuantizedCommsCodec comm_precision:{comm_precision}, loss_scale:{loss_scale}"
        )

        self._comm_precision = comm_precision
        self._loss_scale = loss_scale

    def encode(self, input_tensor: torch.Tensor) -> torch.Tensor:
        if self._loss_scale is not None:
            input_tensor = self._loss_scale * input_tensor
        with record_function(
            f"## encoder {self._comm_precision} {self._loss_scale} ##"
        ):
            return _quantize_tensor(input_tensor, self._comm_precision)

    def decode(self, input_grad: torch.Tensor) -> torch.Tensor:
        if self._loss_scale is not None:
            input_grad = input_grad / self._loss_scale
        with record_function(
            f"## decoder {self._comm_precision} {self._loss_scale} ##"
        ):
            dequantized_tensor = _dequantize_tensor(input_grad, self._comm_precision)
        return dequantized_tensor

    @property
    def quantized_dtype(self) -> torch.dtype:
        if self._comm_precision == SparseType.FP16:
            return torch.half
        elif self._comm_precision == SparseType.BF16:
            return torch.bfloat16
        elif self._comm_precision == SparseType.FP8:
            return torch.uint8
        return torch.float
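A quick usage sketch for the new codec (assumes `fbgemm_gpu` is installed; the FP16 round trip is lossy, so values only match to within half-precision error):

```python
import torch

from fbgemm_gpu.quantize_comm import QuantizedCommCodec
from fbgemm_gpu.split_embedding_configs import SparseType

codec = QuantizedCommCodec(SparseType.FP16, loss_scale=128.0)

grad = torch.randn(1024, dtype=torch.float32)
wire = codec.encode(grad)             # scaled by loss_scale, cast to torch.half
assert wire.dtype == codec.quantized_dtype

restored = codec.decode(wire)         # unscaled and converted back to FP32
print(torch.max(torch.abs(restored - grad)))  # small quantization error
```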