Minor fix to nbit_device_with_spec
Summary:
(1) Fix the bandwidth calculation: T shouldn't be a multiplier, since we already add each individual table's contribution from its own config.

(2) Adjust zipf_oversample_ratio in request generation: it was previously hardcoded to 3, which is too small when bag_size is small (though the new value this diff sets is still somewhat arbitrary).

(3) When generating requests, change alpha to 1.0 (even if the user argument was > 1.0) when num_embeddings / bag_size < 2.0 -- the zipf distribution becomes too skewed and sampling fails when this ratio is too low (again, the 2.0 threshold is somewhat arbitrary).

Differential Revision: D41370889

fbshipit-source-id: 242111a0f0b6cc7e4efb9b4097d3ecdd3a4957c0
doehyun authored and facebook-github-bot committed Nov 17, 2022
1 parent d0af623 commit 17a1b50
Showing 2 changed files with 9 additions and 5 deletions.
fbgemm_gpu/bench/bench_utils.py (3 changes: 2 additions & 1 deletion)
@@ -135,6 +135,7 @@ def generate_requests(
# alpha <= 1.0: use uniform distribution
# alpha > 1.0: use zipf distribution
alpha: float = 1.0,
+ zipf_oversample_ratio: int = 3,
weights_precision: SparseType = SparseType.FP32,
weighted: bool = False,
requests_data_file: Optional[str] = None,
@@ -221,7 +222,7 @@ def generate_requests(
assert E >= L, "num-embeddings must be greater than equal to bag-size"
# oversample and then remove duplicates to obtain sampling without
# replacement
- zipf_shape = (iters, T, B, 3 * L)
+ zipf_shape = (iters, T, B, zipf_oversample_ratio * L)
if torch.cuda.is_available():
zipf_shape_total_len = np.prod(zipf_shape)
all_indices_list = []
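For readers unfamiliar with the oversample-then-deduplicate trick the hunk above parameterizes, here is a minimal per-bag sketch (a hypothetical helper, not the actual generate_requests implementation; it assumes numpy's np.random.zipf and folds indices into [0, E)):

```python
import numpy as np

def sample_bag_without_replacement(
    E: int, L: int, alpha: float, zipf_oversample_ratio: int
) -> np.ndarray:
    """Draw Zipf(alpha)-distributed indices in [0, E) and keep the first L unique ones."""
    # Oversample by zipf_oversample_ratio, then dedupe to emulate sampling
    # without replacement (this mirrors the zipf_shape change above).
    raw = (np.random.zipf(a=alpha, size=zipf_oversample_ratio * L) - 1) % E
    unique = np.unique(raw)
    # Per the commit summary, a hardcoded ratio of 3 can leave fewer than L
    # unique indices when L is small, hence the new parameter.
    assert unique.shape[0] >= L, "increase zipf_oversample_ratio"
    return unique[:L]

# Illustrative call with made-up sizes:
bag = sample_bag_without_replacement(E=1000, L=20, alpha=1.15, zipf_oversample_ratio=3)
```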
fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py (11 changes: 7 additions & 4 deletions)
@@ -1274,15 +1274,15 @@ def nbit_device_with_spec( # noqa C901
read_write_bytes = sum(
[
output_size_multiplier * B * d
- + param_size_multiplier * B * T * bag_size * d
+ + param_size_multiplier * B * bag_size * d
for bag_size, d in zip(Ls, Ds)
]
)
else:
read_write_bytes = sum(
[
- output_size_multiplier * B * T * bag_size * d
- + param_size_multiplier * B * T * bag_size * d
+ output_size_multiplier * B * bag_size * d
+ + param_size_multiplier * B * bag_size * d
for bag_size, d in zip(Ls, Ds)
]
)
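To illustrate the fix in this hunk: Ls and Ds already hold one (bag size, embedding dim) entry per table, so summing over zip(Ls, Ds) covers all T tables on its own, and the extra factor of T counted each table's traffic T times. A hedged sketch with made-up sizes (B, the multipliers, and the table specs are illustrative only):

```python
# Hypothetical spec: T = 3 tables, each with its own bag size and embedding dim.
B = 8                        # batch size
Ls = [20, 40, 10]            # bag sizes, one entry per table
Ds = [64, 128, 256]          # embedding dims, one entry per table
T = len(Ls)

param_size_multiplier = 1    # e.g. int8 weights: ~1 byte per element (illustrative)
output_size_multiplier = 2   # e.g. fp16 outputs: 2 bytes per element (illustrative)

# Fixed formula: the per-table sum already visits every table exactly once.
fixed = sum(
    output_size_multiplier * B * d + param_size_multiplier * B * bag_size * d
    for bag_size, d in zip(Ls, Ds)
)

# Old formula: the parameter-read term was additionally multiplied by T,
# counting each table's embedding reads T times.
old = sum(
    output_size_multiplier * B * d + param_size_multiplier * B * T * bag_size * d
    for bag_size, d in zip(Ls, Ds)
)

print(fixed, old)  # the old estimate is inflated by roughly a factor of T
```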
@@ -1312,7 +1312,10 @@ def nbit_device_with_spec( # noqa C901
bag_size,
e,
reuse=reuse,
- alpha=alpha,
+ # don't use zipf if e isn't large enough compared to bag_size.
+ alpha=alpha if (e / bag_size) > 2.0 else 1.0,
+ # need many more samples for zipf if bag_size is very small.
+ zipf_oversample_ratio=3 if bag_size > 5 else 10,
weights_precision=weights_precision,
weighted=weighted,
)
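The two new call-site arguments above encode the thresholds from the commit summary. A small sketch of that selection logic as a standalone helper (pick_sampling_params is hypothetical, and the 2.0 / 5 / 3 / 10 constants are the admittedly arbitrary values chosen in this diff):

```python
def pick_sampling_params(e: int, bag_size: int, alpha: float) -> dict:
    """Choose request-generation parameters for one table spec."""
    return {
        # A Zipf distribution with alpha > 1.0 concentrates mass on a few indices;
        # if the table has fewer than ~2x bag_size rows, deduplicated draws cannot
        # reliably fill a bag, so fall back to the uniform path (alpha <= 1.0).
        "alpha": alpha if (e / bag_size) > 2.0 else 1.0,
        # Small bags need proportionally more oversampling before deduplication.
        "zipf_oversample_ratio": 3 if bag_size > 5 else 10,
    }

# Example: a 50-row table with bag size 40 falls back to uniform sampling.
print(pick_sampling_params(e=50, bag_size=40, alpha=1.15))
# -> {'alpha': 1.0, 'zipf_oversample_ratio': 3}
```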
