q10
diff --git a/‎fbgemm_gpu/bench/tbe/tbe_training_benchmark.py
Lines changed: 32 additions & 53 deletions b/‎fbgemm_gpu/bench/tbe/tbe_training_benchmark.py
Lines changed: 32 additions & 53 deletions
diff --git a/‎fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_common.py
Lines changed: 14 additions & 13 deletions b/‎fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_common.py
Lines changed: 14 additions & 13 deletions
diff --git a/‎fbgemm_gpu/fbgemm_gpu/tbe/bench/__init__.py
Lines changed: 8 additions & 3 deletions b/‎fbgemm_gpu/fbgemm_gpu/tbe/bench/__init__.py
Lines changed: 8 additions & 3 deletions
diff --git a/‎fbgemm_gpu/fbgemm_gpu/tbe/bench/embedding_ops_common_config.py
Lines changed: 151 additions & 0 deletions b/‎fbgemm_gpu/fbgemm_gpu/tbe/bench/embedding_ops_common_config.py
Lines changed: 151 additions & 0 deletions
@@ -19,10 +19,8 @@
 import torch
 from fbgemm_gpu.split_embedding_configs import EmbOptimType as OptimType, SparseType
 from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
-    BoundsCheckMode,
     CacheAlgorithm,
     EmbeddingLocation,
-    str_to_embedding_location,
     str_to_pooling_mode,
 )
 from fbgemm_gpu.split_table_batched_embeddings_ops_training import (
@@ -32,6 +30,7 @@
 )
 from fbgemm_gpu.tbe.bench import (
     benchmark_requests,
+    EmbeddingOpsCommonConfigLoader,
     TBEBenchmarkingConfigLoader,
     TBEDataConfigLoader,
 )
@@ -50,50 +49,39 @@ def cli() -> None:
 
 
 @cli.command()
-@click.option("--weights-precision", type=SparseType, default=SparseType.FP32)
-@click.option("--cache-precision", type=SparseType, default=None)
-@click.option("--stoc", is_flag=True, default=False)
-@click.option(
-    "--managed",
-    default="device",
-    type=click.Choice(["device", "managed", "managed_caching"], case_sensitive=False),
-)
 @click.option(
     "--emb-op-type",
     default="split",
     type=click.Choice(["split", "dense", "ssd"], case_sensitive=False),
+    help="The type of the embedding op to benchmark",
+)
+# Chooses between the EXACT_ROWWISE_ADAGRAD (default) and EXACT_ADAGRAD optimizers
+@click.option(
+    "--row-wise/--no-row-wise",
+    default=True,
+    help="Whether to use row-wise adagrad optimizer or not",
 )
-@click.option("--row-wise/--no-row-wise", default=True)
-@click.option("--pooling", type=str, default="sum")
-@click.option("--weighted-num-requires-grad", type=int, default=None)
-@click.option("--bounds-check-mode", type=int, default=BoundsCheckMode.NONE.value)
-@click.option("--output-dtype", type=SparseType, default=SparseType.FP32)
 @click.option(
-    "--uvm-host-mapped",
-    is_flag=True,
-    default=False,
-    help="Use host mapped UVM buffers in SSD-TBE (malloc+cudaHostRegister)",
+    "--weighted-num-requires-grad",
+    type=int,
+    default=None,
+    help="The number of weighted tables that require gradient",
 )
 @click.option(
-    "--ssd-prefix", type=str, default="/tmp/ssd_benchmark", help="SSD directory prefix"
+    "--ssd-prefix",
+    type=str,
+    default="/tmp/ssd_benchmark",
+    help="SSD directory prefix",
 )
 @click.option("--cache-load-factor", default=0.2)
 @TBEBenchmarkingConfigLoader.options
 @TBEDataConfigLoader.options
+@EmbeddingOpsCommonConfigLoader.options
 @click.pass_context
 def device(  # noqa C901
     context: click.Context,
     emb_op_type: click.Choice,
-    weights_precision: SparseType,
-    cache_precision: Optional[SparseType],
-    stoc: bool,
-    managed: click.Choice,
     row_wise: bool,
-    pooling: str,
     weighted_num_requires_grad: Optional[int],
-    bounds_check_mode: int,
-    output_dtype: SparseType,
-    uvm_host_mapped: bool,
     cache_load_factor: float,
     # SSD params
     ssd_prefix: str,
@@ -110,6 +98,9 @@ def device(  # noqa C901
     # Load TBE data configuration from cli arguments
     tbeconfig = TBEDataConfigLoader.load(context)
 
+    # Load common embedding op configuration from cli arguments
+    embconfig = EmbeddingOpsCommonConfigLoader.load(context)
+
     # Generate feature_requires_grad
     feature_requires_grad = (
         tbeconfig.generate_feature_requires_grad(weighted_num_requires_grad)
@@ -123,22 +114,8 @@ def device(  # noqa C901
     # Determine the optimizer
     optimizer = OptimType.EXACT_ROWWISE_ADAGRAD if row_wise else OptimType.EXACT_ADAGRAD
 
-    # Determine the embedding location
-    embedding_location = str_to_embedding_location(str(managed))
-    if embedding_location is EmbeddingLocation.DEVICE and not torch.cuda.is_available():
-        embedding_location = EmbeddingLocation.HOST
-
-    # Determine the pooling mode
-    pooling_mode = str_to_pooling_mode(pooling)
-
     # Construct the common split arguments for the embedding op
-    common_split_args: Dict[str, Any] = {
-        "weights_precision": weights_precision,
-        "stochastic_rounding": stoc,
-        "output_dtype": output_dtype,
-        "pooling_mode": pooling_mode,
-        "bounds_check_mode": BoundsCheckMode(bounds_check_mode),
-        "uvm_host_mapped": uvm_host_mapped,
+    common_split_args: Dict[str, Any] = embconfig.split_args() | {
         "optimizer": optimizer,
         "learning_rate": 0.1,
         "eps": 0.1,
@@ -154,7 +131,7 @@ def device(  # noqa C901
                 )
                 for d in Ds
             ],
-            pooling_mode=pooling_mode,
+            pooling_mode=embconfig.pooling_mode,
             use_cpu=not torch.cuda.is_available(),
         )
     elif emb_op_type == "ssd":
@@ -177,7 +154,7 @@ def device(  # noqa C901
                 (
                     tbeconfig.E,
                     d,
-                    embedding_location,
+                    embconfig.embedding_location,
                     (
                         ComputeDevice.CUDA
                         if torch.cuda.is_available()
@@ -187,25 +164,27 @@ def device(  # noqa C901
                 for d in Ds
             ],
             cache_precision=(
-                weights_precision if cache_precision is None else cache_precision
+                embconfig.weights_dtype
+                if embconfig.cache_dtype is None
+                else embconfig.cache_dtype
             ),
             cache_algorithm=CacheAlgorithm.LRU,
             cache_load_factor=cache_load_factor,
             **common_split_args,
         )
     embedding_op = embedding_op.to(get_device())
 
-    if weights_precision == SparseType.INT8:
+    if embconfig.weights_dtype == SparseType.INT8:
         # pyre-fixme[29]: `Union[(self: DenseTableBatchedEmbeddingBagsCodegen,
         #  min_val: float, max_val: float) -> None, (self:
         #  SplitTableBatchedEmbeddingBagsCodegen, min_val: float, max_val: float) ->
         #  None, Tensor, Module]` is not a function.
         embedding_op.init_embedding_weights_uniform(-0.0003, 0.0003)
 
     nparams = sum(d * tbeconfig.E for d in Ds)
-    param_size_multiplier = weights_precision.bit_rate() / 8.0
-    output_size_multiplier = output_dtype.bit_rate() / 8.0
-    if pooling_mode.do_pooling():
+    param_size_multiplier = embconfig.weights_dtype.bit_rate() / 8.0
+    output_size_multiplier = embconfig.output_dtype.bit_rate() / 8.0
+    if embconfig.pooling_mode.do_pooling():
         read_write_bytes = (
             output_size_multiplier * tbeconfig.batch_params.B * sum(Ds)
             + param_size_multiplier
@@ -225,7 +204,7 @@ def device(  # noqa C901
             * tbeconfig.pooling_params.L
         )
 
-    logging.info(f"Managed option: {managed}")
+    logging.info(f"Managed option: {embconfig.embedding_location}")
     logging.info(
         f"Embedding parameters: {nparams / 1.0e9: .2f} GParam, "
         f"{nparams * param_size_multiplier / 1.0e9: .2f} GB"
@@ -274,11 +253,11 @@ def _context_factory(on_trace_ready: Callable[[profile], None]):
         f"T: {time_per_iter * 1.0e6:.0f}us"
     )
 
-    if output_dtype == SparseType.INT8:
+    if embconfig.output_dtype == SparseType.INT8:
         # backward bench not representative
         return
 
-    if pooling_mode.do_pooling():
+    if embconfig.pooling_mode.do_pooling():
         grad_output = torch.randn(tbeconfig.batch_params.B, sum(Ds)).to(get_device())
     else:
         grad_output = torch.randn(
 
@@ -33,19 +33,20 @@ class EmbeddingLocation(enum.IntEnum):
     HOST = 3
     MTIA = 4
 
-
-def str_to_embedding_location(key: str) -> EmbeddingLocation:
-    lookup = {
-        "device": EmbeddingLocation.DEVICE,
-        "managed": EmbeddingLocation.MANAGED,
-        "managed_caching": EmbeddingLocation.MANAGED_CACHING,
-        "host": EmbeddingLocation.HOST,
-        "mtia": EmbeddingLocation.MTIA,
-    }
-    if key in lookup:
-        return lookup[key]
-    else:
-        raise ValueError(f"Cannot parse value into EmbeddingLocation: {key}")
+    @classmethod
+    def from_str(cls, key: str) -> "EmbeddingLocation":
+        """
+        Parse a location name into the matching `EmbeddingLocation` member.
+
+        Args:
+            key (str): One of "device", "managed", "managed_caching", "host",
+                or "mtia".  The comparison is exact (case-sensitive).
+
+        Returns:
+            The `EmbeddingLocation` member registered under `key`.
+
+        Raises:
+            ValueError: If `key` is not one of the recognized names.
+        """
+        lookup = {
+            "device": EmbeddingLocation.DEVICE,
+            "managed": EmbeddingLocation.MANAGED,
+            "managed_caching": EmbeddingLocation.MANAGED_CACHING,
+            "host": EmbeddingLocation.HOST,
+            "mtia": EmbeddingLocation.MTIA,
+        }
+        # Compare against None explicitly: enum members may be falsy integers
+        location = lookup.get(key)
+        if location is None:
+            raise ValueError(f"Cannot parse value into EmbeddingLocation: {key}")
+        return location
 
 
 class CacheAlgorithm(enum.Enum):
 
@@ -19,12 +19,17 @@
     benchmark_requests_refer,
     benchmark_vbe,
 )
-from .config import TBEDataConfig  # noqa F401
-from .config_loader import TBEDataConfigLoader  # noqa F401
-from .config_param_models import BatchParams, IndicesParams, PoolingParams  # noqa F401
+from .embedding_ops_common_config import EmbeddingOpsCommonConfigLoader  # noqa F401
 from .eval_compression import (  # noqa F401
     benchmark_eval_compression,
     EvalCompressionBenchmarkOutput,
 )
 from .reporter import BenchmarkReporter  # noqa F401
+from .tbe_data_config import TBEDataConfig  # noqa F401
+from .tbe_data_config_loader import TBEDataConfigLoader  # noqa F401
+from .tbe_data_config_param_models import (  # noqa F401
+    BatchParams,
+    IndicesParams,
+    PoolingParams,
+)
 from .utils import fill_random_scale_bias  # noqa F401
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import dataclasses
+from typing import Any, Dict, Optional
+
+import click
+import torch
+from fbgemm_gpu.split_embedding_configs import SparseType
+from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
+    BoundsCheckMode,
+    EmbeddingLocation,
+    PoolingMode,
+    str_to_pooling_mode,
+)
+
+
+@dataclasses.dataclass(frozen=True)
+class EmbeddingOpsCommonConfig:
+    """
+    Immutable bundle of embedding-op settings shared across benchmarks.
+
+    Fields are declared in the order expected by positional construction, so
+    the declaration order below must not be rearranged.
+    """
+
+    # Data type of the embedding weights
+    weights_dtype: SparseType
+    # Data type of the embedding cache (may be None)
+    cache_dtype: Optional[SparseType]
+    # Data type of the embedding output
+    output_dtype: SparseType
+    # Whether stochastic rounding is enabled when quantizing
+    stochastic_rounding: bool
+    # Which pooling operation to apply
+    pooling_mode: PoolingMode
+    # Whether host-mapped UVM buffers are used
+    uvm_host_mapped: bool
+    # Where the embeddings are placed in memory
+    embedding_location: EmbeddingLocation
+    # Which bounds check mode to apply
+    bounds_check_mode: BoundsCheckMode
+
+    # pyre-ignore [3]
+    def validate(self):
+        # No checks are performed at present; the instance is handed back as-is
+        # so that construction and validation can be chained in one expression.
+        return self
+
+    def split_args(self) -> Dict[str, Any]:
+        """
+        Build the keyword arguments derived from this config.
+
+        The `cache_dtype` and `embedding_location` fields are not part of the
+        returned mapping.
+
+        Returns:
+            A new dict keyed by argument name.
+        """
+        return dict(
+            weights_precision=self.weights_dtype,
+            stochastic_rounding=self.stochastic_rounding,
+            output_dtype=self.output_dtype,
+            pooling_mode=self.pooling_mode,
+            bounds_check_mode=self.bounds_check_mode,
+            uvm_host_mapped=self.uvm_host_mapped,
+        )
+
+
+class EmbeddingOpsCommonConfigLoader:
+    """
+    Helper that attaches the `--emb-*` command line options to a click command
+    and turns the parsed values into an `EmbeddingOpsCommonConfig`.
+    """
+
+    @classmethod
+    # pyre-ignore [2]
+    def options(cls, func) -> click.Command:
+        """
+        Decorate `func` with the common embedding-op click options.
+
+        Args:
+            func: The callable (or click command) to decorate.
+
+        Returns:
+            `func` wrapped by every option declared below.
+        """
+        options = [
+            click.option(
+                "--emb-weights-dtype",
+                type=SparseType,
+                default=SparseType.FP32,
+                help="Precision of the embedding weights",
+            ),
+            click.option(
+                "--emb-cache-dtype",
+                type=SparseType,
+                default=None,
+                help="Precision of the embedding cache",
+            ),
+            click.option(
+                "--emb-output-dtype",
+                type=SparseType,
+                default=SparseType.FP32,
+                help="Precision of the embedding output",
+            ),
+            click.option(
+                "--emb-stochastic-rounding",
+                is_flag=True,
+                default=False,
+                help="Enable stochastic rounding when performing quantization",
+            ),
+            click.option(
+                "--emb-pooling-mode",
+                type=click.Choice(["sum", "mean", "none"], case_sensitive=False),
+                default="sum",
+                help="Pooling operation to perform",
+            ),
+            click.option(
+                "--emb-uvm-host-mapped",
+                is_flag=True,
+                default=False,
+                help="Use host-mapped UVM buffers",
+            ),
+            click.option(
+                "--emb-location",
+                default="device",
+                type=click.Choice(
+                    ["device", "managed", "managed_caching"], case_sensitive=False
+                ),
+                help="Memory location of the embeddings",
+            ),
+            click.option(
+                "--emb-bounds-check",
+                type=int,
+                default=BoundsCheckMode.WARNING.value,
+                # Adjacent literals are concatenated verbatim, so the first
+                # fragment must end with its own separator.
+                help="Bounds check mode. "
+                f"Available modes: FATAL={BoundsCheckMode.FATAL.value}, "
+                f"WARNING={BoundsCheckMode.WARNING.value}, "
+                f"IGNORE={BoundsCheckMode.IGNORE.value}, "
+                f"NONE={BoundsCheckMode.NONE.value}",
+            ),
+        ]
+
+        # Apply last-to-first so the result equals stacking the options as
+        # decorators in the order they are listed above.
+        for option in reversed(options):
+            func = option(func)
+        return func
+
+    @classmethod
+    def load(cls, context: click.Context) -> EmbeddingOpsCommonConfig:
+        """
+        Build an `EmbeddingOpsCommonConfig` from parsed click parameters.
+
+        Args:
+            context (click.Context): Context whose `params` hold the values of
+                the options registered by `options()`.
+
+        Returns:
+            The populated config, after `validate()` has been called on it.
+
+        Raises:
+            KeyError: If one of the expected `emb_*` entries is absent from
+                `context.params`.
+            ValueError: If the location string cannot be parsed by
+                `EmbeddingLocation.from_str`.
+        """
+        params = context.params
+
+        weights_dtype = params["emb_weights_dtype"]
+        cache_dtype = params["emb_cache_dtype"]
+        output_dtype = params["emb_output_dtype"]
+        stochastic_rounding = params["emb_stochastic_rounding"]
+        pooling_mode = str_to_pooling_mode(str(params["emb_pooling_mode"]))
+        uvm_host_mapped = params["emb_uvm_host_mapped"]
+        # The option is parsed as a plain int; convert it to the enum here
+        bounds_check_mode = BoundsCheckMode(params["emb_bounds_check"])
+
+        embedding_location = EmbeddingLocation.from_str(str(params["emb_location"]))
+        # Without CUDA there is no device memory to place the tables in, so a
+        # DEVICE request is redirected to HOST.
+        if (
+            embedding_location is EmbeddingLocation.DEVICE
+            and not torch.cuda.is_available()
+        ):
+            embedding_location = EmbeddingLocation.HOST
+
+        # Keyword arguments keep this call independent of the field order
+        return EmbeddingOpsCommonConfig(
+            weights_dtype=weights_dtype,
+            cache_dtype=cache_dtype,
+            output_dtype=output_dtype,
+            stochastic_rounding=stochastic_rounding,
+            pooling_mode=pooling_mode,
+            uvm_host_mapped=uvm_host_mapped,
+            embedding_location=embedding_location,
+            bounds_check_mode=bounds_check_mode,
+        ).validate()