@@ -236,7 +236,7 @@ def __call__(self, inputs, encoder_mask=None, dropout_rate=None):

     # MLP block.
     y = nn.LayerNorm(dtype=cfg.dtype)(x) if pre_ln else x
-    y = MlpBlock(config=cfg)(y)
+    y = MlpBlock(config=cfg)(y, dropout_rate=dropout_rate)

     return x + y if pre_ln else nn.LayerNorm(dtype=cfg.dtype)(x + y)
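Note on the MlpBlock call above (and the matching one in the next hunk): it assumes MlpBlock.__call__ now accepts an optional dropout_rate argument. The block itself is outside this section, so the following is only a minimal sketch of how such a block might resolve the override, assuming it falls back to cfg.dropout_rate when the caller passes None; field names such as mlp_dim and deterministic are the usual Flax Transformer config fields and are assumptions here.

# Minimal sketch only: the real MlpBlock is not shown in this diff. The
# fallback to cfg.dropout_rate when dropout_rate is None is an assumption,
# as are the config field names (mlp_dim, dtype, deterministic, dropout_rate).
from typing import Any
import flax.linen as nn

class MlpBlock(nn.Module):
  config: Any  # assumed to be the module's TransformerConfig

  @nn.compact
  def __call__(self, inputs, dropout_rate=None):
    cfg = self.config
    if dropout_rate is None:
      dropout_rate = cfg.dropout_rate  # fall back to the configured rate
    x = nn.Dense(cfg.mlp_dim, dtype=cfg.dtype)(inputs)
    x = nn.relu(x)
    x = nn.Dropout(rate=dropout_rate)(x, deterministic=cfg.deterministic)
    output = nn.Dense(inputs.shape[-1], dtype=cfg.dtype)(x)
    return nn.Dropout(rate=dropout_rate)(output, deterministic=cfg.deterministic)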
@@ -324,7 +324,7 @@ def __call__(

     # MLP block.
     z = nn.LayerNorm(dtype=cfg.dtype)(y) if pre_ln else y
-    z = MlpBlock(config=cfg)(z)
+    z = MlpBlock(config=cfg)(z, dropout_rate=dropout_rate)

     return y + z if pre_ln else nn.LayerNorm(dtype=cfg.dtype)(y + z)
@@ -382,7 +382,7 @@ def __call__(

     # Input Encoder
     for lyr in range(cfg.num_layers):
-      x = Encoder1DBlock(config=cfg, name=f"encoderblock_{lyr}")(x, encoder_mask)
+      x = Encoder1DBlock(config=cfg, name=f"encoderblock_{lyr}")(x, encoder_mask, dropout_rate)

     encoded = (
         nn.LayerNorm(dtype=cfg.dtype, name="encoder_layernorm")(x)
@@ -464,6 +464,7 @@ def __call__(
           encoded,
           decoder_mask=decoder_mask,
           encoder_decoder_mask=encoder_decoder_mask,
+          dropout_rate=dropout_rate,
       )
     y = (
         nn.LayerNorm(dtype=cfg.dtype, name="encoderdecoder_layernorm")(y)
@@ -503,7 +504,7 @@ def setup(self):
     self.encoder = Encoder(config=cfg, shared_embedding=self.shared_embedding)
     self.decoder = Decoder(config=cfg, shared_embedding=self.shared_embedding)

-  def encode(self, inputs, inputs_positions=None, inputs_segmentation=None):
+  def encode(self, inputs, inputs_positions=None, inputs_segmentation=None, dropout_rate=None):
     """Applies Transformer encoder-branch on the inputs.

     Args:
@@ -528,7 +529,7 @@ def encode(self, inputs, inputs_positions=None, inputs_segmentation=None):
             jnp.equal,
             dtype=cfg.dtype))
     return self.encoder(
-        inputs, inputs_positions=inputs_positions, encoder_mask=encoder_mask)
+        inputs, inputs_positions=inputs_positions, encoder_mask=encoder_mask, dropout_rate=dropout_rate)

   def decode(
       self,
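Since encode now exposes dropout_rate as a keyword, the encoder branch can also be driven on its own through Flax's method= argument to apply. A hedged usage sketch follows; the config object, token shapes, and the 0.2 rate are illustrative assumptions, not values from this PR.

# Usage sketch (assumptions: cfg is a valid config for this module and dummy
# integer token ids are acceptable inputs).
import jax
import jax.numpy as jnp

rng = jax.random.PRNGKey(0)
params_rng, dropout_rng = jax.random.split(rng)

inputs = jnp.ones((2, 16), jnp.int32)    # dummy source token ids
targets = jnp.ones((2, 16), jnp.int32)   # dummy target token ids

model = Transformer(config=cfg)
variables = model.init({'params': params_rng, 'dropout': dropout_rng},
                       inputs, targets)

# Run only the encoder branch, overriding dropout at call time.
encoded = model.apply(
    variables,
    inputs,
    dropout_rate=0.2,                    # new keyword threaded by this change
    method=Transformer.encode,
    rngs={'dropout': dropout_rng})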
@@ -595,7 +596,8 @@ def __call__(self,
                inputs_positions=None,
                targets_positions=None,
                inputs_segmentation=None,
-               targets_segmentation=None):
+               targets_segmentation=None,
+               dropout_rate=None):
     """Applies Transformer model on the inputs.

     Args:
@@ -612,12 +614,14 @@ def __call__(self,
     encoded = self.encode(
         inputs,
         inputs_positions=inputs_positions,
-        inputs_segmentation=inputs_segmentation)
+        inputs_segmentation=inputs_segmentation,
+        dropout_rate=dropout_rate)

     return self.decode(
         encoded,
         inputs,  # only used for masks
         targets,
         targets_positions=targets_positions,
         inputs_segmentation=inputs_segmentation,
-        targets_segmentation=targets_segmentation)
+        targets_segmentation=targets_segmentation,
+        dropout_rate=dropout_rate)