
Commit 3292265

Hilly12 authored and recml authors committed
Refactor a few APIs.
Notably this removes the `rng` argument from `JaxTrainer` to avoid implicitly passing it.

PiperOrigin-RevId: 789073073
1 parent 847628b · commit 3292265

File tree: 10 files changed (+136, −127 lines)

recml/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -38,3 +38,4 @@
 from recml.core.utils.types import Factory
 from recml.core.utils.types import FactoryProtocol
 from recml.core.utils.types import ObjectFactory
+from recml.layers.common import EmbeddingSpec
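
With this re-export in place, the spec is importable straight from the top-level package. A minimal sketch (the aliased second import is only there to show both paths resolve to the same class):

  # Both import paths now resolve to the same class.
  from recml import EmbeddingSpec
  from recml.layers.common import EmbeddingSpec as CommonEmbeddingSpec

  assert EmbeddingSpec is CommonEmbeddingSpec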

recml/core/data/tf_dataset_factory.py

Lines changed: 11 additions & 6 deletions
@@ -206,12 +206,13 @@ class TFDatasetFactory(types.Factory[tf.data.Dataset]):
       dataset. Defaults to `ShardingInfo(num_processes=jax.process_count(),
       process_index=jax.process_index())`. This is similar to `InputContext` in
       tensorflow.
+    cache_reading: Whether to cache the reading of the dataset. This is useful
+      for debugging and testing. Defaults to False.
     debug: An optional boolean indicating whether to debug input boundedness. If
       `True`, the dataset will consist of a single batch that's cached and
       infinitely repeated
   """

-  cache_reading: bool = False
   input_path: str | Sequence[str] = ""
   tfds_source: str | Sequence[str] = ""
   file_format: FileFormat = FileFormat.RECORDIO
@@ -246,6 +247,7 @@ class TFDatasetFactory(types.Factory[tf.data.Dataset]):
   sharding_info: DatasetShardingInfo = dataclasses.field(
       default_factory=DatasetShardingInfo
   )
+  cache_reading: bool = False
   debug: bool = False

   def __post_init__(self):
@@ -478,7 +480,7 @@ def _file_group_reader(file_group: str) -> tf.data.Dataset:
       )

     # Generate a tf.Example dataset by cycling through all uris in parallel.
-    return dataset.interleave(
+    dataset = dataset.interleave(
        map_func=reader,
        cycle_length=self.cycle_length,
        block_length=self.block_length,
@@ -490,6 +492,12 @@ def _file_group_reader(file_group: str) -> tf.data.Dataset:
        deterministic=self.deterministic,
     )

+    # Cache the reading of examples from files.
+    if self.cache_reading:
+      dataset = dataset.cache()
+
+    return dataset
+
   def _parse_dataset(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
     """Batches and parses an examples dataset."""
     # Batch the dataset to the global or per replica batch size.
@@ -556,10 +564,7 @@ def _maybe_apply_tf_data_service(
   def make(self) -> tf.data.Dataset:
     """Creates a `tf.data.Dataset` instance with all dataset ops applied."""
     # Create an examples dataset.
-    if self.cache_reading:
-      dataset = self._create_dataset().cache()
-    else:
-      dataset = self._create_dataset()
+    dataset = self._create_dataset()
     # Shuffle and repeat the dataset.
     dataset = self._maybe_shuffle_and_repeat(dataset)
     # Batch and parse the examples dataset.
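
The net effect: when `cache_reading` is enabled, caching now happens immediately after the interleaved read of raw examples, instead of wrapping `_create_dataset()` inside `make()`. A standalone tf.data sketch of that ordering (not the factory itself; the file glob and parameter values are placeholders):

  import tensorflow as tf

  # Read raw records by interleaving over files, then cache before any
  # shuffling or batching so repeated epochs skip the file reads.
  files = tf.data.Dataset.list_files("/tmp/data/*.tfrecord", shuffle=False)  # placeholder glob
  dataset = files.interleave(
      tf.data.TFRecordDataset,
      cycle_length=4,
      num_parallel_calls=tf.data.AUTOTUNE,
      deterministic=True,
  )
  dataset = dataset.cache()  # corresponds to the new `cache_reading` behavior
  dataset = dataset.shuffle(1024).repeat().batch(32)  # downstream ops stay after the cache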

recml/core/ops/hstu_ops.py

Lines changed: 8 additions & 8 deletions
@@ -125,9 +125,9 @@ def _apply_mask(
   masks = []
   if mask_ref is not None:
     if k_in_lanes:
-      mask = pl.load(mask_ref, (slice(None), k_slice))
+      mask = mask_ref[:, k_slice]
     else:
-      mask = pl.load(mask_ref, (k_slice, slice(None)))
+      mask = mask_ref[k_slice, :]

     snm = jnp.where(should_not_mask, 1, 0)
     masks.append(jnp.bitwise_or(mask, jnp.broadcast_to(snm, mask.shape)) != 0)
@@ -156,7 +156,7 @@ def _apply_mask(
       k_sequence = k_offset + jax.lax.broadcasted_iota(
           jnp.int32, (k_slice.size, bq), 0
       )
-      q_sequence = pl.load(q_sequence_ref, (pl.ds(1), slice(None)))  # [1, bq]
+      q_sequence = q_sequence_ref[:1, :]  # [1, bq]
       q_sequence = jnp.broadcast_to(q_sequence, (k_slice.size, bq))

       assert q_sequence.shape == k_sequence.shape
@@ -170,7 +170,7 @@ def _apply_mask(

   if q_segment_ids_ref is not None:
     if k_in_lanes:
-      kv_ids = pl.load(kv_segment_ids_ref, (pl.ds(1), k_slice))  # [1, k_slice]
+      kv_ids = kv_segment_ids_ref[:1, k_slice]  # [1, k_slice]
       repeats, rem = divmod(kv_ids.shape[1], NUM_LANES)
       if rem:
         raise NotImplementedError(f"block_kv must be a multiple of {NUM_LANES}")
@@ -181,9 +181,9 @@ def _apply_mask(
       if rem:
         raise NotImplementedError(f"block_q must be a multiple of {NUM_LANES}")
       kv_ids = pltpu.repeat(
-          pl.load(kv_segment_ids_ref, (k_slice, slice(None))), repeats, axis=1
+          kv_segment_ids_ref[k_slice, :], repeats, axis=1
      )  # [k_slice, bq]
-      q_ids = pl.load(q_segment_ids_ref, (pl.ds(1), slice(None)))  # [1, bq]
+      q_ids = q_segment_ids_ref[:1, :]  # [1, bq]
    masks.append(q_ids == kv_ids)

   if masks:
@@ -228,7 +228,7 @@ def body(kv_compute_index, _):
    slice_k = pl.ds(kv_compute_index * bkv_compute, bkv_compute)

    q = q_ref[...]
-    k = pl.load(k_ref, (slice_k, slice(None)))
+    k = k_ref[slice_k, :]
    qk = jax.lax.dot_general(
        q, k, NT_DIM_NUMBERS, preferred_element_type=jnp.float32
    )
@@ -256,7 +256,7 @@ def body(kv_compute_index, _):
    )

    sv_dims = NN_DIM_NUMBERS
-    v = pl.load(v_ref, (slice_k, slice(None)))
+    v = v_ref[slice_k, :]

    to_float32 = lambda x: x.astype(jnp.float32)
    v = to_float32(v)
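
These edits replace `pl.load(ref, indexer)` with direct ref indexing, which Pallas supports with the same slicing semantics (`pl.ds` still works inside the subscript). A small standalone kernel sketch, not from this file, showing the two styles side by side; the kernel name, shapes, and values are illustrative:

  import jax
  import jax.numpy as jnp
  from jax.experimental import pallas as pl

  def copy_rows_kernel(x_ref, o_ref):
    rows = pl.ds(0, o_ref.shape[0])
    via_load = pl.load(x_ref, (rows, slice(None)))  # older explicit-load style
    via_index = x_ref[rows, :]                      # direct indexing, as adopted here
    # The two reads return the same block, so this is just a copy.
    o_ref[...] = jnp.minimum(via_load, via_index)

  x = jnp.arange(64.0).reshape(8, 8)
  out = pl.pallas_call(
      copy_rows_kernel,
      out_shape=jax.ShapeDtypeStruct((4, 8), x.dtype),
      interpret=True,  # interpret mode so the sketch runs off-TPU
  )(x)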

recml/core/training/core.py

Lines changed: 29 additions & 25 deletions
@@ -57,6 +57,14 @@
 class Trainer(abc.ABC, Generic[TaskT]):
   """A base trainer interface for training and evaluation."""

+  class Mode(enum.StrEnum):
+    """Mode to run an experiment."""
+
+    TRAIN = "train"
+    EVAL = "eval"
+    TRAIN_AND_EVAL = "train_and_eval"
+    CONTINUOUS_EVAL = "continuous_eval"
+
   @abc.abstractmethod
   def __init__(self, model_dir: str, *args, **kwargs):
     """Initializes the instance."""
@@ -77,6 +85,23 @@ def train_and_evaluate(self, task: TaskT, *args, **kwargs) -> Logs | None:
   def evaluate_continuously(self, task: TaskT, *args, **kwargs) -> Logs | None:
     """Performs continuous evaluation until a condition is met."""

+  def run(self, task: TaskT, mode: Any) -> Logs | None:
+    """Runs the experiment in the given mode."""
+    if mode == Trainer.Mode.TRAIN_AND_EVAL:
+      return self.train_and_evaluate(task)
+    elif mode == Trainer.Mode.TRAIN:
+      return self.train(task)
+    elif mode == Trainer.Mode.EVAL:
+      return self.evaluate(task)
+    elif mode == Trainer.Mode.CONTINUOUS_EVAL:
+      return self.evaluate_continuously(task)
+    else:
+      raise ValueError(f"The job mode provided is not supported: {mode}.")
+
+  @classmethod
+  def setup(cls):
+    """Sets up the trainer before it is instantiated."""
+

 @dataclasses.dataclass(frozen=True)
 class Experiment(Generic[TaskT]):
@@ -90,32 +115,13 @@ class Experiment(Generic[TaskT]):
     trainer: The trainer to use for the experiment.
   """

-  class Mode(enum.StrEnum):
-    """Mode to run an experiment."""
-
-    TRAIN = "train"
-    EVAL = "eval"
-    TRAIN_AND_EVAL = "train_and_eval"
-    CONTINUOUS_EVAL = "continuous_eval"
-
   task: TaskT
   trainer: Trainer[TaskT]


-def run_experiment(
-    experiment: Experiment, mode: Experiment.Mode
-) -> Logs | None:
+def run_experiment(experiment: Experiment, mode: Any) -> Logs | None:
   """Runs an experiment."""
-  if mode == Experiment.Mode.TRAIN_AND_EVAL:
-    return experiment.trainer.train_and_evaluate(experiment.task)
-  elif mode == Experiment.Mode.TRAIN:
-    return experiment.trainer.train(experiment.task)
-  elif mode == Experiment.Mode.EVAL:
-    return experiment.trainer.evaluate(experiment.task)
-  elif mode == Experiment.Mode.CONTINUOUS_EVAL:
-    return experiment.trainer.evaluate_continuously(experiment.task)
-  else:
-    raise ValueError(f"The job mode provided is not supported: {mode}.")
+  experiment.trainer.run(experiment.task, mode)


 def get_iterators(
@@ -161,9 +167,7 @@ def get_iterators(
        k: iterator.TFDatasetIterator(v) for k, v in eval_datasets.items()
    }

-  if not all(
-      isinstance(v, iterator.Iterator) for v in eval_datasets.values()
-  ):
+  if not all(isinstance(v, iterator.Iterator) for v in eval_datasets.values()):
    raise ValueError(
        "Expected all values in the evaluation datasets mapping to be either"
        " `tf.data.Dataset` instances or CLU `DatasetIterator` instances,"
@@ -179,7 +183,7 @@ def get_shape(
   """Gets the shape of a dense / sparse / ragged tensor or tensor spec."""
   if isinstance(x, tf.SparseTensor):
     return [x.shape[0]] + [None for _ in x.shape[1:]]
-  return x.shape.as_list()
+  return x.shape.as_list()  # pylint: disable=attribute-error


 def in_tracing_context() -> bool:
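
With the mode enum and dispatch moved onto `Trainer`, a job can call `trainer.run(task, mode)` directly, and `run_experiment` simply forwards to it. A usage sketch under assumptions: `ToyTrainer` and the string task are hypothetical stand-ins, the import path is inferred from this file's location, and the abstract interface is assumed to be exactly the methods shown in this diff:

  from recml.core.training import core

  class ToyTrainer(core.Trainer):
    """A hypothetical minimal trainer used only to illustrate the new dispatch."""

    def __init__(self, model_dir: str):
      self._model_dir = model_dir

    def train(self, task):
      print("train", task)

    def evaluate(self, task):
      print("evaluate", task)

    def train_and_evaluate(self, task):
      self.train(task)
      self.evaluate(task)

    def evaluate_continuously(self, task):
      self.evaluate(task)

  trainer = ToyTrainer(model_dir="/tmp/exp")
  # Dispatch through the trainer itself...
  trainer.run("toy-task", core.Trainer.Mode.TRAIN_AND_EVAL)
  # ...or through the module-level helper, which now delegates to Trainer.run.
  core.run_experiment(
      core.Experiment(task="toy-task", trainer=trainer), core.Trainer.Mode.EVAL
  )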

recml/core/training/jax_trainer.py

Lines changed: 14 additions & 36 deletions
@@ -45,7 +45,7 @@

 StateT = TypeVar("StateT")
 MetricsT = TypeVar("MetricsT", bound=Mapping[str, clu_metrics.Metric])
-MetaT = TypeVar("MetaT")
+ModelT = TypeVar("ModelT")
 PyTree = Any

@@ -61,7 +61,7 @@ def opt_state(self) -> optax.OptState:
     """Returns the optimizer state."""


-class JaxState(struct.PyTreeNode, Generic[MetaT]):
+class JaxState(struct.PyTreeNode, Generic[ModelT]):
   """A training state for a Jax model created using Flax / Haiku.

   Attributes:
@@ -77,7 +77,7 @@ class JaxState(struct.PyTreeNode, Generic[MetaT]):
     _apply: An optional function that can be used to apply the forward pass of
       the model. For Flax models this is usually set to `model.apply` while for
       Haiku models this is usually set to `transform.apply`.
-    _model: An optional reference to a stateless Flax model for convenience.
+    _model: An optional reference to a model for convenience.
     mutable: A pytree of mutable variables that are used by `apply`.
     meta: Arbitrary metadata that is recorded on the state. This can be useful
       for tracking additional references in the state.
@@ -88,14 +88,14 @@ class JaxState(struct.PyTreeNode, Generic[MetaT]):
   tx: optax.GradientTransformation = struct.field(pytree_node=False)
   opt_state: optax.OptState = struct.field(pytree_node=True)
   mutable: PyTree = struct.field(pytree_node=True, default_factory=dict)
-  meta: MetaT = struct.field(pytree_node=False, default_factory=dict)
+  meta: Any = struct.field(pytree_node=False, default_factory=dict)
   _apply: Callable[..., Any] | None = struct.field(
       pytree_node=False, default_factory=None
   )
-  _model: nn.Module | None = struct.field(pytree_node=False, default=None)
+  _model: ModelT | None = struct.field(pytree_node=False, default=None)

   @property
-  def model(self) -> nn.Module:
+  def model(self) -> ModelT:
     """Returns a reference to the model used to create the state."""
     if self._model is None:
       raise ValueError("No Flax `model` is set on the state.")
@@ -112,7 +112,7 @@ def create(
      cls,
      *,
      apply: Callable[..., Any] | None = None,
-      model: nn.Module | None = None,
+      model: ModelT | None = None,
      params: PyTree,
      tx: optax.GradientTransformation,
      **kwargs,
@@ -123,9 +123,8 @@ def create(
      apply: A function that can be used to apply the forward pass of the model.
        For Flax models this is usually set to `model.apply`. This cannot be set
        along with `model`.
-      model: A reference to a stateless Flax model. This cannot be set along
-        with `apply`. When set the `apply` attribute of the state will be set to
-        `model.apply`.
+      model: A reference to a model. This cannot be set along with `apply`. When
+        set the `apply` attribute of the state will be set to `model.apply`.
      params: A pytree of trainable variables that will be updated by `tx` and
        used in `apply`.
      tx: An optax gradient transformation that will be used to update the
@@ -137,7 +136,7 @@
    """
    if apply is not None and model is not None:
      raise ValueError("Only one of `apply` or `model` can be provided.")
-    elif model is not None:
+    elif model is not None and isinstance(model, nn.Module):
      apply = model.apply

    return cls(
@@ -311,30 +310,26 @@ def create_datasets(self) -> core.DatasetT:
    """

  @abc.abstractmethod
-  def create_state(self, batch: PyTree, rng: jax.Array) -> StateT:
+  def create_state(self, batch: PyTree) -> StateT:
    """Creates the training state.

    Args:
      batch: A pytree of arrays making up a dummy batch for state
        initialization.
-      rng: A prng key that is passed from the trainer to control randomness
-        during variable initialization.

    Returns:
      The state to use for training.
    """

  @abc.abstractmethod
  def train_step(
-      self, batch: PyTree, state: StateT, rng: jax.Array
+      self, batch: PyTree, state: StateT
  ) -> tuple[StateT, Mapping[str, clu_metrics.Metric]]:
    """Updates the training state and accumulates metrics.

    Args:
      batch: A pytree of arrays sampled from the training dataset.
      state: The training state created by `create_state`.
-      rng: A prng key that is passed from the trainer to control randomness
-        during training such as dropout.

    Returns:
      A tuple[state, metrics] where the state is the updated training state
@@ -396,8 +391,6 @@ def __init__(
      checkpoint_interval: int | None = None,
      max_checkpoints_to_keep: int = 5,
      continuous_eval_timeout: int = 30,
-      rng_seed: int = core.DEFAULT_RNG_SEED,
-      rng_impl: str | None = None,
  ):
    """Initializes the instance.

@@ -431,11 +424,6 @@ def __init__(
        checkpoint before timing out during continuous evaluation. When a
        timeout happens, the job will check for a marker file on disk and if it
        exists, it will terminate successfully. Defaults to 30 seconds.
-      rng_seed: The seed to use for the PRNG key. By default this is set to a
-        fixed constant.
-      rng_impl: The implementation of the PRNG key. By default this is set to
-        None which means that the default implementation (generally
-        partitionable threefry) will be used.
    """

    if not isinstance(steps_per_loop, int) or steps_per_loop < 1:
@@ -451,8 +439,6 @@ def __init__(
    self._continuous_eval_timeout = continuous_eval_timeout
    self._checkpoint_interval = checkpoint_interval or steps_per_loop
    self._max_checkpoints_to_keep = max_checkpoints_to_keep
-    self._rng_impl = rng_impl
-    self._rng_seed = rng_seed

  @functools.cached_property
  def checkpoint_manager(self) -> ocp.CheckpointManager:
@@ -610,18 +596,10 @@ def process_task(
  ]:
    """Initializes the objects required for training from the task."""

-    init_rng, step_rng = jax.random.split(
-        jax.random.key(self._rng_seed, impl=self._rng_impl)
-    )
-
-    def _create_state(inputs: PyTree) -> State:
-      return task.create_state(inputs, init_rng)
-
    def _train_step(
        inputs: PyTree, state: State
    ) -> tuple[State, Mapping[str, clu_metrics.Metric]]:
-      rng = jax.random.fold_in(step_rng, state.step)  # pytype: disable=attribute-error
-      state, metrics = task.train_step(inputs, state, rng)
+      state, metrics = task.train_step(inputs, state)
      return state, {**_state_metrics(state), **metrics}

    def _eval_step(
@@ -641,7 +619,7 @@

    sharded_abstract_batch = self._partitioner.shard_inputs(abstract_batch)
    init_fn = self._partitioner.partition_init(
-        _create_state, abstract_batch=sharded_abstract_batch
+        task.create_state, abstract_batch=sharded_abstract_batch
    )
    train_step = self._partitioner.partition_step(_train_step, training=True)
    eval_step = self._partitioner.partition_step(_eval_step)
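
The headline change here is that the trainer no longer constructs and threads a PRNG key into `create_state` and `train_step`; tasks own their randomness instead. A minimal standalone sketch, not recml code, of one way a task might recreate the old behavior by deriving keys from a fixed seed and the step counter (all names and values below are illustrative):

  import jax
  import jax.numpy as jnp

  def create_state(batch):
    init_rng = jax.random.key(0)                       # task-owned seed, formerly supplied by the trainer
    params = jax.random.normal(init_rng, batch.shape)  # stand-in for model.init(...)
    return {"params": params, "step": jnp.int32(0)}

  def train_step(batch, state):
    # Fold the step into a base key so each step sees distinct, reproducible
    # randomness, mirroring the fold_in the trainer used to perform.
    step_rng = jax.random.fold_in(jax.random.key(1), state["step"])
    noise = 0.01 * jax.random.normal(step_rng, batch.shape)
    grads = batch + noise                              # stand-in for a real gradient computation
    return {"params": state["params"] - 1e-3 * grads, "step": state["step"] + 1}

  batch = jnp.ones((4, 4))
  state = create_state(batch)
  state = train_step(batch, state)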
