Merged
metaseq/cli/train.py (16 additions, 15 deletions)
@@ -315,23 +315,24 @@ def train(
     end_of_epoch = not itr.has_next()
     if end_of_epoch:
         grank = distributed_utils.get_global_rank()
+
+        log_seq = [f"End of Epoch on rank {grank}:"]
+        if hasattr(itr, "sequences_consumed"):
+            log_seq += [f"sequences_consumed={itr.sequences_consumed}"]
+        log_seq += [f"n={itr.n}"]
+
         dataset = epoch_itr.dataset
-        while not hasattr(dataset, "len_cache"):
+        while not hasattr(dataset, "len_cache") and hasattr(dataset, "dataset"):
             dataset = dataset.dataset
-        len_cache = tuple(dataset.len_cache.data)
-        cache_hash = hash(len_cache)
-        contains_zero = any([x == 0 for x in len_cache])
-        logger.warning(
-            " ".join(
-                [
-                    f"End of Epoch on rank {grank}:",
-                    f"sequences_consumed={itr.sequences_consumed}",
-                    f"n={itr.n}",
-                    f"len_cache_hash={cache_hash}",
-                    f"len_cache_has_zeros={contains_zero}",
-                ]
-            )
-        )
+        if hasattr(dataset, "len_cache"):
+            len_cache = tuple(dataset.len_cache.data)
+            cache_hash = hash(len_cache)
+            contains_zero = any([x == 0 for x in len_cache])
+            log_seq += [
+                f"len_cache_hash={cache_hash}",
+                f"len_cache_has_zeros={contains_zero}",
+            ]
+        logger.warning(" ".join(log_seq))
 
     valid_losses, should_stop = validate_and_save(
         cfg,
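Note on this hunk: the new end-of-epoch logging only reports attributes that actually exist, walking nested dataset wrappers until one exposes len_cache or the chain runs out. Below is a minimal standalone sketch of that pattern; Wrapped, Lengths, and log_end_of_epoch are hypothetical stand-ins, not classes or functions from the repo.

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("end_of_epoch_sketch")


class Lengths:
    """Innermost dataset: the only layer that carries a length cache."""
    def __init__(self):
        self.len_cache = [3, 0, 7]  # toy cache; a zero would signal a suspicious entry


class Wrapped:
    """Outer wrapper that only forwards to .dataset, like the epoch iterator's dataset."""
    def __init__(self, dataset):
        self.dataset = dataset


def log_end_of_epoch(n, dataset, rank=0):
    log_seq = [f"End of Epoch on rank {rank}:", f"n={n}"]
    # Walk wrappers until a len_cache shows up or there is nothing left to unwrap.
    while not hasattr(dataset, "len_cache") and hasattr(dataset, "dataset"):
        dataset = dataset.dataset
    if hasattr(dataset, "len_cache"):
        len_cache = tuple(dataset.len_cache)
        log_seq += [
            f"len_cache_hash={hash(len_cache)}",
            f"len_cache_has_zeros={any(x == 0 for x in len_cache)}",
        ]
    logger.warning(" ".join(log_seq))


log_end_of_epoch(n=128, dataset=Wrapped(Lengths()))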
metaseq/distributed/stitch_fsdp_ckpt.py (12 additions, 2 deletions)
@@ -36,6 +36,7 @@ def consolidate_fsdp_shards(
     new_arch_name=None,
     no_stitch_megatron=False,
     megatron_part=None,
+    is_ema=False,
 ) -> str:
     if pth_prefix.endswith(".pt"):
         pth_prefix = pth_prefix[:-3]
@@ -68,7 +69,16 @@ def consolidate_fsdp_shards(
             expert_dest_paths.append(f"{save_prefix}-rank-{r}.pt")
         else:
             ckpt = load_and_pop_last_optimizer_state(p)
-            weights.append(ckpt["model"])
+            if "ema_fp32_params" in ckpt["extra_state"]:
+                ema_key = "ema_fp32_params"
+            elif "ema" in ckpt["extra_state"]:
+                ema_key = "ema"
+            else:
+                ema_key = None
+            if is_ema and ema_key is not None:
+                weights.append(ckpt["extra_state"][ema_key])
+            else:
+                weights.append(ckpt["model"])
             metadata.append(ckpt["shard_metadata"])
     assert weights, f"all files were considered experts: {all_ckpt_files}"
     do_consolidate = True
@@ -185,7 +195,7 @@ def consolidate_model_parallel(
         all_parts_consolidated[k] = part_weights
     if no_stitch_megatron:
         return all_parts_consolidated
-    # glue to be a single megatron mdoel part
+    # glue to be a single megatron model part
    model = reshard_megatron_parts(all_parts_consolidated, new_model_part_count=1)[0]
    return model
 
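Note on the is_ema path: it prefers the full-precision EMA weights when present, falls back to the "ema" entry, and otherwise uses the regular model weights. A small self-contained sketch of that precedence on a toy checkpoint dict; select_weights and toy_ckpt are illustrative only, not metaseq code.

import torch


def select_weights(ckpt, is_ema):
    # Same fallback order as the diff above: ema_fp32_params > ema > model.
    extra = ckpt.get("extra_state", {})
    if is_ema:
        for key in ("ema_fp32_params", "ema"):
            if key in extra:
                return extra[key]
    return ckpt["model"]


toy_ckpt = {
    "model": {"w": torch.zeros(2)},
    "extra_state": {"ema": {"w": torch.ones(2)}},  # no fp32 EMA in this toy shard
}
print(select_weights(toy_ckpt, is_ema=True))   # picks extra_state["ema"]
print(select_weights(toy_ckpt, is_ema=False))  # picks the regular model weights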
metaseq/models/ema/ema.py (1 addition, 1 deletion)
@@ -76,7 +76,7 @@ def __init__(self, model, config, device=None):
         self.decay = config.ema_decay
         if isinstance(model, FullyShardedDataParallel):
             self.model = model
-            logger.warning("EMA got FSDP model, assuming assigned model is a " "copy")
+            logger.info("EMA got FSDP model, assuming assigned model is a " "copy")
         else:
             self.model = copy.deepcopy(model)
         self.model.requires_grad_(False)
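For context on why a frozen copy matters here: an EMA wrapper keeps shadow parameters that are blended toward the live model after each update rather than trained directly. A generic sketch of that update follows; ema_step is an illustration of the usual rule, not metaseq's actual step implementation.

import copy

import torch
import torch.nn as nn


def ema_step(ema_model, model, decay):
    # p_ema <- decay * p_ema + (1 - decay) * p, applied in place without gradients.
    with torch.no_grad():
        for p_ema, p in zip(ema_model.parameters(), model.parameters()):
            p_ema.mul_(decay).add_(p, alpha=1.0 - decay)


model = nn.Linear(4, 4)
ema_model = copy.deepcopy(model).requires_grad_(False)  # frozen shadow copy, as in the diff
ema_step(ema_model, model, decay=0.999)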
metaseq/optim/base_optimizer.py (3 additions, 1 deletion)
@@ -108,7 +108,9 @@ def multiply_grads(self, c):
                 c = c.to(p.grad.device)
             p.grad.data.mul_(c)
 
-    def clip_grad_norm(self, max_norm, norm_type="l2", aggregate_norm_fn=None):
+    def clip_grad_norm(
+        self, max_norm, norm_type="l2", aggregate_norm_fn=None, **kwargs
+    ):
         """Clips gradient norm."""
         return utils.clip_grad_norm_(
             self.params, max_norm, norm_type, aggregate_norm_fn
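A note on the widened signature: accepting **kwargs lets one call site pass clipping options to every optimizer class, and implementations that do not recognize an option simply ignore it. Below is a toy sketch of that pattern; BaseOpt, FancyOpt, and the extra_option keyword are made up for illustration and are not metaseq API.

class BaseOpt:
    def clip_grad_norm(self, max_norm, norm_type="l2", aggregate_norm_fn=None, **kwargs):
        # The base implementation tolerates and ignores options it does not use.
        return f"base clip to {max_norm} ({norm_type})"


class FancyOpt(BaseOpt):
    def clip_grad_norm(self, max_norm, norm_type="l2", aggregate_norm_fn=None, **kwargs):
        # A subclass may consume an option the base signature never named.
        extra = kwargs.get("extra_option", False)
        return f"fancy clip to {max_norm}, extra_option={extra}"


# One call site works for both optimizers, with no special-casing per class.
for opt in (BaseOpt(), FancyOpt()):
    print(opt.clip_grad_norm(1.0, extra_option=True))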
metaseq/scripts/reshard_fsdp.py (1 addition, 1 deletion)
@@ -220,7 +220,7 @@ def reshard_fsdp_optim_state(
                 [_maybe_type(s["state"][idx][key], dtype) for s in shard_optim_states]
             )
             unpadded_value = _unpad_tensor(
-                tensor=unsharded_value,
+                shard=unsharded_value,
                 pad=shard_optim_padding.get(key, 0) if shard_optim_padding else 0,
             )
             chunks, _ = _shard_and_pad_tensor(unpadded_value, num_output_shards)
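The keyword rename matches the callee's parameter name; the helper presumably strips the zero padding that was appended to give every shard equal length. A hedged sketch of what such a pad/unpad pair might look like; these _sketch functions are illustrations only, not the actual _unpad_tensor and _shard_and_pad_tensor from the script.

import torch


def _unpad_tensor_sketch(shard: torch.Tensor, pad: int) -> torch.Tensor:
    # Drop the trailing padding that was added to equalize shard sizes.
    return shard[: shard.numel() - pad] if pad > 0 else shard


def _shard_and_pad_tensor_sketch(tensor: torch.Tensor, num_shards: int):
    # Split a flat tensor into num_shards equal chunks, zero-padding the tail.
    chunk_size = -(-tensor.numel() // num_shards)  # ceiling division
    pad = chunk_size * num_shards - tensor.numel()
    padded = torch.cat([tensor, tensor.new_zeros(pad)])
    return list(padded.split(chunk_size)), pad


flat = torch.arange(10.0)
chunks, pad = _shard_and_pad_tensor_sketch(flat, num_shards=4)
restored = _unpad_tensor_sketch(torch.cat(chunks), pad=pad)
assert torch.equal(restored, flat)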
setup.py (1 addition, 1 deletion)
@@ -196,7 +196,7 @@ def do_setup(package_data):
             "albumentations",
             "dalle_pytorch",
             "einops",
-            "matplotlib",
+            "matplotlib==3.5.0",
             "pytorchvideo==0.1.5",
             "wandb",
             "webdataset==0.1.103",