
Commit 0b9afe8

Authored Jul 1, 2021
Fully Sharded Data Parallel (#3740)
* Implement zero2 and zero3
* Implement overflow syncing.
* Tweak log statements.
* Use free ports rather than random ports
* Refactor test_distributed
* More refactor.
* Fixup checkpoints.
* Get tests working.
* GPU only
* Sigh
* Moar.
* Trying to sync grad norms
* Correctly implement gnorm syncing.
* Update comment.
* Try zero3.
* Okay got zero3 working.
* Refactor.
* Get FSDP Zero3 working, except during validation.
* Check in missing code. Carve out notimplemented.
* Lint.
* Er.
* Add a test to ensure we keep track of zero3 not working.
* Remove debugs, add docstrings, rename variable.
* Silly
* Reviewer comments.
* Lint.
* We disabled zero3 as an option, so don't need the test.
* Bug caught by Kurt.
* Rofl

1 parent 7400795 commit 0b9afe8

12 files changed: +358, -158 lines
 

‎parlai/agents/transformer/modules/decoder.py

+9, -9

@@ -25,6 +25,7 @@
 from parlai.core.opt import Opt
 from parlai.utils.misc import warn_once
 from parlai.utils.torch import PipelineHelper
+from parlai.utils.fsdp import fsdp_wrap


 @swappable(
@@ -277,16 +278,15 @@ def _default(val, default):
     def build_layers(self) -> nn.ModuleList:
         layers = nn.ModuleList()
         for _ in range(self.n_layers):
-            layers.append(
-                self.swappables.layer(
-                    self.opt,
-                    attention_dropout=self.opt.get('attention_dropout', 0.0),
-                    relu_dropout=self.opt.get('relu_dropout', 0.0),
-                    dropout=self.opt.get('dropout', 0.0),
-                    activation=self.activation,
-                    variant=self.variant,
-                )  # type: ignore
+            layer = self.swappables.layer(
+                self.opt,
+                attention_dropout=self.opt.get('attention_dropout', 0.0),
+                relu_dropout=self.opt.get('relu_dropout', 0.0),
+                dropout=self.opt.get('dropout', 0.0),
+                activation=self.activation,
+                variant=self.variant,
             )
+            layers.append(fsdp_wrap(layer))  # type: ignore
         return layers

     def forward_embedding(
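
A note on wrapping at the layer level: fairscale's wrap() only takes effect inside an enable_wrap context, which maybe_fsdp_wrap in parlai/utils/fsdp.py (added later in this diff) sets up, so in a plain DDP or single-GPU run the call above is a no-op and build_layers behaves exactly as before. A small sketch of that behavior, assuming fairscale is installed and no wrap context is active:

    import torch.nn as nn
    from parlai.utils.fsdp import fsdp_wrap

    layer = nn.Linear(8, 8)
    wrapped = fsdp_wrap(layer)
    # outside maybe_fsdp_wrap(opt), wrap() returns the module unchanged, so
    # `wrapped is layer`; inside it (with ddp_backend='zero2' and distributed
    # training initialized), the same call returns a FullyShardedDataParallel
    # wrapper around the layer.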

‎parlai/agents/transformer/modules/encoder.py

+9, -9

@@ -25,6 +25,7 @@
 from parlai.core.opt import Opt
 from parlai.utils.misc import warn_once
 from parlai.utils.torch import PipelineHelper
+from parlai.utils.fsdp import fsdp_wrap


 @swappable(self_attention=MultiHeadAttention, feedforward=TransformerFFN)
@@ -227,16 +228,15 @@ def _default(val, default):
     def build_layers(self) -> nn.ModuleList:
         layers = nn.ModuleList()
         for _ in range(self.n_layers):
-            layers.append(
-                self.swappables.layer(  # type: ignore
-                    self.opt,
-                    attention_dropout=self.opt.get('attention_dropout', 0.0),
-                    relu_dropout=self.opt.get('relu_dropout', 0.0),
-                    dropout=self.dropout_frac,
-                    variant=self.variant,
-                    activation=self.activation,
-                )
+            layer = self.swappables.layer(  # type: ignore
+                self.opt,
+                attention_dropout=self.opt.get('attention_dropout', 0.0),
+                relu_dropout=self.opt.get('relu_dropout', 0.0),
+                dropout=self.dropout_frac,
+                variant=self.variant,
+                activation=self.activation,
             )
+            layers.append(fsdp_wrap(layer))
         return layers

     def forward_embedding(

‎parlai/core/params.py

+10

@@ -772,6 +772,16 @@ def add_distributed_training_args(self):
         grp.add_argument(
             '--distributed-world-size', type=int, help='Number of workers.'
         )
+        grp.add_argument(
+            '--ddp-backend',
+            # TODO: add in zero3. https://github.com/facebookresearch/ParlAI/issues/3753
+            choices=['ddp', 'zero2'],
+            default='ddp',
+            help=(
+                'Distributed backend. Zero2 can be faster but is more experimental. '
+                'DDP is the most tested.'
+            ),
+        )
         return grp

     def add_model_args(self):
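
For context, the test harness later in this diff drives the new flag through ParlaiParser.parse_kwargs and mp_train.launch_and_train. A minimal sketch of launching multiprocess training with the zero2 backend along those lines (the task, model, and file paths are illustrative placeholders, not part of the commit):

    import parlai.scripts.multiprocessing_train as mp_train

    parser = mp_train.setup_args()
    popt = parser.parse_kwargs(
        task='integration_tests:overfit',   # placeholder task
        model='transformer/generator',
        model_file='/tmp/fsdp_model',       # placeholder paths
        dict_file='/tmp/fsdp_model.dict',
        ddp_backend='zero2',                # the new option added above
        fp16=True,
        fp16_impl='safe',
        batchsize=4,
    )
    # with the multiprocessing_train.py change below, omitting the port makes
    # launch_and_train() pick a free one via find_free_port()
    valid_report, test_report = mp_train.launch_and_train(popt)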

‎parlai/core/torch_agent.py

+20, -4

@@ -36,6 +36,7 @@
 from parlai.utils.distributed import is_distributed
 from parlai.utils.misc import AttrDict, warn_once
 from parlai.utils.io import PathManager
+from parlai.utils.fsdp import should_sync_gradnorm, is_fsdp, DEFAULT_DDP_BACKEND
 from parlai.utils.fp16 import (
     SafeFP16Optimizer,
     MemoryEfficientFP16Optimizer,
@@ -1052,7 +1053,9 @@ def init_optim(
         self.optimizer = optim_class(params, **kwargs)
         if self.fp16:
             if self.fp16_impl == 'safe':
-                self.optimizer = SafeFP16Optimizer(self.optimizer)
+                self.optimizer = SafeFP16Optimizer(
+                    self.optimizer, should_sync_gradnorm(opt)
+                )
             else:
                 # Using memory efficient optimizer
                 opt_name = opt['optimizer']
@@ -1064,7 +1067,9 @@ def init_optim(
                         'with Memory Efficient FP16. Please select from among this '
                         f'list:\n{compatible_list}'
                     )
-                self.optimizer = MemoryEfficientFP16Optimizer(self.optimizer)
+                self.optimizer = MemoryEfficientFP16Optimizer(
+                    self.optimizer, should_sync_gradnorm(opt)
+                )

         if is_finetune:
             logging.warning('Detected a fine-tune run. Resetting the optimizer.')
@@ -1969,10 +1974,11 @@ def state_dict(self):
         """
         states = {}
         if hasattr(self, 'model'):  # save model params
-            if hasattr(self.model, 'module'):
-                # did we wrap in a DistributedDataParallel
+            if hasattr(self.model, 'module') and not is_fsdp(self.model):
+                # did we wrap in a DistributedDataParallel or DataParallel
                 states['model'] = self.model.module.state_dict()
             else:
+                # regular model or FSDP
                 states['model'] = self.model.state_dict()

         if hasattr(self, 'optimizer'):
@@ -1992,6 +1998,16 @@ def state_dict(self):

         return states

+    def save_nonprimary(self, path=None):
+        """
+        Save model parameters, when you are working on the non-primary worker.
+
+        For models or optimizers that shard parameters, this ensures we sync.
+        """
+        if self.opt.get('ddp_backend', DEFAULT_DDP_BACKEND) in ('zero2', 'zero3'):
+            # make sure we call the state dict
+            self.state_dict()
+
     def save(self, path=None):
         """
         Save model parameters to path (or default to model_file arg).
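
How save_nonprimary fits into the overall save path: with sharded parameters, state_dict() has to gather the full model across workers, so every rank must reach it even though only the primary writes a file. A schematic of the intended calling pattern, condensed from the save_model change in parlai/scripts/train_model.py later in this diff (agent and path stand in for the trainer's own):

    # all ranks execute this inside the training loop
    if not is_primary_worker():
        # non-primary ranks do no IO, but still join the parameter gather;
        # under zero2/zero3, save_nonprimary() simply calls agent.state_dict()
        agent.save_nonprimary(path)
    else:
        agent.save(path)  # the primary rank gathers and writes the checkpoint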

‎parlai/core/torch_generator_agent.py

+13, -7

@@ -35,6 +35,7 @@
 import parlai.utils.logging as logging
 from parlai.core.metrics import SumMetric, AverageMetric, FairseqBleuMetric
 from parlai.utils.fp16 import FP16SafeCrossEntropy
+import parlai.utils.fsdp as fsdp_utils
 from parlai.utils.torch import (
     neginf,
     total_parameters,
@@ -479,8 +480,10 @@ def __init__(self, opt: Opt, shared=None):
         else:
             # this is not a shared instance of this class, so do full init
             self.criterion = self.build_criterion()
-            # ensure all distributed copies will always be in sync
-            self.model = self.build_model()
+            with fsdp_utils.maybe_fsdp_wrap(opt):
+                self.model = fsdp_utils.fsdp_wrap(self.build_model())
+                if self.fp16 and not fsdp_utils.delay_halving(opt):
+                    self.model = self.model.half()

             # load the block_list for beam search
             self.beam_block_list = self._load_beam_block_list()
@@ -498,16 +501,15 @@ def __init__(self, opt: Opt, shared=None):
                    self.model.cuda()
                self.criterion.cuda()

-            sync_parameters(self.model)
+            if not fsdp_utils.is_fsdp(self.model):
+                sync_parameters(self.model)
+
             train_params = trainable_parameters(self.model)
             total_params = total_parameters(self.model)
             logging.info(
                 f"Total parameters: {total_params:,d} ({train_params:,d} trainable)"
             )

-            if self.fp16:
-                self.model = self.model.half()
-
             if init_model is not None:
                 # load model parameters if available
                 logging.info(f'Loading existing model params from {init_model}')
@@ -530,7 +532,11 @@ def __init__(self, opt: Opt, shared=None):
             logging.warning("Optimizer was reset. Also resetting LR scheduler.")
         self.build_lr_scheduler(states, hard_reset=is_finetune or was_reset)

-        if shared is None and is_distributed():
+        if (
+            shared is None
+            and is_distributed()
+            and opt.get('ddp_backend', fsdp_utils.DEFAULT_DDP_BACKEND) == 'ddp'
+        ):
             device_ids = None if self.model_parallel else [self.opt['gpu']]
             self.model = torch.nn.parallel.DistributedDataParallel(
                 self.model, device_ids=device_ids, broadcast_buffers=False

‎parlai/scripts/multiprocessing_eval.py

+1, -2

@@ -23,7 +23,6 @@
 """

 import torch
-import random
 import os
 import signal
 import parlai.utils.distributed as distributed_utils
@@ -88,7 +87,7 @@ def setup_args(cls):
         return setup_args()

     def run(self):
-        port = random.randint(32000, 48000)
+        port = distributed_utils.find_free_port()
         return launch_and_eval(self.opt, port)

‎parlai/scripts/multiprocessing_train.py

+4, -3

@@ -24,7 +24,6 @@
 """

 import torch
-import random
 import os
 import signal
 import traceback
@@ -55,10 +54,12 @@ def multiprocess_train(
         raise


-def launch_and_train(opt, port):
+def launch_and_train(opt, port=None):
     """
     Perform a fork() to many processes.
     """
+    if port is None:
+        port = distributed_utils.find_free_port()
     # Launch multiple subprocesses
     spawncontext = torch.multiprocessing.start_processes(
         multiprocess_train,
@@ -99,7 +100,7 @@ def setup_args(cls):

     def run(self):
         if self.opt['port'] is None:
-            port = random.randint(32000, 48000)
+            port = None
         else:
             port = self.opt['port']
         return launch_and_train(self.opt, port)

‎parlai/scripts/train_model.py

+17, -20

@@ -442,17 +442,20 @@ def save_model(self, suffix=None):
         """
         Save the model to disk, possibly with a suffix.
         """
-        if not is_primary_worker():
-            # never do IO as a non-primary worker
-            return
-
         if not self.opt.get('model_file'):
             # nothing to save to, just exit
             return

         fn = self.opt['model_file']
         if suffix:
             fn += suffix
+
+        if not is_primary_worker():
+            # never do IO as a non-primary worker
+            if hasattr(self.agent, 'save_nonprimary'):
+                self.agent.save_nonprimary(fn)
+            return
+
         while True:
             # don't ever let a ctrl-c interrupt saving
             try:
@@ -543,7 +546,7 @@ def validate(self):
             )
             self.best_valid = new_valid
             self.impatience = 0
-            if opt.get('model_file') and is_primary_worker():
+            if opt.get('model_file'):
                 logging.info(f"saving best valid model: {opt['model_file']}")
                 self.save_model()
                 self.saved = True
@@ -566,11 +569,7 @@ def validate(self):
         self.validate_time.reset()

         # saving
-        if (
-            opt.get('model_file')
-            and opt.get('save_after_valid')
-            and is_primary_worker()
-        ):
+        if opt.get('model_file') and opt.get('save_after_valid'):
             logging.info(f"saving model checkpoint: {opt['model_file']}.checkpoint")
             self.save_model('.checkpoint')

@@ -720,24 +719,26 @@ def _get_time(self, world: World) -> Tuple[float, float, float]:
             self._total_epochs = self._preempted_epochs + sum(
                 all_gather_list(world.get_total_epochs())
             )
-            train_time, log_time, validate_time = sync_object(
+            train_time, log_time, validate_time, save_time = sync_object(
                 (
                     self.train_time.time(),
                     self.log_time.time(),
                     self.validate_time.time(),
+                    self.save_time.time(),
                 )
             )
         else:
-            train_time, log_time, validate_time = (
+            train_time, log_time, validate_time, save_time = (
                 self.train_time.time(),
                 self.log_time.time(),
                 self.validate_time.time(),
+                self.save_time.time(),
             )
             self._total_epochs = self._preempted_epochs + (
                 num_workers() * world.get_total_epochs()
             )

-        return train_time, log_time, validate_time
+        return train_time, log_time, validate_time, save_time

     def log(self):
         """
@@ -810,7 +811,7 @@ def train_steps(self):
             self._last_log_steps += 1 / self.update_freq

             # the following additionally updates self._total_epochs
-            train_time, log_time, validate_time = self._get_time(world)
+            train_time, log_time, validate_time, save_time = self._get_time(world)
             # get the total training examples done, compute epochs
             exs_per_epoch = world.num_examples()
             self._total_exs = int(np.round(self._total_epochs * exs_per_epoch))
@@ -859,11 +860,7 @@ def train_steps(self):
                     break
                 # make sure metrics are clean before we log
                 world.reset_metrics()
-                if (
-                    self.save_time.time() > self.save_every_n_secs
-                    and opt.get('model_file')
-                    and is_primary_worker()
-                ):
+                if save_time > self.save_every_n_secs and opt.get('model_file'):
                     logging.info(
                         f"saving model checkpoint: {opt['model_file']}.checkpoint"
                     )
@@ -872,7 +869,7 @@ def train_steps(self):
                     self.save_model('.checkpoint')
                     self.save_time.reset()

-        if not self.saved and is_primary_worker():
+        if not sync_object(self.saved):
             # save agent
             self.save_model()

‎parlai/utils/distributed.py

+25

@@ -296,6 +296,19 @@ def distributed_context(
         dist.destroy_process_group()


+def get_dist_group():
+    """
+    Find the default pytorch distributed group.
+
+    Used within FSDP to mark which workers are participating. Important to manually call
+    this because FSDP will cache old groups, but our test suite will instantiate new
+    groups per test.
+    """
+    from torch.distributed.distributed_c10d import _get_default_group
+
+    return _get_default_group()
+
+
 @contextlib.contextmanager
 def slurm_distributed_context(opt):
     """
@@ -346,3 +359,15 @@ def slurm_distributed_context(opt):
     except FileNotFoundError:
         # Slurm is not installed
         raise RuntimeError('SLURM does not appear to be installed.')
+
+
+def find_free_port() -> int:
+    """
+    Find a free port we can bind to locally.
+
+    Credit: https://stackoverflow.com/questions/1365265/on-localhost-how-do-i-pick-a-free-port-number
+    """
+    with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+        s.bind(('', 0))
+        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        return s.getsockname()[1]
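
A quick note on why find_free_port() replaces random.randint(32000, 48000) in the multiprocessing scripts: binding to port 0 makes the OS hand back a port that is actually free, rather than guessing and occasionally colliding. A tiny usage sketch (the init_method string is only an illustration of how such a port is typically consumed by a torch.distributed rendezvous):

    from parlai.utils.distributed import find_free_port

    port = find_free_port()                  # OS-chosen, currently-unused port
    init_method = f'tcp://localhost:{port}'  # illustrative rendezvous address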

‎parlai/utils/fp16.py

+45, -16

@@ -55,27 +55,48 @@ def forward(self, scores, targets):
         )


-def clip_grad_norm(params, max_norm):
+def clip_grad_norm(params, max_norm: float = 0, sync: bool = False):
     """
-    Clips grad norm.
+    Clips grad norms.
+
+    During combination with FSDP, will also ensure that grad norms are aggregated
+    across all workers, since each worker only stores their shard of the
+    gradients.
+
+    :param params:
+        Parameters whose gradients we wish to clip
+    :param max_norm:
+        Maximum norm we wish the gradients to have. If non-positive, then
+        we will not perform clipping.
+    :param sync:
+        Boolean indicating whether we should aggregate across the distributed
+        group. Used only in combination with FSDP.
+
+    :returns:
+        The gradient norm across all parameters, before clipping.
     """
     if isinstance(params, torch.Tensor):
         params = [params]
     # make sure any generators are expanded
     params = list(params)
-    if len(params) == 1:
-        p = params[0].grad
-        grad_norm = torch.norm(p)
-        if grad_norm > max_norm > 0:
-            clip_coef = max_norm / (grad_norm + 1e-6)
-            p.mul_(clip_coef)
-        return grad_norm
-    elif max_norm > 0:
+    # if syncing we need to manually perform the clipping so that we aggregrate
+    # properly
+    if max_norm > 0 and not sync:
         return torch.nn.utils.clip_grad_norm_(params, max_norm)
     else:
-        return torch.sqrt(
-            sum(p.grad.data.norm() ** 2 for p in params if p.grad is not None)
-        )
+        normsq = sum(p.grad.data.norm() ** 2 for p in params if p.grad is not None)
+        if sync:
+            # also need to get the norms from all the other sharded works in FSDP
+            import torch.distributed as dist
+
+            dist.all_reduce(normsq)
+        grad_norm = normsq.sqrt()
+        if max_norm > 0:
+            clip_coef = max_norm / (grad_norm + 1e-6)
+            for p in params:
+                p.grad.detach().mul_(clip_coef)
+
+        return grad_norm


 def has_overflow(grad_norm):
@@ -88,7 +109,7 @@ def has_overflow(grad_norm):


 class SafeFP16Optimizer(torch.optim.Optimizer):
-    def __init__(self, optimizer):
+    def __init__(self, optimizer, aggregate_gnorms=False):
         self.fp16_params = self._get_parameters(optimizer)
         self.fp32_params = self._build_fp32_params(self.fp16_params, flatten=False)
         self.optimizer = optimizer
@@ -103,6 +124,7 @@ def __init__(self, optimizer):

         self.scaler = DynamicLossScaler(2.0 ** 15)
         self.min_loss_scale = 2 ** -5
+        self._aggregate_gnorms = aggregate_gnorms

     @classmethod
     def _get_parameters(cls, optimizer):
@@ -210,7 +232,9 @@ def clip_master_grads(self, max_norm):
         Clips gradient norm and updates dynamic loss scaler.
         """
         self._sync_fp16_grads_to_fp32()
-        grad_norm = clip_grad_norm(self.fp32_params, max_norm)
+        grad_norm = clip_grad_norm(
+            self.fp32_params, max_norm, sync=self._aggregate_gnorms
+        )

         # detect overflow and adjust loss scale
         if self.scaler is not None:
@@ -390,6 +414,7 @@ class MemoryEfficientFP16Optimizer(torch.optim.Optimizer):
     def __init__(
         self,
         init_optimizer: torch.optim.Optimizer,  # type: ignore
+        aggregate_gnorms: bool = False,
         loss_initial_scale: float = 2.0 ** 17,
         min_loss_scale: float = 1e-4,
     ):
@@ -398,6 +423,8 @@ def __init__(
         self.min_loss_scale = min_loss_scale
         self.scaler = DynamicLossScaler(init_scale=loss_initial_scale)

+        self._aggregate_gnorms = aggregate_gnorms
+
     @staticmethod
     def compatible_optimizers():
         """
@@ -446,7 +473,9 @@ def clip_master_grads(self, gradient_clip):
         Returns -1 if the most recently computed gradients overflowed.
         """
         self._unscale_grads()
-        grad_norm = clip_grad_norm(self.params, gradient_clip)
+        grad_norm = clip_grad_norm(
+            self.params, gradient_clip, sync=self._aggregate_gnorms
+        )
         # detect overflow and adjust loss scale
         overflow = has_overflow(grad_norm)
         self.scaler.update_scale(overflow)
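
The arithmetic behind sync=True in clip_grad_norm above: with FSDP each worker holds only a shard of every gradient, so the global norm is the square root of the all-reduced sum of per-shard squared norms. A single-process sketch of that identity (the shard tensors are made up for illustration):

    import torch

    shard0 = torch.tensor([3.0, 0.0])    # gradient elements held by worker 0
    shard1 = torch.tensor([0.0, 4.0])    # gradient elements held by worker 1

    local_sq = torch.stack([shard0.norm() ** 2, shard1.norm() ** 2])
    global_norm = local_sq.sum().sqrt()  # 5.0, the norm of the full gradient [3, 0, 0, 4]

    # in the real code the sum over shards is done with dist.all_reduce(normsq),
    # and the same synced norm feeds fp16 overflow detection via has_overflow()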

‎parlai/utils/fsdp.py

+111 (new file)

#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

"""
Utility functions for FullyShardedDataParallel.
"""

import contextlib
import torch.nn
from parlai.utils.distributed import is_distributed, get_dist_group

try:
    from fairscale.nn.wrap.auto_wrap import wrap
    from fairscale.nn.wrap.auto_wrap import enable_wrap as fairscale_enable_wrap
    from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP

    FSDP_AVAILABLE = True
except ImportError:
    FSDP_AVAILABLE = False

    def wrap(module, **kwargs):
        return module


DEFAULT_DDP_BACKEND = "ddp"


def is_fsdp(module: torch.nn.Module):
    """
    Checks whether a module is fully sharded.
    """
    return FSDP_AVAILABLE and isinstance(module, FSDP)


def should_use_fsdp(opt):
    return (
        FSDP_AVAILABLE
        and is_distributed()
        and opt.get('ddp_backend', DEFAULT_DDP_BACKEND) in ('zero2', 'zero3')
    )


@contextlib.contextmanager
def maybe_fsdp_wrap(opt):
    """
    Context manager for enabling wrapping in FullyShardedDataParallel.
    """
    if not should_use_fsdp(opt):
        # make a no-op
        yield
        return

    # zero3 not supported at this time. Throw an exception
    if opt['ddp_backend'] == 'zero3':
        raise NotImplementedError(
            '--ddp-backend zero3 is not supported at this time. For details, see '
            'https://github.com/facebookresearch/ParlAI/issues/3753.'
        )

    reshard_after_forward = opt['ddp_backend'] == 'zero3'
    compute_dtype = torch.float16 if opt['fp16'] else torch.float32
    mixed_precision = opt['fp16'] and opt['fp16_impl'] == 'safe'
    fsdp_args = dict(
        reshard_after_forward=reshard_after_forward,
        mixed_precision=mixed_precision,
        compute_dtype=compute_dtype,
        state_dict_device=torch.device('cpu'),
        flatten_parameters=True,
        process_group=get_dist_group(),
    )
    with fairscale_enable_wrap(wrapper_cls=FSDP, **fsdp_args):
        yield


def delay_halving(opt):
    """
    Check whether we should keep the model in fp32 before other setup.

    When using Zero2 or Zero3 backends with mixed precision, we need to avoid converting
    the model to fp16, as the FSDP module does this for us.

    If we are using just plain DDP or MemoryEfficient optimizers, then we want
    to call half() early.
    """

    return opt['fp16'] and should_use_fsdp(opt) and opt['fp16_impl'] == 'safe'


def should_sync_gradnorm(opt):
    """
    Indicates whether fp16 optimizer wrappers should accumulate over workers.

    FP16 overflow detection and gradient clipping both require accumulating gradients
    across all workers when using FSDP, as workers only store a fraction of the
    gradients.
    """
    return (
        FSDP_AVAILABLE
        and opt['fp16']
        and opt.get('ddp_backend', DEFAULT_DDP_BACKEND) in ('zero2', 'zero3')
    )


def fsdp_wrap(module):
    """
    Helper function for wrapping the outermost root module.
    """
    return wrap(module)
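
Putting this module's pieces together: the intended call pattern (as used in TorchGeneratorAgent earlier in this diff) is to build the model inside maybe_fsdp_wrap, wrap the root module with fsdp_wrap, and let delay_halving decide whether calling half() is still our job. A condensed sketch, with opt and build_model standing in for the agent's own:

    import parlai.utils.fsdp as fsdp_utils

    with fsdp_utils.maybe_fsdp_wrap(opt):              # no-op unless ddp_backend is zero2
        model = fsdp_utils.fsdp_wrap(build_model())    # root wrap; layers were wrapped in build_layers
        if opt['fp16'] and not fsdp_utils.delay_halving(opt):
            # only halve manually when FSDP's mixed_precision is not doing it for us
            model = model.half()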

‎tests/test_distributed.py

+94, -88

@@ -5,7 +5,6 @@
 # LICENSE file in the root directory of this source tree.

 import os
-import copy
 import unittest
 import parlai.utils.testing as testing_utils
 import parlai.scripts.build_dict as build_dict
@@ -15,21 +14,30 @@
 BATCHSIZE = 4


-def _forced_parse(parser, opt):
-    parser.set_params(**opt)
-    parser.set_params(log_every_n_sec=10)
-    popt = parser.parse_args([])
-    # in some rare cases, like for instance if the model class also
-    # overrides its default params, the params override will not
-    # be taken into account.
-    for k, v in opt.items():
-        popt[k] = v
-    return popt
+class _AbstractTest(unittest.TestCase):
+    def _distributed_train_model(self, **overrides):
+        opt = {**self.base_config, **overrides}
+        with testing_utils.tempdir() as tmpdir:
+            if 'model_file' not in opt:
+                opt['model_file'] = os.path.join(tmpdir, 'model')
+            if 'dict_file' not in opt:
+                opt['dict_file'] = os.path.join(tmpdir, 'model.dict')
+
+            parser = mp_train.setup_args()
+            popt = parser.parse_kwargs(**opt)
+
+            # we need a prebuilt dictionary
+            parser = build_dict.setup_args()
+            build_dict.build_dict(popt)
+
+            valid, test = mp_train.launch_and_train(popt)
+
+        return (valid, test)


 @testing_utils.skipUnlessGPU
-class TestDistributed(unittest.TestCase):
-    _base_config = dict(
+class TestDistributed(_AbstractTest):
+    base_config = dict(
         task='integration_tests:overfit',
         model='transformer/generator',
         optimizer='adam',
@@ -46,30 +54,8 @@ class TestDistributed(unittest.TestCase):
         verbose=True,
     )

-    def setUp(self):
-        print(f'[Setting up test {self._testMethodName}]')
-
-    def _distributed_train_model(self, opt):
-        with testing_utils.tempdir() as tmpdir:
-            if 'model_file' not in opt:
-                opt['model_file'] = os.path.join(tmpdir, 'model')
-            if 'dict_file' not in opt:
-                opt['dict_file'] = os.path.join(tmpdir, 'model.dict')
-
-            parser = mp_train.setup_args()
-            popt = _forced_parse(parser, opt)
-
-            # we need a prebuilt dictionary
-            parser = build_dict.setup_args()
-            build_dict.build_dict(popt)
-
-            valid, test = mp_train.launch_and_train(popt, 31338)
-
-            return (valid, test)
-
     def test_generator_distributed(self):
-        config = copy.deepcopy(self._base_config)
-        valid, test = self._distributed_train_model(config)
+        valid, test = self._distributed_train_model()

         self.assertLessEqual(valid['ppl'], 1.60)
         self.assertLessEqual(test['ppl'], 1.60)
@@ -80,11 +66,11 @@ def test_generator_distributed(self):
         self.assertEqual(test['exs'].value(), BATCHSIZE)

     def test_multitask_distributed(self):
-        config = copy.deepcopy(self._base_config)
-        config['num_epochs'] = 50
-        config['task'] = 'integration_tests:overfit,integration_tests:overfit_multiturn'
-        config['dynb'] = 'full'
-        valid, test = self._distributed_train_model(config)
+        valid, test = self._distributed_train_model(
+            num_epochs=50,
+            task='integration_tests:overfit,integration_tests:overfit_multiturn',
+            truncate=16,
+        )

         self.assertLessEqual(valid['ppl'], 1.20)
         self.assertLessEqual(test['ppl'], 1.20)
@@ -100,12 +86,12 @@ def test_multitask_distributed(self):
         )

     def test_distributed_eval_max_exs(self):
-        config = copy.deepcopy(self._base_config)
-        config['task'] = 'integration_tests'
-        config['num_epochs'] = 0.01
-        config['validation_max_exs'] = 90
-        config['short_final_eval'] = True
-        valid, test = self._distributed_train_model(config)
+        valid, test = self._distributed_train_model(
+            task='integration_tests',
+            num_epochs=0.01,
+            validation_max_exs=90,
+            short_final_eval=True,
+        )

         # Tests that DialogData.get() is doing the right thing
         # Ensure no duplication of examples among workers
@@ -120,11 +106,9 @@ def test_distributed_eval_max_exs(self):
         self.assertEqual(test['exs'].value(), 96)

     def test_distributed_eval_stream_mode(self):
-        config = copy.deepcopy(self._base_config)
-        config['task'] = 'integration_tests'
-        config['num_epochs'] = 0.01
-        config['datatype'] = 'train:stream'
-        valid, test = self._distributed_train_model(config)
+        valid, test = self._distributed_train_model(
+            task='integration_tests', num_epochs=0.01, datatype='train:stream'
+        )

         # Tests that StreamDialogData.get() is doing the right thing
         # Ensure no duplication of examples among workers
@@ -133,14 +117,13 @@ def test_distributed_eval_stream_mode(self):
         self.assertEqual(test['exs'].value(), inttests.NUM_TEST)

     def test_distributed_eval_stream_mode_max_exs(self):
-        config = copy.deepcopy(self._base_config)
-        config['task'] = 'integration_tests'
-        config['num_epochs'] = 0.01
-        config['datatype'] = 'train:stream'
-        config['validation_max_exs'] = 90
-        config['short_final_eval'] = True
-
-        valid, test = self._distributed_train_model(config)
+        valid, test = self._distributed_train_model(
+            task='integration_tests',
+            num_epochs=0.01,
+            datatype='train:stream',
+            validation_max_exs=90,
+            short_final_eval=True,
+        )

         # Tests that StreamDialogData.get() is doing the right thing
         # Ensure no duplication of examples among workers
@@ -155,45 +138,68 @@ def test_distributed_eval_stream_mode_max_exs(self):
         self.assertEqual(test['exs'].value(), 96)

     def test_chunked_dynamic_teacher(self):
-        config = copy.deepcopy(self._base_config)
-        config['task'] = 'integration_tests'
-        config['num_epochs'] = 0.01
-        config['datatype'] = 'train:stream'
-        config['dynamic_batching'] = 'full'
-        config['truncate'] = 16
-
-        valid, test = self._distributed_train_model(config)
+        valid, test = self._distributed_train_model(
+            task='integration_tests',
+            num_epochs=0.01,
+            datatype='train:stream',
+            dynamic_batching='full',
+            truncate=16,
+        )
         assert valid['exs'].value() == inttests.NUM_TEST
         assert test['exs'].value() == inttests.NUM_TEST

     def test_chunked_teacher(self):
-        config = copy.deepcopy(self._base_config)
-        config['task'] = 'integration_tests'
-        config['num_epochs'] = 0.01
-        config['datatype'] = 'train:stream'
-        config['num_epochs'] = 5
-        config['dynamic_batching'] = None
-
-        valid, test = self._distributed_train_model(config)
+        valid, test = self._distributed_train_model(
+            task='integration_tests',
+            datatype='train:stream',
+            num_epochs=5,
+            dynamic_batching=None,
+        )
         assert valid['exs'].value() == inttests.NUM_TEST
         assert test['exs'].value() == inttests.NUM_TEST

+
+@testing_utils.skipUnlessGPU
+class TestZero2(TestDistributed):
+    """
+    Integration tests for zero2 FSDP.
+    """
+
+    base_config = {**TestDistributed.base_config, 'ddp_backend': 'zero2'}
+
+
+@unittest.skip
+@testing_utils.skipUnlessGPU
+class TestZero3(TestDistributed):
+    # Not supported at this time. See:
+    # https://github.com/facebookresearch/ParlAI/pull/3740
+    base_config = {**TestDistributed.base_config, 'ddp_backend': 'zero3'}
+
+
+@testing_utils.skipUnlessGPU
+class TestNoModelParallel(_AbstractTest):
+    base_config = dict(
+        task='integration_tests:overfit',
+        optimizer='sgd',
+        validation_metric='loss',
+        learningrate=1e-2,
+        batchsize=BATCHSIZE,
+        validation_every_n_epochs=1,
+        num_epochs=1,
+        n_layers=1,
+        n_heads=1,
+        ffn_size=32,
+        embedding_size=8,
+        verbose=True,
+    )
+
     def test_no_model_parallel(self):
         """
-        Checks that we throw an error when combining mp_train with.
-
-        --model-parallel true.
+        Checks that we throw an error when combining mp_train with --model-parallel.
         """
-        config = copy.deepcopy(self._base_config)
-        config['model_parallel'] = True
-        for m in [
-            'transformer/generator',
-            'transformer/ranker',
-            'transformer/classifier',
-        ]:
-            config['model'] = m
+        for m in ['transformer/generator', 'transformer/ranker']:
             try:
-                _ = self._distributed_train_model(config)
+                _ = self._distributed_train_model(model=m, model_parallel=True)
             except RuntimeError:
                 pass
             else:
