diff --git a/mason.py b/mason.py index 00238a42b..fafa64d0f 100644 --- a/mason.py +++ b/mason.py @@ -9,7 +9,9 @@ import sys import time +import backoff import beaker +import requests from rich.console import Console from rich.text import Text @@ -796,6 +798,9 @@ def main(): beaker_secrets = [secret.name for secret in beaker_client.secret.list()] whoami = beaker_client.user.get().name + # Increase timeout to 300s for large experiment specs. + beaker_client.TIMEOUT = 300 + full_commands = [make_internal_command(command, args, whoami, is_external_user) for command in commands] if is_external_user: console.rule("[bold red]Non-Ai2 User Detected[/bold red]") @@ -822,8 +827,20 @@ def main(): budget=args.budget, retry=beaker.BeakerRetrySpec(allowed_task_retries=args.max_retries), ) - exp = beaker_client.experiment.create(spec=experiment_spec) - console.log(f"Kicked off Beaker job. https://beaker.org/ex/{exp.experiment.id}") + + @backoff.on_exception( + backoff.expo, + requests.exceptions.Timeout, + max_tries=5, + # Factor here is the multiplier for the backoff delay, in seconds. + factor=5, + ) + def launch_experiment(): + exp = beaker_client.experiment.create(spec=experiment_spec) + console.log(f"Kicked off Beaker job. https://beaker.org/ex/{exp.experiment.id}") + return exp + + launch_experiment() if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 277c64aa8..70ec3a00c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,7 @@ requires-python = "==3.12.*" dependencies = [ "accelerate>=1.10.1", "antlr4-python3-runtime==4.11", + "backoff>=2.2.1", "bitsandbytes>=0.44.1; platform_system != 'Darwin'", "datasets>=4.0.0", "debugpy>=1.8.13", diff --git a/uv.lock b/uv.lock index 17ac55a36..a230939ee 100644 --- a/uv.lock +++ b/uv.lock @@ -163,6 +163,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2", size = 10182537, upload-time = "2025-02-01T15:17:37.39Z" }, ] +[[package]] +name = "backoff" +version = "2.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/47/d7/5bbeb12c44d7c4f2fb5b56abce497eb5ed9f34d85701de869acedd602619/backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba", size = 17001, upload-time = "2022-10-05T19:19:32.061Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/df/73/b6e24bd22e6720ca8ee9a85a0c4a2971af8497d8f3193fa05390cbd46e09/backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8", size = 15148, upload-time = "2022-10-05T19:19:30.546Z" }, +] + [[package]] name = "backrefs" version = "5.9" @@ -1752,6 +1761,7 @@ source = { editable = "." } dependencies = [ { name = "accelerate" }, { name = "antlr4-python3-runtime" }, + { name = "backoff" }, { name = "bitsandbytes", marker = "sys_platform != 'darwin'" }, { name = "datasets" }, { name = "debugpy" }, @@ -1802,6 +1812,7 @@ dev = [ requires-dist = [ { name = "accelerate", specifier = ">=1.10.1" }, { name = "antlr4-python3-runtime", specifier = "==4.11" }, + { name = "backoff", specifier = ">=2.2.1" }, { name = "bitsandbytes", marker = "sys_platform != 'darwin'", specifier = ">=0.44.1" }, { name = "datasets", specifier = ">=4.0.0" }, { name = "debugpy", specifier = ">=1.8.13" },