From 7d4b90fc6461991348492eefe5d9d52e43914b7f Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Wed, 19 Nov 2025 07:30:59 -0700 Subject: [PATCH 1/4] debugging beaker --- mason.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mason.py b/mason.py index 00238a42b..e6fec5138 100644 --- a/mason.py +++ b/mason.py @@ -795,7 +795,9 @@ def main(): beaker_client = beaker.Beaker.from_env() beaker_secrets = [secret.name for secret in beaker_client.secret.list()] whoami = beaker_client.user.get().name - + print(f"{beaker_client.MAX_RETRIES=}") + print(f"{beaker_client.TIMEOUT=}") + return full_commands = [make_internal_command(command, args, whoami, is_external_user) for command in commands] if is_external_user: console.rule("[bold red]Non-Ai2 User Detected[/bold red]") From 2e0acea9dce70c699daf3503ef7dce5484a2c8ee Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Wed, 19 Nov 2025 07:45:59 -0700 Subject: [PATCH 2/4] Added exponential backoff. --- mason.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/mason.py b/mason.py index e6fec5138..a23a78325 100644 --- a/mason.py +++ b/mason.py @@ -1,4 +1,5 @@ import argparse +import backoff import hashlib import os import random @@ -795,9 +796,10 @@ def main(): beaker_client = beaker.Beaker.from_env() beaker_secrets = [secret.name for secret in beaker_client.secret.list()] whoami = beaker_client.user.get().name - print(f"{beaker_client.MAX_RETRIES=}") - print(f"{beaker_client.TIMEOUT=}") - return + + # Increase timeout to 300s for large experiment specs. + beaker_client.TIMEOUT = 300 + full_commands = [make_internal_command(command, args, whoami, is_external_user) for command in commands] if is_external_user: console.rule("[bold red]Non-Ai2 User Detected[/bold red]") @@ -824,8 +826,20 @@ def main(): budget=args.budget, retry=beaker.BeakerRetrySpec(allowed_task_retries=args.max_retries), ) - exp = beaker_client.experiment.create(spec=experiment_spec) - console.log(f"Kicked off Beaker job. https://beaker.org/ex/{exp.experiment.id}") + + @backoff.on_exception( + backoff.expo, + requests.exceptions.Timeout, + max_tries=5, + # Factor here is the multiplier for the backoff delay, in seconds. + factor=5, + ) + def launch_experiment(): + exp = beaker_client.experiment.create(spec=experiment_spec) + console.log(f"Kicked off Beaker job. https://beaker.org/ex/{exp.experiment.id}") + return exp + + launch_epxeriment() if __name__ == "__main__": From 1ba00c47a1a3d29a0d6cc36f4ce59b0421da0585 Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Wed, 19 Nov 2025 07:46:42 -0700 Subject: [PATCH 3/4] Added backoff lib --- pyproject.toml | 1 + uv.lock | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 277c64aa8..70ec3a00c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,7 @@ requires-python = "==3.12.*" dependencies = [ "accelerate>=1.10.1", "antlr4-python3-runtime==4.11", + "backoff>=2.2.1", "bitsandbytes>=0.44.1; platform_system != 'Darwin'", "datasets>=4.0.0", "debugpy>=1.8.13", diff --git a/uv.lock b/uv.lock index 17ac55a36..a230939ee 100644 --- a/uv.lock +++ b/uv.lock @@ -163,6 +163,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2", size = 10182537, upload-time = "2025-02-01T15:17:37.39Z" }, ] +[[package]] +name = "backoff" +version = "2.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/47/d7/5bbeb12c44d7c4f2fb5b56abce497eb5ed9f34d85701de869acedd602619/backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba", size = 17001, upload-time = "2022-10-05T19:19:32.061Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/df/73/b6e24bd22e6720ca8ee9a85a0c4a2971af8497d8f3193fa05390cbd46e09/backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8", size = 15148, upload-time = "2022-10-05T19:19:30.546Z" }, +] + [[package]] name = "backrefs" version = "5.9" @@ -1752,6 +1761,7 @@ source = { editable = "." } dependencies = [ { name = "accelerate" }, { name = "antlr4-python3-runtime" }, + { name = "backoff" }, { name = "bitsandbytes", marker = "sys_platform != 'darwin'" }, { name = "datasets" }, { name = "debugpy" }, @@ -1802,6 +1812,7 @@ dev = [ requires-dist = [ { name = "accelerate", specifier = ">=1.10.1" }, { name = "antlr4-python3-runtime", specifier = "==4.11" }, + { name = "backoff", specifier = ">=2.2.1" }, { name = "bitsandbytes", marker = "sys_platform != 'darwin'", specifier = ">=0.44.1" }, { name = "datasets", specifier = ">=4.0.0" }, { name = "debugpy", specifier = ">=1.8.13" }, From 58d4bcf6b0a5effcf911376ba6a619d0c409f3cf Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Wed, 19 Nov 2025 07:52:35 -0700 Subject: [PATCH 4/4] ran linter --- mason.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mason.py b/mason.py index a23a78325..fafa64d0f 100644 --- a/mason.py +++ b/mason.py @@ -1,5 +1,4 @@ import argparse -import backoff import hashlib import os import random @@ -10,7 +9,9 @@ import sys import time +import backoff import beaker +import requests from rich.console import Console from rich.text import Text @@ -839,7 +840,7 @@ def launch_experiment(): console.log(f"Kicked off Beaker job. https://beaker.org/ex/{exp.experiment.id}") return exp - launch_epxeriment() + launch_experiment() if __name__ == "__main__":