Skip to content

Commit fcb4347

Browse files
Adds exponential backoff to the Beaker launch command, and increases the default timeout. (#1212)
* Debugging Beaker.
* Added exponential backoff.
* Added backoff lib.
* Ran linter.
1 parent 70b0472 commit fcb4347

File tree

3 files changed

+31
-2
lines changed

3 files changed

+31
-2
lines changed

mason.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
import sys
1010
import time
1111

12+
import backoff
1213
import beaker
14+
import requests
1315
from rich.console import Console
1416
from rich.text import Text
1517

@@ -794,6 +796,9 @@ def main():
794796
beaker_secrets = [secret.name for secret in beaker_client.secret.list()]
795797
whoami = beaker_client.user.get().name
796798

799+
# Increase timeout to 300s for large experiment specs.
800+
beaker_client.TIMEOUT = 300
801+
797802
full_commands = [make_internal_command(command, args, whoami, is_external_user) for command in commands]
798803
if is_external_user:
799804
console.rule("[bold red]Non-Ai2 User Detected[/bold red]")
@@ -820,8 +825,20 @@ def main():
820825
budget=args.budget,
821826
retry=beaker.BeakerRetrySpec(allowed_task_retries=args.max_retries),
822827
)
823-
exp = beaker_client.experiment.create(spec=experiment_spec)
824-
console.log(f"Kicked off Beaker job. https://beaker.org/ex/{exp.experiment.id}")
828+
829+
@backoff.on_exception(
830+
backoff.expo,
831+
requests.exceptions.Timeout,
832+
max_tries=5,
833+
# Factor here is the multiplier for the backoff delay, in seconds.
834+
factor=5,
835+
)
836+
def launch_experiment():
837+
exp = beaker_client.experiment.create(spec=experiment_spec)
838+
console.log(f"Kicked off Beaker job. https://beaker.org/ex/{exp.experiment.id}")
839+
return exp
840+
841+
launch_experiment()
825842

826843

827844
if __name__ == "__main__":

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ requires-python = "==3.12.*"
77
dependencies = [
88
"accelerate>=1.10.1",
99
"antlr4-python3-runtime==4.11",
10+
"backoff>=2.2.1",
1011
"bitsandbytes>=0.44.1; platform_system != 'Darwin'",
1112
"datasets>=4.0.0",
1213
"debugpy>=1.8.13",

uv.lock

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)