run_exp.py
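"""Launch AgentLab GenericAgent experiments on a BrowserGym benchmark via the Study API.

Example:
    python run_exp.py --benchmark miniwob --n-jobs 4 --model-name <model-name>
"""
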
import argparse
import logging

import bgym
from agentlab.agents import dynamic_prompting as dp
from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs
from agentlab.agents.generic_agent.generic_agent_prompt import GenericPromptFlags
from agentlab.experiments.study import Study
from agentlab.llm.chat_api import OpenAIModelArgs

logging.getLogger().setLevel(logging.INFO)


def parse_args() -> argparse.Namespace:
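    """Parse command-line arguments controlling the study configuration."""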
    parser = argparse.ArgumentParser(
        description="Run experiments with specified parameters",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="Example: python run_exp.py --benchmark miniwob --n-jobs 4 --model-name <model-name>",
    )

    # Add command line arguments with more detailed help
    parser.add_argument(
        "--reproducibility-mode",
        action="store_true",
        help='Enable reproducibility mode. This will "ask" agents to be deterministic '
        "and prevent launching with local changes.",
    )
    parser.add_argument(
        "--benchmark",
        type=str,
        default="miniwob",
        choices=[
            "miniwob_tiny_test",
            "miniwob",
            "workarena_l1",
            "webarena",
            "workarena_l2_agent_curriculum_eval",
            "workarena_l3_agent_curriculum_eval",
            "webarena_tiny",
        ],
help="Benchmark to run on. Options include:\n"
"- miniwob_tiny_test: Small test benchmark\n"
"- miniwob: Standard miniwob benchmark\n"
"- workarena_l1/l2/l3: Different levels of workarena\n"
"- webarena_tiny: Tiny webarena benchmark\n"
"(default: %(default)s)",
)
    parser.add_argument(
        "--relaunch",
        action="store_true",
        help="Relaunch an existing study. This will continue incomplete "
        "experiments and relaunch errored experiments.",
    )
    parser.add_argument(
        "--n-jobs",
        type=int,
        default=1,
        help="Number of parallel jobs to run. Use 1 for debugging in VSCode. "
        "Set higher for parallel processing. (default: %(default)s)",
    )
    parser.add_argument(
        "--n-relaunch",
        type=int,
        default=1,
        help="Number of times to retry launching experiments. "
        "Useful for handling transient errors. (default: %(default)s)",
    )
    parser.add_argument(
        "--model-name",
        type=str,
        required=True,
        help="Model name passed to OpenAIModelArgs.",
    )

    return parser.parse_args()


def create_agent_args(model_name: str) -> GenericAgentArgs:
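    """Build the GenericAgent configuration for the given model."""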
    model_args = OpenAIModelArgs(
        model_name=model_name,
        max_total_tokens=32_000,
        max_input_tokens=30_000,
        max_new_tokens=2_000,
        temperature=0.7,
    )

    prompt_flags = GenericPromptFlags(
        obs=dp.ObsFlags(
            use_html=False,
            use_ax_tree=True,
            use_focused_element=False,
            use_error_logs=True,
            use_history=True,
            use_past_error_logs=False,
            use_action_history=True,
            use_think_history=False,
            use_diff=False,
            html_type="pruned_html",
            use_screenshot=False,
            use_som=False,
            extract_visible_tag=False,
            extract_clickable_tag=False,
            extract_coords="False",
            filter_visible_elements_only=False,
        ),
        action=dp.ActionFlags(
            action_set=bgym.HighLevelActionSetArgs(
                subsets=["bid"],
                multiaction=False,
            ),
            long_description=False,
            individual_examples=False,
        ),
        use_plan=False,
        use_criticise=False,
        use_thinking=True,  # Remember to change this
        use_memory=True,
        use_concrete_example=True,
        use_abstract_example=True,
        use_hints=False,
        enable_chat=False,
        max_prompt_tokens=25_000,
        be_cautious=True,
        extra_instructions=None,
    )

    return GenericAgentArgs(
        chat_model_args=model_args,
        flags=prompt_flags,
    )


def run_study(args: argparse.Namespace) -> None:
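    """Create (or relaunch) a Study for the configured agent and run it."""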
    agent_args = [create_agent_args(model_name=args.model_name)]

    if args.reproducibility_mode:
        for a in agent_args:
            a.set_reproducibility_mode()

    if args.relaunch:
        # relaunch an existing study
        study = Study.load_most_recent(contains=None)
        study.find_incomplete(include_errors=True)
    else:
        study = Study(agent_args, args.benchmark, logging_level_stdout=logging.WARNING)

    study.run(
        n_jobs=args.n_jobs,
        parallel_backend="ray",
        strict_reproducibility=args.reproducibility_mode,
        n_relaunch=args.n_relaunch,
    )

    if args.reproducibility_mode:
        study.append_to_journal(strict_reproducibility=True)


def main() -> None:
    args = parse_args()
    run_study(args)


if __name__ == "__main__":
    main()