Skip to content

Commit ff32eb3

Browse files
committed
✨ Add OpenClaw benchmarks
1 parent c1235c7 commit ff32eb3

5 files changed

Lines changed: 1051 additions & 1 deletion

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ Reproducible benchmarks for coding agents and models using Harbor
3636
| RedHatAI/Qwen3.6-35B-A3B-NVFP4 | Pi | [65.0%](./benchmarks/SWE_Bench_Qwen3.6_35b_NVFP4_Pi.md) | $51<sup>†</sup> |
3737
| RedHatAI/Qwen3.6-35B-A3B-NVFP4 | Qwen Code | [63.8%](./benchmarks/SWE_Bench_Qwen3.6_35b_NVFP4_Qwen_Code.md) | $37<sup>†</sup> |
3838
| RedHatAI/Qwen3.6-35B-A3B-NVFP4 | Claude Code | [63.2%](./benchmarks/SWE_Bench_Qwen3.6_35b_NVFP4_Claude_Code.md) | $48<sup>†</sup> |
39+
| RedHatAI/Qwen3.6-35B-A3B-NVFP4 | OpenClaw | [58.8%](./benchmarks/SWE_Bench_Qwen3.6_35b_NVFP4_OpenClaw.md) | $67<sup>†</sup> |
3940
| RedHatAI/Qwen3.6-35B-A3B-NVFP4 | OpenCode | [54.8%](./benchmarks/SWE_Bench_Qwen3.6_35b_NVFP4_OpenCode.md) | $67<sup>†</sup> |
4041

4142

Lines changed: 294 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,294 @@
1+
# SWE-Bench Pro Ansible Qwen3.6-35B-A3B-NVFP4 OpenClaw
2+
3+
## Benchmark
4+
5+
**Dataset:** [swe-bench-pro--ansible](https://huggingface.co/datasets/ScaleAI/SWE-bench_Pro) (96 tasks, ansible only)
6+
**Model:** [RedHatAI/Qwen3.6-35B-A3B-NVFP4](https://huggingface.co/RedHatAI/Qwen3.6-35B-A3B-NVFP4)
7+
**Harness:** OpenClaw
8+
**Environment:** Docker
9+
**Job Name:** 2026-06-01__10-32-15
10+
11+
## Results
12+
13+
**Score:** 40.6%
14+
**Errors (Initial Run):** 23 (5 remaining after the two resumes; see `result.json` below)
15+
**Total Time:** 01h 34m 02s
16+
**Agent Time:** 01h 10m 31s
17+
**Estimated Cost:** $9.4 ($4 / GPU / hr * 2 GPU * 01h 10m 31s)
18+
19+
## vLLM Server Config
20+
21+
**Manifest:** [Qwen3.6_35b_NVFP4.yml](../deploy/Qwen3.6_35b_NVFP4.yml)
22+
**Hardware:** 2x A100 40GB
23+
**Model Max Len:** 262,144
24+
**Max Concurrency:** 9.2x
25+
**Generation Config (Defaults):**
26+
- **Temperature:** 1.0
27+
- **Top p:** 0.95
28+
- **Top k:** 20
29+
30+
## Harbor Config
31+
32+
**Command:**
33+
34+
```bash
35+
export BENCHMARK='scale-ai/swe-bench-pro'
36+
export DATASET_PATTERN='*ansible*'
37+
export MODEL_NAME='qwen3.6-35b'
38+
export SERVER_URL='http://qwen36-35b-qwen36-35b.apps.ocp-beta-test.nerc.mghpcc.org'
39+
export OPENAI_BASE_URL=$SERVER_URL/v1
40+
export OPENAI_API_KEY='NONE'
41+
42+
harbor run --agent openclaw -d $BENCHMARK \
43+
-i $DATASET_PATTERN \
44+
-m openai/$MODEL_NAME \
45+
--agent-kwarg thinking=off \
46+
--n-concurrent 9
47+
48+
# Resume the job twice to eliminate transient issues with the LLM
49+
harbor jobs resume -p jobs/2026-06-01__10-32-15 -f NonZeroAgentExitCodeError -f RuntimeError
50+
harbor jobs resume -p jobs/2026-06-01__10-32-15 -f NonZeroAgentExitCodeError -f RuntimeError
51+
```
52+
53+
**`config.json`:**
54+
55+
```json
56+
{
57+
"job_name": "2026-06-01__10-32-15",
58+
"jobs_dir": "jobs",
59+
"n_attempts": 1,
60+
"timeout_multiplier": 1.0,
61+
"agent_timeout_multiplier": null,
62+
"verifier_timeout_multiplier": null,
63+
"agent_setup_timeout_multiplier": null,
64+
"environment_build_timeout_multiplier": null,
65+
"debug": false,
66+
"n_concurrent_trials": 9,
67+
"quiet": false,
68+
"retry": {
69+
"max_retries": 0,
70+
"include_exceptions": null,
71+
"exclude_exceptions": [
72+
"AgentTimeoutError",
73+
"VerifierOutputParseError",
74+
"RewardFileEmptyError",
75+
"RewardFileNotFoundError",
76+
"VerifierTimeoutError"
77+
],
78+
"wait_multiplier": 1.0,
79+
"min_wait_sec": 1.0,
80+
"max_wait_sec": 60.0
81+
},
82+
"environment": {
83+
"type": "docker",
84+
"import_path": null,
85+
"force_build": false,
86+
"delete": true,
87+
"cpu_enforcement_policy": "auto",
88+
"memory_enforcement_policy": "auto",
89+
"override_cpus": null,
90+
"override_memory_mb": null,
91+
"override_storage_mb": null,
92+
"override_gpus": null,
93+
"override_tpu": null,
94+
"mounts": null,
95+
"extra_docker_compose": [],
96+
"env": {},
97+
"kwargs": {},
98+
"extra_allowed_hosts": []
99+
},
100+
"verifier": {
101+
"override_timeout_sec": null,
102+
"max_timeout_sec": null,
103+
"env": {},
104+
"disable": false
105+
},
106+
"metrics": [],
107+
"agents": [
108+
{
109+
"name": "openclaw",
110+
"import_path": null,
111+
"model_name": "openai/qwen3.6-35b",
112+
"skills": [],
113+
"override_timeout_sec": null,
114+
"override_setup_timeout_sec": null,
115+
"max_timeout_sec": null,
116+
"extra_allowed_hosts": [],
117+
"kwargs": {
118+
"thinking": "off"
119+
},
120+
"env": {},
121+
"mcp_servers": []
122+
}
123+
],
124+
"datasets": [
125+
{
126+
"path": null,
127+
"name": "scale-ai/swe-bench-pro",
128+
"version": null,
129+
"ref": "sha256:88411d32ff27e53a4c1a7e29f0c2aeba180c8e5d60f221cab5ed56325f33549d",
130+
"registry_url": null,
131+
"registry_path": null,
132+
"overwrite": false,
133+
"download_dir": null,
134+
"task_names": [
135+
"*ansible*"
136+
],
137+
"exclude_task_names": null,
138+
"n_tasks": null
139+
}
140+
],
141+
"tasks": [],
142+
"artifacts": [],
143+
"extra_instruction_paths": []
144+
}
145+
```
146+
147+
## `result.json`
148+
149+
```json
150+
{
151+
"id": "8b7c2223-d7dd-40f0-a639-313349450d65",
152+
"started_at": "2026-06-01T10:32:18.899562",
153+
"updated_at": "2026-06-01T15:21:20.134125",
154+
"finished_at": "2026-06-01T15:21:20.134125",
155+
"n_total_trials": 96,
156+
"stats": {
157+
"n_completed_trials": 96,
158+
"n_errored_trials": 5,
159+
"n_running_trials": 0,
160+
"n_pending_trials": 0,
161+
"n_cancelled_trials": 0,
162+
"n_retries": 0,
163+
"evals": {
164+
"openclaw__qwen3.6-35b__scale-ai/swe-bench-pro": {
165+
"n_trials": 96,
166+
"n_errors": 5,
167+
"metrics": [
168+
{
169+
"mean": 0.40625
170+
}
171+
],
172+
"pass_at_k": {},
173+
"reward_stats": {
174+
"reward": {
175+
"1.0": [
176+
"instance_ansible__ansible-d2f809__G4qYShN",
177+
"instance_ansible__ansible-f8ef34__3qtEMb7",
178+
"instance_ansible__ansible-ed6581__929Ky2r",
179+
"instance_ansible__ansible-a7d2a4__b9NDtPa",
180+
"instance_ansible__ansible-be59ca__NsEdc3x",
181+
"instance_ansible__ansible-3b823d__kdMFegq",
182+
"instance_ansible__ansible-12734f__zaLkv6t",
183+
"instance_ansible__ansible-a6e671__WDPc9Cf",
184+
"instance_ansible__ansible-526052__AaDtrAp",
185+
"instance_ansible__ansible-5c225d__Mswtjwu",
186+
"instance_ansible__ansible-cd9c4e__8AvpJ8V",
187+
"instance_ansible__ansible-935528__QS9epqL",
188+
"instance_ansible__ansible-3db08a__wpAT4v2",
189+
"instance_ansible__ansible-8127ab__vTXHYXE",
190+
"instance_ansible__ansible-a02e22__c86b8zV",
191+
"instance_ansible__ansible-fb144c__vQDb7bP",
192+
"instance_ansible__ansible-29aea9__wcMGrat",
193+
"instance_ansible__ansible-489156__SkD3rZG",
194+
"instance_ansible__ansible-d9f186__PmKWeya",
195+
"instance_ansible__ansible-0ea40e__qpCVJgc",
196+
"instance_ansible__ansible-189fcb__HGBtB4j",
197+
"instance_ansible__ansible-415e08__Mqouv7C",
198+
"instance_ansible__ansible-b8025a__tAuRGoy",
199+
"instance_ansible__ansible-d6d225__D5hbFVM",
200+
"instance_ansible__ansible-748f53__reMnqVw",
201+
"instance_ansible__ansible-ea04e0__SWYeTyE",
202+
"instance_ansible__ansible-be2c37__iq9Vzgj",
203+
"instance_ansible__ansible-b2a289__fBvew5v",
204+
"instance_ansible__ansible-984216__Nhu6YMP",
205+
"instance_ansible__ansible-e0c91a__tqN4yis",
206+
"instance_ansible__ansible-a20a52__8ccWSBa",
207+
"instance_ansible__ansible-379058__M6VkPmr",
208+
"instance_ansible__ansible-cb94c0__npmhAXw",
209+
"instance_ansible__ansible-f327e6__VXWCvYt",
210+
"instance_ansible__ansible-9142be__hNSQxcD",
211+
"instance_ansible__ansible-1ee70f__92pt4AJ",
212+
"instance_ansible__ansible-a26c32__AV8JGER",
213+
"instance_ansible__ansible-185d41__uaQzcxC",
214+
"instance_ansible__ansible-395e5e__cqKfWTb"
215+
],
216+
"0.0": [
217+
"instance_ansible__ansible-942424__eYpnQsP",
218+
"instance_ansible__ansible-b748ed__BeVA4D7",
219+
"instance_ansible__ansible-622a49__tAXjjTJ",
220+
"instance_ansible__ansible-5d253a__wy3uMxz",
221+
"instance_ansible__ansible-6cc974__PaXsoY6",
222+
"instance_ansible__ansible-9a21e2__CWj5eFa",
223+
"instance_ansible__ansible-cd473d__QanNHyj",
224+
"instance_ansible__ansible-a1569e__c2L2wmn",
225+
"instance_ansible__ansible-1b7026__qy5f25G",
226+
"instance_ansible__ansible-709484__uELUtxe",
227+
"instance_ansible__ansible-bf98f0__Ag2QcLc",
228+
"instance_ansible__ansible-e40889__yEf9PDu",
229+
"instance_ansible__ansible-949c50__zEXDfLm",
230+
"instance_ansible__ansible-0fd887__r2uGLmZ",
231+
"instance_ansible__ansible-d62496__LeCoVnY",
232+
"instance_ansible__ansible-deb54e__zb55HZr",
233+
"instance_ansible__ansible-d30fc6__JauvnvX",
234+
"instance_ansible__ansible-1a4644__urrU7qA",
235+
"instance_ansible__ansible-502270__6ZEvVbn",
236+
"instance_ansible__ansible-ecea15__DXaWkps",
237+
"instance_ansible__ansible-40ade1__5eErWDd",
238+
"instance_ansible__ansible-e22e10__hQe8Hqy",
239+
"instance_ansible__ansible-c1f2df__hT2coAc",
240+
"instance_ansible__ansible-11c177__fPhZiy5",
241+
"instance_ansible__ansible-9759e0__YK3t2jZ",
242+
"instance_ansible__ansible-f86c58__NCrtTu4",
243+
"instance_ansible__ansible-3889dd__A4H8JuT",
244+
"instance_ansible__ansible-5e3696__mR73qLU",
245+
"instance_ansible__ansible-eea46a__QxMbywo",
246+
"instance_ansible__ansible-83fb24__D8at8j4",
247+
"instance_ansible__ansible-83909b__RF2ATPR",
248+
"instance_ansible__ansible-4c5ce5__Zmc9TbP",
249+
"instance_ansible__ansible-de01db__WfbSy7J",
250+
"instance_ansible__ansible-e64c6c__dK2wMj2",
251+
"instance_ansible__ansible-de5858__aTSLTDa",
252+
"instance_ansible__ansible-5e88cd__8pTJSNR",
253+
"instance_ansible__ansible-d58e69__ANFKjCa",
254+
"instance_ansible__ansible-bec27f__Ck8dJLC",
255+
"instance_ansible__ansible-164881__JnijTps",
256+
"instance_ansible__ansible-776587__A3fkLrY",
257+
"instance_ansible__ansible-e9e600__zA7oaia",
258+
"instance_ansible__ansible-39bd8b__2Jako4k",
259+
"instance_ansible__ansible-34db57__HGYhp9G",
260+
"instance_ansible__ansible-811093__bur7KJC",
261+
"instance_ansible__ansible-1c06c4__LqTNW3j",
262+
"instance_ansible__ansible-5f4e33__peRymQY",
263+
"instance_ansible__ansible-b5e029__XXWvq7B",
264+
"instance_ansible__ansible-42355d__WbwFCXD",
265+
"instance_ansible__ansible-b6290e__2qoKQCK",
266+
"instance_ansible__ansible-d33bed__qvoDfPJ",
267+
"instance_ansible__ansible-c616e5__9m3Hff7",
268+
"instance_ansible__ansible-106909__RnMqzS7",
269+
"instance_ansible__ansible-564009__GLqBgaS",
270+
"instance_ansible__ansible-7e1a34__FwGgYdb",
271+
"instance_ansible__ansible-f02a62__pToCw65",
272+
"instance_ansible__ansible-1bd7dc__krBKcvE",
273+
"instance_ansible__ansible-d72025__mxPmiJz"
274+
]
275+
}
276+
},
277+
"exception_stats": {
278+
"NonZeroAgentExitCodeError": [
279+
"instance_ansible__ansible-b6290e__2qoKQCK",
280+
"instance_ansible__ansible-d33bed__qvoDfPJ",
281+
"instance_ansible__ansible-7e1a34__FwGgYdb",
282+
"instance_ansible__ansible-f02a62__pToCw65",
283+
"instance_ansible__ansible-d72025__mxPmiJz"
284+
]
285+
}
286+
}
287+
},
288+
"n_input_tokens": 0,
289+
"n_cache_tokens": 0,
290+
"n_output_tokens": 0,
291+
"cost_usd": null
292+
}
293+
}
294+
```

0 commit comments

Comments
 (0)