|
| 1 | +# SWE-Bench Pro Ansible Qwen3.6-35B-A3B-NVFP4 OpenClaw |
| 2 | + |
| 3 | +## Benchmark |
| 4 | + |
| 5 | +**Dataset:** [swe-bench-pro--ansible](https://huggingface.co/datasets/ScaleAI/SWE-bench_Pro) (96 tasks, ansible only) |
| 6 | +**Model:** [RedHatAI/Qwen3.6-35B-A3B-NVFP4](https://huggingface.co/RedHatAI/Qwen3.6-35B-A3B-NVFP4) |
| 7 | +**Harness:** OpenClaw |
| 8 | +**Environment:** Docker |
| 9 | +**Job Name:** 2026-06-01__10-32-15 |
| 10 | + |
| 11 | +## Results |
| 12 | + |
| 13 | +**Score:** 40.6% |
| 14 | +**Errors (Initial Run):** 23 |
| 15 | +**Total Time:** 01h 34m 02s |
| 16 | +**Agent Time:** 01h 10m 31s |
| 17 | +**Estimated Cost:** $9.40 ($4 / GPU / hr * 2 GPUs * 01h 10m 31s) |
| 18 | + |
| 19 | +## vLLM Server Config |
| 20 | + |
| 21 | +**Manifest:** [Qwen3.6_35b_NVFP4.yml](../deploy/Qwen3.6_35b_NVFP4.yml) |
| 22 | +**Hardware:** 2x A100 40GB |
| 23 | +**Model Max Len:** 262,144 |
| 24 | +**Max Concurrency:** 9.2x |
| 25 | +**Generation Config (Defaults):** |
| 26 | +- **Temperature:** 1.0 |
| 27 | +- **Top p:** 0.95 |
| 28 | +- **Top k:** 20 |
| 29 | + |
| 30 | +## Harbor Config |
| 31 | + |
| 32 | +**Command:** |
| 33 | + |
| 34 | +```bash |
| 35 | +export BENCHMARK='scale-ai/swe-bench-pro' |
| 36 | +export DATASET_PATTERN='*ansible*' |
| 37 | +export MODEL_NAME='qwen3.6-35b' |
| 38 | +export SERVER_URL='http://qwen36-35b-qwen36-35b.apps.ocp-beta-test.nerc.mghpcc.org' |
| 39 | +export OPENAI_BASE_URL=$SERVER_URL/v1 |
| 40 | +export OPENAI_API_KEY='NONE' |
| 41 | + |
| 42 | +harbor run --agent openclaw -d $BENCHMARK \ |
| 43 | + -i $DATASET_PATTERN \ |
| 44 | + -m openai/$MODEL_NAME \ |
| 45 | + --agent-kwarg thinking=off \ |
| 46 | + --n-concurrent 9 |
| 47 | + |
| 48 | +# Resume the job twice to retry trials that failed with transient LLM errors |
| 49 | +harbor jobs resume -p jobs/2026-06-01__10-32-15 -f NonZeroAgentExitCodeError -f RuntimeError |
| 50 | +harbor jobs resume -p jobs/2026-06-01__10-32-15 -f NonZeroAgentExitCodeError -f RuntimeError |
| 51 | +``` |
| 52 | + |
| 53 | +**`config.json`:** |
| 54 | + |
| 55 | +```json |
| 56 | +{ |
| 57 | + "job_name": "2026-06-01__10-32-15", |
| 58 | + "jobs_dir": "jobs", |
| 59 | + "n_attempts": 1, |
| 60 | + "timeout_multiplier": 1.0, |
| 61 | + "agent_timeout_multiplier": null, |
| 62 | + "verifier_timeout_multiplier": null, |
| 63 | + "agent_setup_timeout_multiplier": null, |
| 64 | + "environment_build_timeout_multiplier": null, |
| 65 | + "debug": false, |
| 66 | + "n_concurrent_trials": 9, |
| 67 | + "quiet": false, |
| 68 | + "retry": { |
| 69 | + "max_retries": 0, |
| 70 | + "include_exceptions": null, |
| 71 | + "exclude_exceptions": [ |
| 72 | + "AgentTimeoutError", |
| 73 | + "VerifierOutputParseError", |
| 74 | + "RewardFileEmptyError", |
| 75 | + "RewardFileNotFoundError", |
| 76 | + "VerifierTimeoutError" |
| 77 | + ], |
| 78 | + "wait_multiplier": 1.0, |
| 79 | + "min_wait_sec": 1.0, |
| 80 | + "max_wait_sec": 60.0 |
| 81 | + }, |
| 82 | + "environment": { |
| 83 | + "type": "docker", |
| 84 | + "import_path": null, |
| 85 | + "force_build": false, |
| 86 | + "delete": true, |
| 87 | + "cpu_enforcement_policy": "auto", |
| 88 | + "memory_enforcement_policy": "auto", |
| 89 | + "override_cpus": null, |
| 90 | + "override_memory_mb": null, |
| 91 | + "override_storage_mb": null, |
| 92 | + "override_gpus": null, |
| 93 | + "override_tpu": null, |
| 94 | + "mounts": null, |
| 95 | + "extra_docker_compose": [], |
| 96 | + "env": {}, |
| 97 | + "kwargs": {}, |
| 98 | + "extra_allowed_hosts": [] |
| 99 | + }, |
| 100 | + "verifier": { |
| 101 | + "override_timeout_sec": null, |
| 102 | + "max_timeout_sec": null, |
| 103 | + "env": {}, |
| 104 | + "disable": false |
| 105 | + }, |
| 106 | + "metrics": [], |
| 107 | + "agents": [ |
| 108 | + { |
| 109 | + "name": "openclaw", |
| 110 | + "import_path": null, |
| 111 | + "model_name": "openai/qwen3.6-35b", |
| 112 | + "skills": [], |
| 113 | + "override_timeout_sec": null, |
| 114 | + "override_setup_timeout_sec": null, |
| 115 | + "max_timeout_sec": null, |
| 116 | + "extra_allowed_hosts": [], |
| 117 | + "kwargs": { |
| 118 | + "thinking": "off" |
| 119 | + }, |
| 120 | + "env": {}, |
| 121 | + "mcp_servers": [] |
| 122 | + } |
| 123 | + ], |
| 124 | + "datasets": [ |
| 125 | + { |
| 126 | + "path": null, |
| 127 | + "name": "scale-ai/swe-bench-pro", |
| 128 | + "version": null, |
| 129 | + "ref": "sha256:88411d32ff27e53a4c1a7e29f0c2aeba180c8e5d60f221cab5ed56325f33549d", |
| 130 | + "registry_url": null, |
| 131 | + "registry_path": null, |
| 132 | + "overwrite": false, |
| 133 | + "download_dir": null, |
| 134 | + "task_names": [ |
| 135 | + "*ansible*" |
| 136 | + ], |
| 137 | + "exclude_task_names": null, |
| 138 | + "n_tasks": null |
| 139 | + } |
| 140 | + ], |
| 141 | + "tasks": [], |
| 142 | + "artifacts": [], |
| 143 | + "extra_instruction_paths": [] |
| 144 | +} |
| 145 | +``` |
| 146 | + |
| 147 | +## `result.json` |
| 148 | + |
| 149 | +```json |
| 150 | +{ |
| 151 | + "id": "8b7c2223-d7dd-40f0-a639-313349450d65", |
| 152 | + "started_at": "2026-06-01T10:32:18.899562", |
| 153 | + "updated_at": "2026-06-01T15:21:20.134125", |
| 154 | + "finished_at": "2026-06-01T15:21:20.134125", |
| 155 | + "n_total_trials": 96, |
| 156 | + "stats": { |
| 157 | + "n_completed_trials": 96, |
| 158 | + "n_errored_trials": 5, |
| 159 | + "n_running_trials": 0, |
| 160 | + "n_pending_trials": 0, |
| 161 | + "n_cancelled_trials": 0, |
| 162 | + "n_retries": 0, |
| 163 | + "evals": { |
| 164 | + "openclaw__qwen3.6-35b__scale-ai/swe-bench-pro": { |
| 165 | + "n_trials": 96, |
| 166 | + "n_errors": 5, |
| 167 | + "metrics": [ |
| 168 | + { |
| 169 | + "mean": 0.40625 |
| 170 | + } |
| 171 | + ], |
| 172 | + "pass_at_k": {}, |
| 173 | + "reward_stats": { |
| 174 | + "reward": { |
| 175 | + "1.0": [ |
| 176 | + "instance_ansible__ansible-d2f809__G4qYShN", |
| 177 | + "instance_ansible__ansible-f8ef34__3qtEMb7", |
| 178 | + "instance_ansible__ansible-ed6581__929Ky2r", |
| 179 | + "instance_ansible__ansible-a7d2a4__b9NDtPa", |
| 180 | + "instance_ansible__ansible-be59ca__NsEdc3x", |
| 181 | + "instance_ansible__ansible-3b823d__kdMFegq", |
| 182 | + "instance_ansible__ansible-12734f__zaLkv6t", |
| 183 | + "instance_ansible__ansible-a6e671__WDPc9Cf", |
| 184 | + "instance_ansible__ansible-526052__AaDtrAp", |
| 185 | + "instance_ansible__ansible-5c225d__Mswtjwu", |
| 186 | + "instance_ansible__ansible-cd9c4e__8AvpJ8V", |
| 187 | + "instance_ansible__ansible-935528__QS9epqL", |
| 188 | + "instance_ansible__ansible-3db08a__wpAT4v2", |
| 189 | + "instance_ansible__ansible-8127ab__vTXHYXE", |
| 190 | + "instance_ansible__ansible-a02e22__c86b8zV", |
| 191 | + "instance_ansible__ansible-fb144c__vQDb7bP", |
| 192 | + "instance_ansible__ansible-29aea9__wcMGrat", |
| 193 | + "instance_ansible__ansible-489156__SkD3rZG", |
| 194 | + "instance_ansible__ansible-d9f186__PmKWeya", |
| 195 | + "instance_ansible__ansible-0ea40e__qpCVJgc", |
| 196 | + "instance_ansible__ansible-189fcb__HGBtB4j", |
| 197 | + "instance_ansible__ansible-415e08__Mqouv7C", |
| 198 | + "instance_ansible__ansible-b8025a__tAuRGoy", |
| 199 | + "instance_ansible__ansible-d6d225__D5hbFVM", |
| 200 | + "instance_ansible__ansible-748f53__reMnqVw", |
| 201 | + "instance_ansible__ansible-ea04e0__SWYeTyE", |
| 202 | + "instance_ansible__ansible-be2c37__iq9Vzgj", |
| 203 | + "instance_ansible__ansible-b2a289__fBvew5v", |
| 204 | + "instance_ansible__ansible-984216__Nhu6YMP", |
| 205 | + "instance_ansible__ansible-e0c91a__tqN4yis", |
| 206 | + "instance_ansible__ansible-a20a52__8ccWSBa", |
| 207 | + "instance_ansible__ansible-379058__M6VkPmr", |
| 208 | + "instance_ansible__ansible-cb94c0__npmhAXw", |
| 209 | + "instance_ansible__ansible-f327e6__VXWCvYt", |
| 210 | + "instance_ansible__ansible-9142be__hNSQxcD", |
| 211 | + "instance_ansible__ansible-1ee70f__92pt4AJ", |
| 212 | + "instance_ansible__ansible-a26c32__AV8JGER", |
| 213 | + "instance_ansible__ansible-185d41__uaQzcxC", |
| 214 | + "instance_ansible__ansible-395e5e__cqKfWTb" |
| 215 | + ], |
| 216 | + "0.0": [ |
| 217 | + "instance_ansible__ansible-942424__eYpnQsP", |
| 218 | + "instance_ansible__ansible-b748ed__BeVA4D7", |
| 219 | + "instance_ansible__ansible-622a49__tAXjjTJ", |
| 220 | + "instance_ansible__ansible-5d253a__wy3uMxz", |
| 221 | + "instance_ansible__ansible-6cc974__PaXsoY6", |
| 222 | + "instance_ansible__ansible-9a21e2__CWj5eFa", |
| 223 | + "instance_ansible__ansible-cd473d__QanNHyj", |
| 224 | + "instance_ansible__ansible-a1569e__c2L2wmn", |
| 225 | + "instance_ansible__ansible-1b7026__qy5f25G", |
| 226 | + "instance_ansible__ansible-709484__uELUtxe", |
| 227 | + "instance_ansible__ansible-bf98f0__Ag2QcLc", |
| 228 | + "instance_ansible__ansible-e40889__yEf9PDu", |
| 229 | + "instance_ansible__ansible-949c50__zEXDfLm", |
| 230 | + "instance_ansible__ansible-0fd887__r2uGLmZ", |
| 231 | + "instance_ansible__ansible-d62496__LeCoVnY", |
| 232 | + "instance_ansible__ansible-deb54e__zb55HZr", |
| 233 | + "instance_ansible__ansible-d30fc6__JauvnvX", |
| 234 | + "instance_ansible__ansible-1a4644__urrU7qA", |
| 235 | + "instance_ansible__ansible-502270__6ZEvVbn", |
| 236 | + "instance_ansible__ansible-ecea15__DXaWkps", |
| 237 | + "instance_ansible__ansible-40ade1__5eErWDd", |
| 238 | + "instance_ansible__ansible-e22e10__hQe8Hqy", |
| 239 | + "instance_ansible__ansible-c1f2df__hT2coAc", |
| 240 | + "instance_ansible__ansible-11c177__fPhZiy5", |
| 241 | + "instance_ansible__ansible-9759e0__YK3t2jZ", |
| 242 | + "instance_ansible__ansible-f86c58__NCrtTu4", |
| 243 | + "instance_ansible__ansible-3889dd__A4H8JuT", |
| 244 | + "instance_ansible__ansible-5e3696__mR73qLU", |
| 245 | + "instance_ansible__ansible-eea46a__QxMbywo", |
| 246 | + "instance_ansible__ansible-83fb24__D8at8j4", |
| 247 | + "instance_ansible__ansible-83909b__RF2ATPR", |
| 248 | + "instance_ansible__ansible-4c5ce5__Zmc9TbP", |
| 249 | + "instance_ansible__ansible-de01db__WfbSy7J", |
| 250 | + "instance_ansible__ansible-e64c6c__dK2wMj2", |
| 251 | + "instance_ansible__ansible-de5858__aTSLTDa", |
| 252 | + "instance_ansible__ansible-5e88cd__8pTJSNR", |
| 253 | + "instance_ansible__ansible-d58e69__ANFKjCa", |
| 254 | + "instance_ansible__ansible-bec27f__Ck8dJLC", |
| 255 | + "instance_ansible__ansible-164881__JnijTps", |
| 256 | + "instance_ansible__ansible-776587__A3fkLrY", |
| 257 | + "instance_ansible__ansible-e9e600__zA7oaia", |
| 258 | + "instance_ansible__ansible-39bd8b__2Jako4k", |
| 259 | + "instance_ansible__ansible-34db57__HGYhp9G", |
| 260 | + "instance_ansible__ansible-811093__bur7KJC", |
| 261 | + "instance_ansible__ansible-1c06c4__LqTNW3j", |
| 262 | + "instance_ansible__ansible-5f4e33__peRymQY", |
| 263 | + "instance_ansible__ansible-b5e029__XXWvq7B", |
| 264 | + "instance_ansible__ansible-42355d__WbwFCXD", |
| 265 | + "instance_ansible__ansible-b6290e__2qoKQCK", |
| 266 | + "instance_ansible__ansible-d33bed__qvoDfPJ", |
| 267 | + "instance_ansible__ansible-c616e5__9m3Hff7", |
| 268 | + "instance_ansible__ansible-106909__RnMqzS7", |
| 269 | + "instance_ansible__ansible-564009__GLqBgaS", |
| 270 | + "instance_ansible__ansible-7e1a34__FwGgYdb", |
| 271 | + "instance_ansible__ansible-f02a62__pToCw65", |
| 272 | + "instance_ansible__ansible-1bd7dc__krBKcvE", |
| 273 | + "instance_ansible__ansible-d72025__mxPmiJz" |
| 274 | + ] |
| 275 | + } |
| 276 | + }, |
| 277 | + "exception_stats": { |
| 278 | + "NonZeroAgentExitCodeError": [ |
| 279 | + "instance_ansible__ansible-b6290e__2qoKQCK", |
| 280 | + "instance_ansible__ansible-d33bed__qvoDfPJ", |
| 281 | + "instance_ansible__ansible-7e1a34__FwGgYdb", |
| 282 | + "instance_ansible__ansible-f02a62__pToCw65", |
| 283 | + "instance_ansible__ansible-d72025__mxPmiJz" |
| 284 | + ] |
| 285 | + } |
| 286 | + } |
| 287 | + }, |
| 288 | + "n_input_tokens": 0, |
| 289 | + "n_cache_tokens": 0, |
| 290 | + "n_output_tokens": 0, |
| 291 | + "cost_usd": null |
| 292 | + } |
| 293 | +} |
| 294 | +``` |
0 commit comments