Skip to content

Commit 2d28a5c

Browse files
feat: Add SWE-bench benchmarking integration (stitionai#415)
- Add Docker-based evaluation harness - Implement comprehensive test coverage - Add SWE-bench dependencies - Support batch evaluation with proper error handling Fixes stitionai#415 Co-Authored-By: Erkin Alp Güney <[email protected]>
1 parent 3b98ed3 commit 2d28a5c

File tree

9 files changed

+485
-33
lines changed

9 files changed

+485
-33
lines changed

requirements.txt

+10-33
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,10 @@
1-
flask
2-
flask-cors
3-
toml
4-
urllib3
5-
requests
6-
colorama
7-
fastlogging
8-
Jinja2
9-
mistletoe
10-
markdownify
11-
pdfminer.six
12-
playwright
13-
pytest-playwright
14-
tiktoken
15-
ollama
16-
openai
17-
anthropic
18-
google-generativeai
19-
sqlmodel
20-
keybert
21-
GitPython
22-
netlify-py
23-
Markdown
24-
xhtml2pdf
25-
mistralai
26-
Flask-SocketIO
27-
eventlet
28-
groq
29-
duckduckgo-search
30-
orjson
31-
gevent
32-
gevent-websocket
33-
curl_cffi
1+
# Core dependencies
2+
datasets>=2.0.0
3+
docker>=6.0.0
4+
pytest>=7.0.0
5+
pytest-asyncio>=0.21.0
6+
pytest-cov>=4.1.0
7+
8+
# SWE-bench dependencies
9+
swebench>=0.1.0
10+
huggingface-hub>=0.19.0

src/benchmark/swebench/__init__.py

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
"""
2+
SWE-bench integration module for Devika.
3+
4+
This module provides integration with the SWE-bench benchmark for evaluating
5+
code generation capabilities on real-world GitHub issues.
6+
"""
7+
8+
from .swebench import SWEBenchRunner
9+
from .dataset import SWEBenchDataset
10+
from .evaluator import SWEBenchEvaluator
11+
from .reporter import SWEBenchReporter
12+
13+
__all__ = [
14+
'SWEBenchRunner',
15+
'SWEBenchDataset',
16+
'SWEBenchEvaluator',
17+
'SWEBenchReporter',
18+
]

src/benchmark/swebench/dataset.py

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
"""SWE-bench dataset loading and management."""
2+
3+
from typing import Dict, List, Optional
4+
from datasets import load_dataset
5+
6+
class SWEBenchDataset:
7+
"""Handler for SWE-bench dataset operations."""
8+
9+
def __init__(self, dataset_name: str = "princeton-nlp/SWE-bench"):
10+
"""Initialize dataset handler.
11+
12+
Args:
13+
dataset_name: HuggingFace dataset name
14+
"""
15+
self.dataset_name = dataset_name
16+
self.dataset = None
17+
18+
def load_instances(self, instance_ids: Optional[List[str]] = None) -> List[Dict]:
19+
"""Load benchmark instances.
20+
21+
Args:
22+
instance_ids: Optional list of specific instances to load
23+
24+
Returns:
25+
List of benchmark instances
26+
"""
27+
if self.dataset is None:
28+
self.dataset = load_dataset(self.dataset_name, split='test')
29+
30+
if instance_ids:
31+
instances = [
32+
inst for inst in self.dataset
33+
if inst['instance_id'] in instance_ids
34+
]
35+
else:
36+
instances = list(self.dataset)
37+
38+
return instances

src/benchmark/swebench/evaluator.py

+139
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
"""Docker-based evaluation harness for SWE-bench."""
2+
3+
import json
4+
import logging
5+
import os
6+
import subprocess
7+
import tempfile
8+
from pathlib import Path
9+
from typing import Dict, List, Optional
10+
11+
logger = logging.getLogger(__name__)
12+
13+
class SWEBenchEvaluator:
14+
"""Evaluator for running SWE-bench in Docker containers."""
15+
16+
def __init__(self, max_workers: int = 4, working_dir: Optional[Path] = None):
17+
"""Initialize evaluator.
18+
19+
Args:
20+
max_workers: Number of parallel workers
21+
working_dir: Working directory for evaluation files
22+
"""
23+
self.max_workers = max_workers
24+
self.working_dir = working_dir or Path(tempfile.mkdtemp(prefix='swebench_'))
25+
self.working_dir.mkdir(parents=True, exist_ok=True)
26+
27+
def evaluate_instances(
28+
self,
29+
instances: List[Dict],
30+
run_id: Optional[str] = None
31+
) -> Dict:
32+
"""Evaluate benchmark instances.
33+
34+
Args:
35+
instances: List of benchmark instances to evaluate
36+
run_id: Optional identifier for this evaluation run
37+
38+
Returns:
39+
Dictionary containing evaluation results
40+
"""
41+
results = {}
42+
run_dir = self.working_dir / (run_id or 'default')
43+
run_dir.mkdir(parents=True, exist_ok=True)
44+
45+
# Save predictions for batch evaluation
46+
predictions_dir = run_dir / 'predictions'
47+
predictions_dir.mkdir(parents=True, exist_ok=True)
48+
49+
for instance in instances:
50+
try:
51+
# Save instance prediction
52+
instance_dir = predictions_dir / instance['instance_id']
53+
instance_dir.mkdir(parents=True, exist_ok=True)
54+
with open(instance_dir / 'prediction.json', 'w') as f:
55+
json.dump(instance, f, indent=2)
56+
except Exception as e:
57+
logger.error(f"Error preparing {instance['instance_id']}: {e}")
58+
results[instance['instance_id']] = {
59+
'status': 'error',
60+
'error': f"Failed to prepare instance: {str(e)}"
61+
}
62+
63+
# Run batch evaluation using SWE-bench harness
64+
try:
65+
result = self._run_docker_evaluation(predictions_dir, run_id)
66+
results.update(self._parse_evaluation_results(result))
67+
except Exception as e:
68+
logger.error(f"Docker evaluation failed: {e}")
69+
for instance in instances:
70+
if instance['instance_id'] not in results:
71+
results[instance['instance_id']] = {
72+
'status': 'error',
73+
'error': f"Docker evaluation failed: {str(e)}"
74+
}
75+
76+
return results
77+
78+
def _run_docker_evaluation(self, predictions_dir: Path, run_id: str) -> str:
79+
"""Run Docker-based evaluation using SWE-bench harness.
80+
81+
Args:
82+
predictions_dir: Directory containing instance predictions
83+
run_id: Identifier for this evaluation run
84+
85+
Returns:
86+
Raw evaluation output
87+
"""
88+
cmd = [
89+
'python', '-m', 'swebench.harness.run_evaluation',
90+
'--predictions_path', str(predictions_dir),
91+
'--max_workers', str(self.max_workers),
92+
'--run_id', run_id or 'default'
93+
]
94+
95+
try:
96+
result = subprocess.run(
97+
cmd,
98+
capture_output=True,
99+
text=True,
100+
check=True
101+
)
102+
return result.stdout
103+
except subprocess.CalledProcessError as e:
104+
logger.error(f"Docker evaluation command failed: {e.output}")
105+
raise RuntimeError(f"Docker evaluation failed: {str(e)}")
106+
107+
def _parse_evaluation_results(self, output: str) -> Dict:
108+
"""Parse evaluation output to extract metrics.
109+
110+
Args:
111+
output: Raw evaluation output string
112+
113+
Returns:
114+
Dictionary containing parsed metrics per instance
115+
"""
116+
results = {}
117+
try:
118+
# Extract results from evaluation output
119+
# Format: instance_id: {metrics}
120+
for line in output.splitlines():
121+
if ':' in line:
122+
instance_id, metrics_str = line.split(':', 1)
123+
instance_id = instance_id.strip()
124+
try:
125+
metrics = json.loads(metrics_str.strip())
126+
results[instance_id] = {
127+
'status': 'success',
128+
'metrics': metrics
129+
}
130+
except json.JSONDecodeError:
131+
results[instance_id] = {
132+
'status': 'error',
133+
'error': f"Failed to parse metrics: {metrics_str}"
134+
}
135+
except Exception as e:
136+
logger.error(f"Failed to parse evaluation results: {e}")
137+
raise RuntimeError(f"Failed to parse evaluation results: {str(e)}")
138+
139+
return results

src/benchmark/swebench/reporter.py

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
"""Results reporting for SWE-bench benchmark."""
2+
3+
import json
4+
from pathlib import Path
5+
from typing import Dict
6+
7+
class SWEBenchReporter:
8+
"""Reporter for SWE-bench benchmark results."""
9+
10+
def generate_report(self, results: Dict) -> Dict:
11+
"""Generate benchmark report.
12+
13+
Args:
14+
results: Dictionary containing benchmark results
15+
16+
Returns:
17+
Dictionary containing formatted report
18+
"""
19+
report = {
20+
'summary': self._generate_summary(results),
21+
'details': results
22+
}
23+
return report
24+
25+
def save_report(self, report: Dict, output_file: Path):
26+
"""Save benchmark report to file.
27+
28+
Args:
29+
report: Dictionary containing benchmark report
30+
output_file: Path to save report
31+
"""
32+
with open(output_file, 'w') as f:
33+
json.dump(report, f, indent=2)
34+
35+
def _generate_summary(self, results: Dict) -> Dict:
36+
"""Generate summary statistics from results.
37+
38+
Args:
39+
results: Dictionary containing benchmark results
40+
41+
Returns:
42+
Dictionary containing summary statistics
43+
"""
44+
total = len(results)
45+
successful = sum(1 for r in results.values() if r.get('status') == 'success')
46+
failed = sum(1 for r in results.values() if r.get('status') == 'error')
47+
48+
return {
49+
'total_instances': total,
50+
'successful': successful,
51+
'failed': failed,
52+
'success_rate': successful / total if total > 0 else 0
53+
}

src/benchmark/swebench/swebench.py

+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
"""Main SWE-bench runner implementation."""
2+
3+
import logging
4+
from pathlib import Path
5+
from typing import Dict, List, Optional
6+
7+
from .dataset import SWEBenchDataset
8+
from .evaluator import SWEBenchEvaluator
9+
from .reporter import SWEBenchReporter
10+
11+
logger = logging.getLogger(__name__)
12+
13+
class SWEBenchRunner:
14+
"""Main class for running SWE-bench benchmarks."""
15+
16+
def __init__(
17+
self,
18+
dataset_name: str = "princeton-nlp/SWE-bench",
19+
max_workers: int = 4,
20+
working_dir: Optional[Path] = None
21+
):
22+
"""Initialize SWE-bench runner.
23+
24+
Args:
25+
dataset_name: HuggingFace dataset name
26+
max_workers: Number of parallel workers for evaluation
27+
working_dir: Working directory for benchmark files
28+
"""
29+
self.dataset = SWEBenchDataset(dataset_name)
30+
self.evaluator = SWEBenchEvaluator(max_workers=max_workers)
31+
self.reporter = SWEBenchReporter()
32+
self.working_dir = working_dir or Path.cwd() / "swebench_results"
33+
self.working_dir.mkdir(parents=True, exist_ok=True)
34+
35+
def run_benchmark(
36+
self,
37+
instance_ids: Optional[List[str]] = None,
38+
run_id: Optional[str] = None
39+
) -> Dict:
40+
"""Run benchmark evaluation.
41+
42+
Args:
43+
instance_ids: Optional list of specific instances to evaluate
44+
run_id: Optional identifier for this benchmark run
45+
46+
Returns:
47+
Dictionary containing benchmark results
48+
"""
49+
logger.info("Loading benchmark dataset...")
50+
instances = self.dataset.load_instances(instance_ids)
51+
52+
logger.info("Running evaluations...")
53+
results = self.evaluator.evaluate_instances(instances, run_id)
54+
55+
logger.info("Generating report...")
56+
report = self.reporter.generate_report(results)
57+
58+
# Save results
59+
results_file = self.working_dir / f"results_{run_id or 'default'}.json"
60+
self.reporter.save_report(report, results_file)
61+
62+
return report

tests/benchmark/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""Benchmark test package."""

tests/benchmark/conftest.py

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
"""Pytest configuration for benchmark tests."""
2+
3+
import pytest
4+
from pathlib import Path
5+
6+
@pytest.fixture
7+
def sample_instance():
8+
"""Sample benchmark instance for testing."""
9+
return {
10+
'instance_id': 'test_instance',
11+
'repo': 'test/repo',
12+
'issue': 'Sample issue description',
13+
'patch': 'Sample patch content'
14+
}
15+
16+
@pytest.fixture
17+
def sample_results():
18+
"""Sample benchmark results for testing."""
19+
return {
20+
'test_instance_1': {
21+
'status': 'success',
22+
'metrics': {'accuracy': 0.95}
23+
},
24+
'test_instance_2': {
25+
'status': 'error',
26+
'error': 'Test error message'
27+
}
28+
}

0 commit comments

Comments
 (0)