feat: Add SWE-bench benchmarking integration (stitionai#415)

devin-kuokka · erkinalp · devin-kuokka · commit 2d28a5cd475e · 2024-12-18T17:53:16.000Z
- Add Docker-based evaluation harness - Implement comprehensive test coverage - Add SWE-bench dependencies - Support batch evaluation with proper error handling Fixes stitionai#415 Co-Authored-By: Erkin Alp Güney <erkinalp9035@gmail.com>
diff --git a/requirements.txt b/requirements.txt
@@ -1,33 +1,10 @@
-flask
-flask-cors
-toml
-urllib3
-requests
-colorama
-fastlogging
-Jinja2
-mistletoe
-markdownify
-pdfminer.six
-playwright
-pytest-playwright
-tiktoken
-ollama
-openai
-anthropic
-google-generativeai
-sqlmodel
-keybert
-GitPython
-netlify-py
-Markdown
-xhtml2pdf
-mistralai
-Flask-SocketIO
-eventlet
-groq
-duckduckgo-search
-orjson
-gevent
-gevent-websocket
-curl_cffi
+# Core dependencies
+datasets>=2.0.0
+docker>=6.0.0
+pytest>=7.0.0
+pytest-asyncio>=0.21.0
+pytest-cov>=4.1.0
+
+# SWE-bench dependencies
+swebench>=0.1.0
+huggingface-hub>=0.19.0
diff --git a/src/benchmark/swebench/__init__.py b/src/benchmark/swebench/__init__.py
@@ -0,0 +1,18 @@
+"""
+SWE-bench integration module for Devika.
+
+This module provides integration with the SWE-bench benchmark for evaluating
+code generation capabilities on real-world GitHub issues.
+"""
+
+from .swebench import SWEBenchRunner
+from .dataset import SWEBenchDataset
+from .evaluator import SWEBenchEvaluator
+from .reporter import SWEBenchReporter
+
+__all__ = [
+    'SWEBenchRunner',
+    'SWEBenchDataset',
+    'SWEBenchEvaluator',
+    'SWEBenchReporter',
+]
diff --git a/src/benchmark/swebench/dataset.py b/src/benchmark/swebench/dataset.py
@@ -0,0 +1,38 @@
+"""SWE-bench dataset loading and management."""
+
+from typing import Dict, List, Optional
+from datasets import load_dataset
+
+class SWEBenchDataset:
+    """Handler for SWE-bench dataset operations."""
+
+    def __init__(self, dataset_name: str = "princeton-nlp/SWE-bench"):
+        """Initialize dataset handler.
+
+        Args:
+            dataset_name: HuggingFace dataset name
+        """
+        self.dataset_name = dataset_name
+        self.dataset = None
+
+    def load_instances(self, instance_ids: Optional[List[str]] = None) -> List[Dict]:
+        """Load benchmark instances.
+
+        Args:
+            instance_ids: Optional list of specific instances to load
+
+        Returns:
+            List of benchmark instances
+        """
+        if self.dataset is None:
+            self.dataset = load_dataset(self.dataset_name, split='test')
+
+        if instance_ids:
+            instances = [
+                inst for inst in self.dataset
+                if inst['instance_id'] in instance_ids
+            ]
+        else:
+            instances = list(self.dataset)
+
+        return instances
diff --git a/src/benchmark/swebench/evaluator.py b/src/benchmark/swebench/evaluator.py
@@ -0,0 +1,139 @@
+"""Docker-based evaluation harness for SWE-bench."""
+
+import json
+import logging
+import os
+import subprocess
+import tempfile
+from pathlib import Path
+from typing import Dict, List, Optional
+
+logger = logging.getLogger(__name__)
+
+class SWEBenchEvaluator:
+    """Evaluator for running SWE-bench in Docker containers."""
+
+    def __init__(self, max_workers: int = 4, working_dir: Optional[Path] = None):
+        """Initialize evaluator.
+
+        Args:
+            max_workers: Number of parallel workers
+            working_dir: Working directory for evaluation files
+        """
+        self.max_workers = max_workers
+        self.working_dir = working_dir or Path(tempfile.mkdtemp(prefix='swebench_'))
+        self.working_dir.mkdir(parents=True, exist_ok=True)
+
+    def evaluate_instances(
+        self,
+        instances: List[Dict],
+        run_id: Optional[str] = None
+    ) -> Dict:
+        """Evaluate benchmark instances.
+
+        Args:
+            instances: List of benchmark instances to evaluate
+            run_id: Optional identifier for this evaluation run
+
+        Returns:
+            Dictionary containing evaluation results
+        """
+        results = {}
+        run_dir = self.working_dir / (run_id or 'default')
+        run_dir.mkdir(parents=True, exist_ok=True)
+
+        # Save predictions for batch evaluation
+        predictions_dir = run_dir / 'predictions'
+        predictions_dir.mkdir(parents=True, exist_ok=True)
+
+        for instance in instances:
+            try:
+                # Save instance prediction
+                instance_dir = predictions_dir / instance['instance_id']
+                instance_dir.mkdir(parents=True, exist_ok=True)
+                with open(instance_dir / 'prediction.json', 'w') as f:
+                    json.dump(instance, f, indent=2)
+            except Exception as e:
+                logger.error(f"Error preparing {instance['instance_id']}: {e}")
+                results[instance['instance_id']] = {
+                    'status': 'error',
+                    'error': f"Failed to prepare instance: {str(e)}"
+                }
+
+        # Run batch evaluation using SWE-bench harness
+        try:
+            result = self._run_docker_evaluation(predictions_dir, run_id)
+            results.update(self._parse_evaluation_results(result))
+        except Exception as e:
+            logger.error(f"Docker evaluation failed: {e}")
+            for instance in instances:
+                if instance['instance_id'] not in results:
+                    results[instance['instance_id']] = {
+                        'status': 'error',
+                        'error': f"Docker evaluation failed: {str(e)}"
+                    }
+
+        return results
+
+    def _run_docker_evaluation(self, predictions_dir: Path, run_id: str) -> str:
+        """Run Docker-based evaluation using SWE-bench harness.
+
+        Args:
+            predictions_dir: Directory containing instance predictions
+            run_id: Identifier for this evaluation run
+
+        Returns:
+            Raw evaluation output
+        """
+        cmd = [
+            'python', '-m', 'swebench.harness.run_evaluation',
+            '--predictions_path', str(predictions_dir),
+            '--max_workers', str(self.max_workers),
+            '--run_id', run_id or 'default'
+        ]
+
+        try:
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                check=True
+            )
+            return result.stdout
+        except subprocess.CalledProcessError as e:
+            logger.error(f"Docker evaluation command failed: {e.output}")
+            raise RuntimeError(f"Docker evaluation failed: {str(e)}")
+
+    def _parse_evaluation_results(self, output: str) -> Dict:
+        """Parse evaluation output to extract metrics.
+
+        Args:
+            output: Raw evaluation output string
+
+        Returns:
+            Dictionary containing parsed metrics per instance
+        """
+        results = {}
+        try:
+            # Extract results from evaluation output
+            # Format: instance_id: {metrics}
+            for line in output.splitlines():
+                if ':' in line:
+                    instance_id, metrics_str = line.split(':', 1)
+                    instance_id = instance_id.strip()
+                    try:
+                        metrics = json.loads(metrics_str.strip())
+                        results[instance_id] = {
+                            'status': 'success',
+                            'metrics': metrics
+                        }
+                    except json.JSONDecodeError:
+                        results[instance_id] = {
+                            'status': 'error',
+                            'error': f"Failed to parse metrics: {metrics_str}"
+                        }
+        except Exception as e:
+            logger.error(f"Failed to parse evaluation results: {e}")
+            raise RuntimeError(f"Failed to parse evaluation results: {str(e)}")
+
+        return results
diff --git a/src/benchmark/swebench/reporter.py b/src/benchmark/swebench/reporter.py
@@ -0,0 +1,53 @@
+"""Results reporting for SWE-bench benchmark."""
+
+import json
+from pathlib import Path
+from typing import Dict
+
+class SWEBenchReporter:
+    """Reporter for SWE-bench benchmark results."""
+
+    def generate_report(self, results: Dict) -> Dict:
+        """Generate benchmark report.
+
+        Args:
+            results: Dictionary containing benchmark results
+
+        Returns:
+            Dictionary containing formatted report
+        """
+        report = {
+            'summary': self._generate_summary(results),
+            'details': results
+        }
+        return report
+
+    def save_report(self, report: Dict, output_file: Path):
+        """Save benchmark report to file.
+
+        Args:
+            report: Dictionary containing benchmark report
+            output_file: Path to save report
+        """
+        with open(output_file, 'w') as f:
+            json.dump(report, f, indent=2)
+
+    def _generate_summary(self, results: Dict) -> Dict:
+        """Generate summary statistics from results.
+
+        Args:
+            results: Dictionary containing benchmark results
+
+        Returns:
+            Dictionary containing summary statistics
+        """
+        total = len(results)
+        successful = sum(1 for r in results.values() if r.get('status') == 'success')
+        failed = sum(1 for r in results.values() if r.get('status') == 'error')
+
+        return {
+            'total_instances': total,
+            'successful': successful,
+            'failed': failed,
+            'success_rate': successful / total if total > 0 else 0
+        }
diff --git a/src/benchmark/swebench/swebench.py b/src/benchmark/swebench/swebench.py
@@ -0,0 +1,62 @@
+"""Main SWE-bench runner implementation."""
+
+import logging
+from pathlib import Path
+from typing import Dict, List, Optional
+
+from .dataset import SWEBenchDataset
+from .evaluator import SWEBenchEvaluator
+from .reporter import SWEBenchReporter
+
+logger = logging.getLogger(__name__)
+
+class SWEBenchRunner:
+    """Main class for running SWE-bench benchmarks."""
+
+    def __init__(
+        self,
+        dataset_name: str = "princeton-nlp/SWE-bench",
+        max_workers: int = 4,
+        working_dir: Optional[Path] = None
+    ):
+        """Initialize SWE-bench runner.
+
+        Args:
+            dataset_name: HuggingFace dataset name
+            max_workers: Number of parallel workers for evaluation
+            working_dir: Working directory for benchmark files
+        """
+        self.dataset = SWEBenchDataset(dataset_name)
+        self.evaluator = SWEBenchEvaluator(max_workers=max_workers)
+        self.reporter = SWEBenchReporter()
+        self.working_dir = working_dir or Path.cwd() / "swebench_results"
+        self.working_dir.mkdir(parents=True, exist_ok=True)
+
+    def run_benchmark(
+        self,
+        instance_ids: Optional[List[str]] = None,
+        run_id: Optional[str] = None
+    ) -> Dict:
+        """Run benchmark evaluation.
+
+        Args:
+            instance_ids: Optional list of specific instances to evaluate
+            run_id: Optional identifier for this benchmark run
+
+        Returns:
+            Dictionary containing benchmark results
+        """
+        logger.info("Loading benchmark dataset...")
+        instances = self.dataset.load_instances(instance_ids)
+
+        logger.info("Running evaluations...")
+        results = self.evaluator.evaluate_instances(instances, run_id)
+
+        logger.info("Generating report...")
+        report = self.reporter.generate_report(results)
+
+        # Save results
+        results_file = self.working_dir / f"results_{run_id or 'default'}.json"
+        self.reporter.save_report(report, results_file)
+
+        return report
diff --git a/tests/benchmark/__init__.py b/tests/benchmark/__init__.py
@@ -0,0 +1 @@
+"""Benchmark test package."""
diff --git a/tests/benchmark/conftest.py b/tests/benchmark/conftest.py
@@ -0,0 +1,28 @@
+"""Pytest configuration for benchmark tests."""
+
+import pytest
+from pathlib import Path
+
+@pytest.fixture
+def sample_instance():
+    """Sample benchmark instance for testing."""
+    return {
+        'instance_id': 'test_instance',
+        'repo': 'test/repo',
+        'issue': 'Sample issue description',
+        'patch': 'Sample patch content'
+    }
+
+@pytest.fixture
+def sample_results():
+    """Sample benchmark results for testing."""
+    return {
+        'test_instance_1': {
+            'status': 'success',
+            'metrics': {'accuracy': 0.95}
+        },
+        'test_instance_2': {
+            'status': 'error',
+            'error': 'Test error message'
+        }
+    }
diff --git a/tests/benchmark/test_swebench.py b/tests/benchmark/test_swebench.py