diff --git a/.asf.yaml b/.asf.yaml index a535528..0bbdb43 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -54,4 +54,4 @@ github: # Enable projects for project management boards projects: false # Enable GitHub discussion - discussions: false \ No newline at end of file + discussions: false diff --git a/.github/workflows/build-py-packages.yml b/.github/workflows/build-py-packages.yml index e95b201..4cb0805 100644 --- a/.github/workflows/build-py-packages.yml +++ b/.github/workflows/build-py-packages.yml @@ -132,4 +132,4 @@ jobs: uses: actions/upload-artifact@v4 with: name: wheels-sdist - path: spatialbench-cli/dist \ No newline at end of file + path: spatialbench-cli/dist diff --git a/.github/workflows/packaging.yml b/.github/workflows/packaging.yml index afce8ef..eef3545 100644 --- a/.github/workflows/packaging.yml +++ b/.github/workflows/packaging.yml @@ -105,4 +105,4 @@ jobs: if: success() && github.repository == 'apache/sedona-spatialbench' run: | cd pages-clone - git push \ No newline at end of file + git push diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000..54c0ad8 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,53 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +name: pre-commit + +on: [pull_request] + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} # cancel superseded runs, except on the default branch + +jobs: + pre-commit: + name: Run pre-commit # https://pre-commit.com/ + runs-on: ubuntu-latest + steps: + - name: 'Checkout ${{ github.ref }} ( ${{ github.sha }} )' + uses: actions/checkout@v6 + with: + persist-credentials: false + - uses: actions/setup-python@v6 # https://www.python.org/ + with: + python-version: '3.13' # Version range or exact version of a Python version to use, using SemVer's version range syntax + architecture: 'x64' # optional x64 or x86. Defaults to x64 if not specified + - name: Install dependencies # https://pip.pypa.io/en/stable/ + run: | + python -m pip install --upgrade pip + pip install pre-commit + - name: set PY + run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> "$GITHUB_ENV" + - uses: actions/cache@v5 + with: + path: ~/.cache/pre-commit + key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }} + - name: Run pre-commit + run: pre-commit run --color=always --all-files diff --git a/.github/workflows/spatialbench-cli-publish-pypi.yml b/.github/workflows/spatialbench-cli-publish-pypi.yml index 679da96..52e9ac1 100644 --- a/.github/workflows/spatialbench-cli-publish-pypi.yml +++ b/.github/workflows/spatialbench-cli-publish-pypi.yml @@ -45,4 +45,4 @@ jobs: uses: PyO3/maturin-action@v1 with: command: upload - args: --non-interactive --skip-existing dist/* \ No newline at end of file + args: --non-interactive --skip-existing dist/* diff --git a/.github/workflows/spatialbench-publish-crates.yml b/.github/workflows/spatialbench-publish-crates.yml index fe9221e..0ca8f77 100644 --- a/.github/workflows/spatialbench-publish-crates.yml +++ b/.github/workflows/spatialbench-publish-crates.yml @@ -30,4 +30,4 @@ jobs: 
working-directory: ${{ matrix.package }} run: cargo publish env: - CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }} \ No newline at end of file + CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }} diff --git a/.gitignore b/.gitignore index 0041ccf..3c269b4 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,4 @@ __old/ Cargo.lock .idea .venv/ -site/ \ No newline at end of file +site/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..02becc2 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,91 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +--- +repos: + - repo: meta # hooks built into pre-commit itself + hooks: + - id: identity + name: run identity check + description: print all file names passed to the hook (useful for debugging) + - id: check-hooks-apply + name: run check hooks apply + description: check that all the hooks apply to the repository + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: check-ast + name: run check-ast + description: check Python files for syntax errors + - id: check-builtin-literals + name: run check-builtin-literals + description: check Python files for proper use of built-in literals + - id: check-case-conflict + name: run check-case-conflict + description: check for case conflicts in file names + - id: check-docstring-first + name: run check-docstring-first + description: check that no code is placed before the docstring + - id: check-executables-have-shebangs + name: run check-executables-have-shebangs + description: check that executable scripts have shebang lines + - id: check-illegal-windows-names + name: run check-illegal-windows-names + description: check for Windows-illegal file names + - id: check-json + name: run check-json + description: check JSON files for syntax errors + - id: check-merge-conflict + name: run check-merge-conflict + description: check for merge conflict markers + - id: check-shebang-scripts-are-executable + name: run check-shebang-scripts-are-executable + description: check that scripts with shebangs are executable + - id: check-toml + name: run check-toml + description: check TOML files for syntax errors + - id: check-vcs-permalinks + name: run check-vcs-permalinks + description: ensures that links to vcs websites are permalinks + - id: check-xml + name: run check-xml + description: attempts to load all xml files to verify syntax + - id: check-yaml + name: run check-yaml + description: attempts to load all yaml files to verify syntax + - id: debug-statements + name: run debug-statements + description: check for debugger imports and py37+ `breakpoint()` calls 
in python source. + - id: destroyed-symlinks + name: run destroyed-symlinks + description: detects symlinks which are changed to regular files with a content of a path which that symlink was pointing to + - id: detect-aws-credentials + name: run detect-aws-credentials + description: checks for the existence of AWS secrets that you have set up with the AWS CLI + args: [--allow-missing-credentials] + - id: detect-private-key + name: run detect-private-key + description: checks for the existence of private keys + - id: end-of-file-fixer + name: run end-of-file-fixer + description: makes sure files end in a newline and only a newline + - id: fix-byte-order-marker + name: run fix-byte-order-marker + description: removes UTF-8 byte order marker + - id: forbid-submodules + name: run forbid-submodules + description: forbids any submodules in the repository diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b5b1b28..7c85c85 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,4 +19,4 @@ # CONTRIBUTING.md -See the [contributors-guide.md](docs/contributors-guide.md) \ No newline at end of file +See the [contributors-guide.md](docs/contributors-guide.md) diff --git a/LICENSE b/LICENSE index f49a4e1..261eeb9 100644 --- a/LICENSE +++ b/LICENSE @@ -198,4 +198,4 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. \ No newline at end of file + limitations under the License. 
diff --git a/benchmark/run_benchmark.py b/benchmark/run_benchmark.py old mode 100644 new mode 100755 index fca3c8e..e05e237 --- a/benchmark/run_benchmark.py +++ b/benchmark/run_benchmark.py @@ -65,7 +65,7 @@ class BenchmarkSuite: total_time: float = 0.0 timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat()) version: str = "unknown" - + def to_dict(self) -> dict[str, Any]: return { "engine": self.engine, @@ -99,7 +99,7 @@ def _run_query_in_process( query_sql: str | None, ): """Worker function to run a query in a separate process. - + This allows us to forcefully terminate queries that hang or consume too much memory, which SIGALRM cannot do for native code. """ @@ -107,7 +107,7 @@ def _run_query_in_process( # For Spatial Polars, ensure the package is imported first to register namespace if engine_class.__name__ == "SpatialPolarsBenchmark": import spatial_polars as _sp # noqa: F401 - + benchmark = engine_class(data_paths) benchmark.setup() try: @@ -133,17 +133,17 @@ def _run_query_in_process( def get_data_paths(data_dir: str) -> dict[str, str]: """Get paths to all data tables. - + Supports two data formats: 1. Directory format: table_name/*.parquet (e.g., building/building.1.parquet) 2. Single file format: table_name.parquet (e.g., building.parquet) - + Returns directory paths for directories containing parquet files. Both DuckDB, pandas, and SedonaDB can read all parquet files from a directory. 
""" data_path = Path(data_dir) paths = {} - + for table in TABLES: table_path = data_path / table # Check for directory format first (from HF: building/building.1.parquet) @@ -163,32 +163,32 @@ def get_data_paths(data_dir: str) -> dict[str, str]: matches = list(data_path.glob(f"{table}*.parquet")) if matches: paths[table] = str(matches[0]) - + return paths class BaseBenchmark(ABC): """Base class for benchmark runners.""" - + def __init__(self, data_paths: dict[str, str], engine_name: str): self.data_paths = data_paths self.engine_name = engine_name - + @abstractmethod def setup(self) -> None: """Initialize the benchmark environment.""" pass - + @abstractmethod def teardown(self) -> None: """Cleanup the benchmark environment.""" pass - + @abstractmethod def execute_query(self, query_name: str, query: str | None) -> tuple[int, Any]: """Execute a query and return (row_count, result).""" pass - + def run_query(self, query_name: str, query: str | None = None, timeout: int = 1200) -> BenchmarkResult: """Run a single query with timeout handling.""" start_time = time.perf_counter() @@ -238,11 +238,11 @@ def run_query(self, query_name: str, query: str | None = None, timeout: int = 12 class DuckDBBenchmark(BaseBenchmark): """DuckDB benchmark runner.""" - + def __init__(self, data_paths: dict[str, str]): super().__init__(data_paths, "duckdb") self._conn = None - + def setup(self) -> None: import duckdb self._conn = duckdb.connect() @@ -254,12 +254,12 @@ def setup(self) -> None: if Path(path).is_dir(): parquet_path = str(Path(path) / "*.parquet") self._conn.execute(f"CREATE VIEW {table} AS SELECT * FROM read_parquet('{parquet_path}')") - + def teardown(self) -> None: if self._conn: self._conn.close() self._conn = None - + def execute_query(self, query_name: str, query: str | None) -> tuple[int, Any]: result = self._conn.execute(query).fetchall() return len(result), result @@ -267,11 +267,11 @@ def execute_query(self, query_name: str, query: str | None) -> tuple[int, Any]: 
class GeoPandasBenchmark(BaseBenchmark): """GeoPandas benchmark runner.""" - + def __init__(self, data_paths: dict[str, str]): super().__init__(data_paths, "geopandas") self._queries = None - + def setup(self) -> None: import importlib.util geopandas_path = Path(__file__).parent.parent / "spatialbench-queries" / "geopandas_queries.py" @@ -279,10 +279,10 @@ def setup(self) -> None: module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) self._queries = {f"q{i}": getattr(module, f"q{i}") for i in range(1, QUERY_COUNT + 1)} - + def teardown(self) -> None: self._queries = None - + def execute_query(self, query_name: str, query: str | None) -> tuple[int, Any]: if query_name not in self._queries: raise ValueError(f"Query {query_name} not found") @@ -292,11 +292,11 @@ def execute_query(self, query_name: str, query: str | None) -> tuple[int, Any]: class SedonaDBBenchmark(BaseBenchmark): """SedonaDB benchmark runner.""" - + def __init__(self, data_paths: dict[str, str]): super().__init__(data_paths, "sedonadb") self._sedona = None - + def setup(self) -> None: import sedonadb self._sedona = sedonadb.connect() @@ -306,10 +306,10 @@ def setup(self) -> None: if Path(path).is_dir(): parquet_path = str(Path(path) / "*.parquet") self._sedona.read_parquet(parquet_path).to_view(table, overwrite=True) - + def teardown(self) -> None: self._sedona = None - + def execute_query(self, query_name: str, query: str | None) -> tuple[int, Any]: result = self._sedona.sql(query).to_pandas() return len(result), result @@ -317,15 +317,15 @@ def execute_query(self, query_name: str, query: str | None) -> tuple[int, Any]: class SpatialPolarsBenchmark(BaseBenchmark): """Spatial Polars benchmark runner.""" - + def __init__(self, data_paths: dict[str, str]): super().__init__(data_paths, "spatial_polars") self._queries = None - + def setup(self) -> None: # spatial_polars package is already imported in _run_query_in_process # to register .spatial namespace before any module loading 
- + # Load query functions directly from the module import importlib.util query_file = Path(__file__).parent.parent / "spatialbench-queries" / "spatial_polars.py" @@ -333,10 +333,10 @@ def setup(self) -> None: module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) self._queries = {f"q{i}": getattr(module, f"q{i}") for i in range(1, QUERY_COUNT + 1)} - + def teardown(self) -> None: self._queries = None - + def execute_query(self, query_name: str, query: str | None) -> tuple[int, Any]: if query_name not in self._queries: raise ValueError(f"Query {query_name} not found") @@ -347,7 +347,7 @@ def execute_query(self, query_name: str, query: str | None) -> tuple[int, Any]: def get_sql_queries(dialect: str) -> dict[str, str]: """Get SQL queries for a specific dialect from print_queries.py.""" from print_queries import DuckDBSpatialBenchBenchmark, SedonaDBSpatialBenchBenchmark - + dialects = { "duckdb": DuckDBSpatialBenchBenchmark, "sedonadb": SedonaDBSpatialBenchBenchmark, @@ -364,7 +364,7 @@ def run_query_isolated( timeout: int, ) -> BenchmarkResult: """Run a single query in an isolated subprocess with hard timeout. - + This is more robust than SIGALRM because: 1. Native code (C++/Rust) can be forcefully terminated 2. 
Memory-hungry queries don't affect the main process @@ -375,20 +375,20 @@ def run_query_isolated( target=_run_query_in_process, args=(result_queue, engine_class, data_paths, query_name, query_sql), ) - + process.start() process.join(timeout=timeout) - + if process.is_alive(): # Query exceeded timeout - forcefully terminate process.terminate() process.join(timeout=5) # Give it 5 seconds to terminate gracefully - + if process.is_alive(): # Still alive - kill it process.kill() process.join(timeout=2) - + return BenchmarkResult( query=query_name, engine=engine_name, @@ -397,7 +397,7 @@ def run_query_isolated( status="timeout", error_message=f"Query {query_name} timed out after {timeout} seconds (process killed)", ) - + # Process completed - get result from queue try: result_data = result_queue.get_nowait() @@ -430,18 +430,18 @@ def run_benchmark( runs: int = 3, ) -> BenchmarkSuite: """Generic benchmark runner for any engine. - + Each query runs in an isolated subprocess to ensure: - Hard timeout enforcement (process can be killed) - Memory isolation (one query can't OOM the runner) - Crash isolation (one query crash doesn't affect others) - + If runs > 1 and the first run succeeds, additional runs are performed and the average time is reported for fair comparison. 
""" - + from importlib.metadata import version as pkg_version - + # Engine configurations configs = { "duckdb": { @@ -465,30 +465,30 @@ def run_benchmark( "queries_getter": lambda: {f"q{i}": None for i in range(1, QUERY_COUNT + 1)}, }, } - + config = configs[engine] version = config["version_getter"]() - + # Format engine name for display display_name = engine.replace("_", " ").title() - + print(f"\n{'=' * 60}") print(f"Running {display_name} Benchmark") print(f"{'=' * 60}") print(f"{display_name} version: {version}") if runs > 1: print(f"Runs per query: {runs} (average will be reported)") - + suite = BenchmarkSuite(engine=engine, scale_factor=scale_factor, version=version) all_queries = config["queries_getter"]() engine_class = config["class"] - + for query_name, query_sql in all_queries.items(): if queries and query_name not in queries: continue - + print(f" Running {query_name}...", end=" ", flush=True) - + # First run result = run_query_isolated( engine_class=engine_class, @@ -498,11 +498,11 @@ def run_benchmark( query_sql=query_sql, timeout=timeout, ) - + # If first run succeeded and we want multiple runs, do additional runs if result.status == "success" and runs > 1: run_times = [result.time_seconds] - + for run_num in range(2, runs + 1): additional_result = run_query_isolated( engine_class=engine_class, @@ -517,7 +517,7 @@ def run_benchmark( else: # If any subsequent run fails, just use successful runs break - + # Calculate average of all successful runs avg_time = round(sum(run_times) / len(run_times), 2) result = BenchmarkResult( @@ -533,11 +533,11 @@ def run_benchmark( print(f"{result.time_seconds}s ({result.row_count} rows)") else: print(f"{result.status.upper()}: {result.error_message}") - + suite.results.append(result) if result.status == "success": suite.total_time += result.time_seconds - + return suite @@ -546,12 +546,12 @@ def print_summary(results: list[BenchmarkSuite]) -> None: print(f"\n{'=' * 80}") print("BENCHMARK SUMMARY") print("=" * 80) - + 
all_queries = sorted( {r.query for suite in results for r in suite.results}, key=lambda x: int(x[1:]) ) - + data = { suite.engine: { r.query: f"{r.time_seconds:.2f}s" if r.status == "success" else r.status.upper() @@ -559,16 +559,16 @@ def print_summary(results: list[BenchmarkSuite]) -> None: } for suite in results } - + engines = [s.engine for s in results] header = f"{'Query':<10}" + "".join(f"{e:<15}" for e in engines) print(header) print("-" * len(header)) - + for query in all_queries: row = f"{query:<10}" + "".join(f"{data.get(e, {}).get(query, 'N/A'):<15}" for e in engines) print(row) - + print("-" * len(header)) print(f"{'Total':<10}" + "".join(f"{s.total_time:.2f}s{'':<9}" for s in results)) @@ -581,10 +581,10 @@ def save_results(results: list[BenchmarkSuite], output_file: str) -> None: "generated_at": datetime.now(timezone.utc).isoformat(), "results": [suite.to_dict() for suite in results], } - + with open(output_file, "w") as f: json.dump(output, f, indent=2) - + print(f"\nResults saved to {output_file}") @@ -606,33 +606,33 @@ def main(): help="Output file for results") parser.add_argument("--scale-factor", type=float, default=1, help="Scale factor of the data (for reporting only)") - + args = parser.parse_args() - + engines = [e.strip().lower() for e in args.engines.split(",")] valid_engines = {"duckdb", "geopandas", "sedonadb", "spatial_polars"} - + for e in engines: if e not in valid_engines: print(f"Error: Unknown engine '{e}'. 
Valid options: {valid_engines}") sys.exit(1) - + queries = [q.strip().lower() for q in args.queries.split(",")] if args.queries else None - + data_paths = get_data_paths(args.data_dir) if not data_paths: print(f"Error: No data files found in {args.data_dir}") sys.exit(1) - + print("Data paths:") for table, path in data_paths.items(): print(f" {table}: {path}") - + results = [ run_benchmark(engine, data_paths, queries, args.timeout, args.scale_factor, args.runs) for engine in engines ] - + print_summary(results) save_results(results, args.output) diff --git a/benchmark/summarize_results.py b/benchmark/summarize_results.py old mode 100644 new mode 100755 index 5c08707..a52fcc8 --- a/benchmark/summarize_results.py +++ b/benchmark/summarize_results.py @@ -30,14 +30,14 @@ def load_results(results_dir: str) -> dict: """Load all JSON result files from a directory.""" results = {} results_path = Path(results_dir) - + for json_file in results_path.glob("*_results.json"): with open(json_file) as f: data = json.load(f) for suite in data.get("results", []): engine = suite["engine"] results[engine] = suite - + return results @@ -57,7 +57,7 @@ def get_winner(query: str, data: dict, engines: list) -> str | None: result = data.get(engine, {}).get(query, {}) if result.get("status") == "success" and result.get("time_seconds") is not None: times[engine] = result["time_seconds"] - + if not times: return None return min(times, key=times.get) @@ -66,34 +66,34 @@ def get_winner(query: str, data: dict, engines: list) -> str | None: def generate_markdown_summary(results: dict, output_file: str, query_timeout: int | None = None, runs: int | None = None) -> str: """Generate a markdown summary of benchmark results for GitHub Actions.""" engines = sorted(results.keys()) - + if not engines: markdown = "# šŸ“Š SpatialBench Benchmark Results\n\nāš ļø No results found." 
with open(output_file, "w") as f: f.write(markdown) return markdown - + # Get scale factor from first result scale_factor = results[engines[0]].get("scale_factor", 1) timestamp = results[engines[0]].get("timestamp", datetime.now(timezone.utc).isoformat()) - + # Collect all queries all_queries = set() for engine_data in results.values(): for r in engine_data.get("results", []): all_queries.add(r["query"]) all_queries = sorted(all_queries, key=lambda x: int(x[1:])) - + # Build result lookup data = {} for engine, engine_data in results.items(): data[engine] = {} for r in engine_data.get("results", []): data[engine][r["query"]] = r - + # Get version info versions = {engine: results[engine].get("version", "unknown") for engine in engines} - + # Engine display names with icons engine_icons = { "sedonadb": "🌵 SedonaDB", @@ -101,7 +101,7 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in "geopandas": "🐼 GeoPandas", "spatial_polars": "šŸ»ā€ā„ļø Spatial Polars", } - + # Generate markdown lines = [ "# šŸ“Š SpatialBench Benchmark Results", @@ -119,11 +119,11 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in "| Engine | Version |", "|--------|---------|", ] - + for engine in engines: icon_name = engine_icons.get(engine, engine.title()) lines.append(f"| {icon_name} | `{versions[engine]}` |") - + # Main results table lines.extend([ "", @@ -132,7 +132,7 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in "| Query | " + " | ".join(engine_icons.get(e, e.title()) for e in engines) + " |", "|:------|" + "|".join(":---:" for _ in engines) + "|", ]) - + # Add rows for each query with winner highlighting for query in all_queries: winner = get_winner(query, data, engines) @@ -154,14 +154,14 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in else: row += " — |" lines.append(row) - + # Win count summary win_counts = {engine: 0 for engine in engines} for 
query in all_queries: winner = get_winner(query, data, engines) if winner: win_counts[winner] += 1 - + lines.extend([ "", "## šŸ„‡ Performance Summary", @@ -169,19 +169,19 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in "| Engine | Wins |", "|--------|:----:|", ]) - + for engine in sorted(engines, key=lambda e: win_counts[e], reverse=True): icon_name = engine_icons.get(engine, engine.title()) wins = win_counts[engine] lines.append(f"| {icon_name} | {wins} |") - + # Detailed results section (collapsible) lines.extend([ "", "## šŸ“‹ Detailed Results", "", ]) - + for engine in engines: icon_name = engine_icons.get(engine, engine.title()) lines.extend([ @@ -191,32 +191,32 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in "| Query | Time | Status | Rows |", "|:------|-----:|:------:|-----:|", ]) - + for query in all_queries: result = data.get(engine, {}).get(query, {}) time_str = format_time(result.get("time_seconds")) status = result.get("status", "N/A") rows = result.get("row_count") row_str = f"{rows:,}" if rows is not None else "—" - + status_emoji = { "success": "āœ…", "error": "āŒ", "timeout": "ā±ļø", }.get(status, "ā“") - + lines.append(f"| {query.upper()} | {time_str} | {status_emoji} | {row_str} |") - + lines.extend([ "", "", "", ]) - + # Add error details if any has_errors = False error_lines = ["## āš ļø Errors and Timeouts", ""] - + for engine in engines: engine_errors = [] for query in all_queries: @@ -227,7 +227,7 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in if len(error_msg) > 200: error_msg = error_msg[:200] + "..." 
engine_errors.append(f"- **{query.upper()}**: `{error_msg}`") - + if engine_errors: has_errors = True icon_name = engine_icons.get(engine, engine.title()) @@ -235,10 +235,10 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in error_lines.append("") error_lines.extend(engine_errors) error_lines.append("") - + if has_errors: lines.extend(error_lines) - + # Footer lines.extend([ "---", @@ -251,13 +251,13 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in "", f"*Generated by [SpatialBench](https://github.com/apache/sedona-spatialbench) on {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}*", ]) - + markdown = "\n".join(lines) - + # Write to file with open(output_file, "w") as f: f.write(markdown) - + return markdown @@ -289,18 +289,18 @@ def main(): default=3, help="Number of runs per query (for reporting)", ) - + args = parser.parse_args() - + results = load_results(args.results_dir) - + if not results: print(f"No results found in {args.results_dir}") # Write empty summary with open(args.output, "w") as f: f.write("# SpatialBench Benchmark Results\n\nNo results found.") return - + markdown = generate_markdown_summary(results, args.output, args.timeout, args.runs) print(f"Summary written to {args.output}") print("\nPreview:") diff --git a/dev/release/README.md b/dev/release/README.md index 01db51e..7e3b58e 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -202,4 +202,4 @@ To publish, run this command: ```shell cargo publish -``` \ No newline at end of file +``` diff --git a/dev/release/check-rat-report.py b/dev/release/check-rat-report.py index b9fd85a..88c900c 100644 --- a/dev/release/check-rat-report.py +++ b/dev/release/check-rat-report.py @@ -56,4 +56,3 @@ print("OK") sys.exit(0) - diff --git a/dev/release/run-rat.sh b/dev/release/run-rat.sh index d5230c7..4f7d791 100755 --- a/dev/release/run-rat.sh +++ b/dev/release/run-rat.sh @@ -42,5 +42,3 @@ else echo "${UNAPPROVED} 
unapproved licences. Check rat report: rat.txt" exit 1 fi - - diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 4589891..1c49380 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -279,4 +279,3 @@ TEST_SUCCESS=yes echo "Release candidate ${VERSION}-RC${RC_NUMBER} looks good!" exit 0 - diff --git a/docs/contributors-guide.md b/docs/contributors-guide.md index 5f33ccf..df8d609 100644 --- a/docs/contributors-guide.md +++ b/docs/contributors-guide.md @@ -175,4 +175,4 @@ To contribute to the SpatialBench documentation: * `mkdocs serve` - Start the live-reloading docs server. * `mkdocs build` - Build the documentation site. * `mkdocs -h` - Print help message and exit. -5. Push your changes and open a pull request. \ No newline at end of file +5. Push your changes and open a pull request. diff --git a/docs/javascripts/katex.js b/docs/javascripts/katex.js index 5abd19e..ec73954 100644 --- a/docs/javascripts/katex.js +++ b/docs/javascripts/katex.js @@ -18,4 +18,4 @@ if (window.document$) { document$.subscribe(renderAll); } -})(); \ No newline at end of file +})(); diff --git a/docs/queries.md b/docs/queries.md index 7cc4741..5edfd4a 100644 --- a/docs/queries.md +++ b/docs/queries.md @@ -609,4 +609,3 @@ ORDER BY t.t_tripkey ASC, distance_to_building ASC, b.b_buildingkey ASC ā”œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā”¼ā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā”¼ā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā”¼ā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā”¼ā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā•Œā”¤ │ 1 ┆ 01010000009f3c318dd43735405930… ┆ 8384 ┆ lavender ┆ 1.4195012994942622 │ 
ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ - diff --git a/docs/requirements.txt b/docs/requirements.txt index e90aba0..01da27b 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -12,7 +12,3 @@ notebook nbconvert pyproj ruff - - - - diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css index 644bdc4..32a30f6 100644 --- a/docs/stylesheets/extra.css +++ b/docs/stylesheets/extra.css @@ -105,4 +105,4 @@ /* Change the mobile nav header label text to white */ label.md-nav__title { color: #FFFFFF !important; -} \ No newline at end of file +} diff --git a/raster/generator.py b/raster/generator.py index 0e5d70b..5c11ab2 100644 --- a/raster/generator.py +++ b/raster/generator.py @@ -85,4 +85,4 @@ def create_multiband_landsat_like_cog( print(f"Saved: {filename}") # Example usage: -create_multiband_landsat_like_cog("output/synthetic_landsat_multiband.tif") \ No newline at end of file +create_multiband_landsat_like_cog("output/synthetic_landsat_multiband.tif") diff --git a/spatialbench-arrow/README.md b/spatialbench-arrow/README.md index 95147d8..0d902de 100644 --- a/spatialbench-arrow/README.md +++ b/spatialbench-arrow/README.md @@ -39,4 +39,4 @@ This crate ensures correct results using two methods. Please see [CONTRIBUTING.md] for more information on how to contribute to this project. 
-[CONTRIBUTING.md]: https://github.com/apache/sedona-spatialbench/blob/main/CONTRIBUTING.md \ No newline at end of file +[CONTRIBUTING.md]: https://github.com/apache/sedona-spatialbench/blob/main/CONTRIBUTING.md diff --git a/spatialbench-config.yml b/spatialbench-config.yml index 86ce5a8..bc93bae 100644 --- a/spatialbench-config.yml +++ b/spatialbench-config.yml @@ -42,4 +42,4 @@ building: pareto_alpha_city: 1.20 pareto_xm_city: 1.0 pareto_alpha_sub: 1.00 - pareto_xm_sub: 1.0 \ No newline at end of file + pareto_xm_sub: 1.0 diff --git a/spatialbench-queries/print_queries.py b/spatialbench-queries/print_queries.py index 6d4777f..90573f6 100755 --- a/spatialbench-queries/print_queries.py +++ b/spatialbench-queries/print_queries.py @@ -451,4 +451,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/spatialbench/data/README.md b/spatialbench/data/README.md index 10306f1..7bd938f 100644 --- a/spatialbench/data/README.md +++ b/spatialbench/data/README.md @@ -132,4 +132,4 @@ And then compare with `diff` ```shell diff -du /tmp/customer.c.tbl /tmp/customer.java.tbl -``` \ No newline at end of file +```