Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,26 @@ spatialbench-cli --scale-factor 1 --mb-per-file 256 --output-dir sf1-parquet
spatialbench-cli --scale-factor 10 --mb-per-file 256 --output-dir sf10-parquet
```

#### Generate Data Directly to S3

You can generate data directly to Amazon S3 or S3-compatible storage by providing an S3 URI as the output directory:

```bash
# Set AWS credentials
export AWS_ACCESS_KEY_ID="your-access-key"
export AWS_SECRET_ACCESS_KEY="your-secret-key"
export AWS_REGION="us-west-2" # Must match your bucket's region

# Generate to S3
spatialbench-cli --scale-factor 10 --mb-per-file 256 --output-dir s3://my-bucket/spatialbench/sf10

# For S3-compatible services (MinIO, etc.)
export AWS_ENDPOINT="http://localhost:9000"
spatialbench-cli --scale-factor 1 --output-dir s3://my-bucket/data
```

The S3 writer uses a streaming multipart upload, buffering data in 32 MB chunks before uploading each part, so memory usage stays bounded even for large datasets. All output formats (Parquet, CSV, TBL) are supported, and the generated files are byte-for-byte identical to those produced by local generation.

#### Custom Spider Configuration

You can override these defaults at runtime by passing a YAML file via the `--config` flag:
Expand Down
3 changes: 2 additions & 1 deletion spatialbench-cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,11 @@ serde = { version = "1.0.219", features = ["derive"] }
anyhow = "1.0.99"
serde_yaml = "0.9.33"
datafusion = "50.2"
object_store = { version = "0.12.4", features = ["http"] }
object_store = { version = "0.12.4", features = ["http", "aws"] }
arrow-array = "56"
arrow-schema = "56"
url = "2.5.7"
bytes = "1.10.1"

[dev-dependencies]
assert_cmd = "2.0"
Expand Down
21 changes: 14 additions & 7 deletions spatialbench-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ mod output_plan;
mod parquet;
mod plan;
mod runner;
mod s3_writer;
mod spatial_config_file;
mod statistics;
mod tbl;
Expand Down Expand Up @@ -252,8 +253,9 @@ impl Cli {
debug!("Logging configured from environment variables");
}

// Create output directory if it doesn't exist and we are not writing to stdout.
if !self.stdout {
// Create output directory if it doesn't exist and we are not writing to stdout
// or to S3 (where local directories are meaningless).
if !self.stdout && !self.output_dir.to_string_lossy().starts_with("s3://") {
fs::create_dir_all(&self.output_dir)?;
}

Expand Down Expand Up @@ -386,21 +388,26 @@ impl Cli {
}
}

impl IntoSize for BufWriter<Stdout> {
fn into_size(self) -> Result<usize, io::Error> {
// we can't get the size of stdout, so just return 0
impl AsyncFinalize for BufWriter<Stdout> {
async fn finalize(self) -> Result<usize, io::Error> {
Ok(0)
}
}

impl IntoSize for BufWriter<File> {
fn into_size(self) -> Result<usize, io::Error> {
impl AsyncFinalize for BufWriter<File> {
async fn finalize(self) -> Result<usize, io::Error> {
let file = self.into_inner()?;
let metadata = file.metadata()?;
Ok(metadata.len() as usize)
}
}

/// Finalize the streaming S3 writer and report the total bytes written.
///
/// This runs in the async context (outside `spawn_blocking`), so the
/// upload can be `.await`ed without tying up a blocking thread — see
/// the `AsyncFinalize` trait docs for why that matters under
/// concurrent plans.
impl AsyncFinalize for s3_writer::S3Writer {
async fn finalize(self) -> Result<usize, io::Error> {
// `finish()` presumably flushes any buffered data and completes the
// multipart upload, resolving to the byte count — confirm in s3_writer.
self.finish().await
}
}

/// Wrapper around a buffer writer that counts the number of buffers and bytes written
struct WriterSink<W: Write> {
statistics: WriteStatistics,
Expand Down
69 changes: 58 additions & 11 deletions spatialbench-cli/src/output_plan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,33 @@
//! * [`OutputPlanGenerator`]: plans the output files to be generated

use crate::plan::GenerationPlan;
use crate::s3_writer::{build_s3_client, parse_s3_uri};
use crate::{OutputFormat, Table};
use log::debug;
use object_store::ObjectStore;
use parquet::basic::Compression;
use std::collections::HashSet;
use std::fmt::{Display, Formatter};
use std::io;
use std::path::PathBuf;
use std::sync::Arc;

/// Where a partition will be output
#[derive(Debug, Clone, PartialEq)]
#[derive(Debug, Clone)]
pub enum OutputLocation {
/// Output to a file
File(PathBuf),
/// Output to stdout
Stdout,
/// Output to S3 with a shared client
S3 {
/// The full S3 URI for this object (e.g. `s3://bucket/path/to/file.parquet`)
uri: String,
/// The object path within the bucket (e.g. `path/to/file.parquet`)
path: String,
/// Shared S3 client for the bucket
client: Arc<dyn ObjectStore>,
},
}

impl Display for OutputLocation {
Expand All @@ -48,12 +60,13 @@ impl Display for OutputLocation {
write!(f, "{}", file.to_string_lossy())
}
OutputLocation::Stdout => write!(f, "Stdout"),
OutputLocation::S3 { uri, .. } => write!(f, "{}", uri),
}
}
}

/// Describes an output partition (file) that will be generated
#[derive(Debug, Clone, PartialEq)]
#[derive(Debug, Clone)]
pub struct OutputPlan {
/// The table
table: Table,
Expand Down Expand Up @@ -151,6 +164,8 @@ pub struct OutputPlanGenerator {
/// Output directories that have been created so far
/// (used to avoid creating the same directory multiple times)
created_directories: HashSet<PathBuf>,
/// Shared S3 client, lazily created on first S3 output location
s3_client: Option<Arc<dyn ObjectStore>>,
}

impl OutputPlanGenerator {
Expand All @@ -171,6 +186,7 @@ impl OutputPlanGenerator {
output_dir,
output_plans: Vec::new(),
created_directories: HashSet::new(),
s3_client: None,
}
}

Expand Down Expand Up @@ -282,17 +298,48 @@ impl OutputPlanGenerator {
OutputFormat::Parquet => "parquet",
};

let mut output_path = self.output_dir.clone();
if let Some(part) = part {
// If a partition is specified, create a subdirectory for it
output_path.push(table.to_string());
self.ensure_directory_exists(&output_path)?;
output_path.push(format!("{table}.{part}.{extension}"));
// Check if output_dir is an S3 URI
let output_dir_str = self.output_dir.to_string_lossy();
if output_dir_str.starts_with("s3://") {
// Handle S3 path
let base_uri = output_dir_str.trim_end_matches('/');
let s3_uri = if let Some(part) = part {
format!("{base_uri}/{table}/{table}.{part}.{extension}")
} else {
format!("{base_uri}/{table}.{extension}")
};

// Lazily build the S3 client on first use, then reuse it
let client = if let Some(ref client) = self.s3_client {
Arc::clone(client)
} else {
let (bucket, _) = parse_s3_uri(&s3_uri)?;
let client = build_s3_client(&bucket)?;
self.s3_client = Some(Arc::clone(&client));
client
};

let (_, path) = parse_s3_uri(&s3_uri)?;

Ok(OutputLocation::S3 {
uri: s3_uri,
path,
client,
})
} else {
// No partition specified, output to a single file
output_path.push(format!("{table}.{extension}"));
// Handle local filesystem path
let mut output_path = self.output_dir.clone();
if let Some(part) = part {
// If a partition is specified, create a subdirectory for it
output_path.push(table.to_string());
self.ensure_directory_exists(&output_path)?;
output_path.push(format!("{table}.{part}.{extension}"));
} else {
// No partition specified, output to a single file
output_path.push(format!("{table}.{extension}"));
}
Ok(OutputLocation::File(output_path))
}
Ok(OutputLocation::File(output_path))
}
}

Expand Down
31 changes: 22 additions & 9 deletions spatialbench-cli/src/parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,17 @@ use std::io::Write;
use std::sync::Arc;
use tokio::sync::mpsc::{Receiver, Sender};

pub trait IntoSize {
/// Convert the object into a size
fn into_size(self) -> Result<usize, io::Error>;
/// Finalize a writer after all Parquet data has been written.
///
/// This is called from the async context (outside `spawn_blocking`) so
/// that implementations like [`S3Writer`](crate::s3_writer::S3Writer) can
/// `.await` their upload without competing with the tokio runtime for
/// threads — avoiding deadlocks under concurrent plans.
///
/// For local files and stdout the implementation is trivially synchronous.
pub trait AsyncFinalize: Write + Send + 'static {
/// Finalize the writer and return the total bytes written.
fn finalize(self) -> impl std::future::Future<Output = Result<usize, io::Error>> + Send;
}

/// Converts a set of RecordBatchIterators into a Parquet file
Expand All @@ -44,7 +52,7 @@ pub trait IntoSize {
///
/// Note the input is an iterator of [`RecordBatchIterator`]; The batches
/// produced by each iterator is encoded as its own row group.
pub async fn generate_parquet<W: Write + Send + IntoSize + 'static, I>(
pub async fn generate_parquet<W: AsyncFinalize, I>(
writer: W,
iter_iter: I,
num_threads: usize,
Expand Down Expand Up @@ -119,9 +127,8 @@ where
row_group_writer.close().unwrap();
statistics.increment_chunks(1);
}
let size = writer.into_inner()?.into_size()?;
statistics.increment_bytes(size);
Ok(()) as Result<(), io::Error>
let inner = writer.into_inner()?;
Ok((inner, statistics)) as Result<(W, WriteStatistics), io::Error>
});

// now, drive the input stream and send results to the writer task
Expand All @@ -135,8 +142,14 @@ where
// signal the writer task that we are done
drop(tx);

// Wait for the writer task to finish
writer_task.await??;
// Wait for the blocking writer task to return the underlying writer
let (inner, mut statistics) = writer_task.await??;

// Finalize in the async context so S3 uploads can .await without
// competing for tokio runtime threads (prevents deadlock under
// concurrent plans).
let size = inner.finalize().await?;
statistics.increment_bytes(size);

Ok(())
}
Expand Down
12 changes: 11 additions & 1 deletion spatialbench-cli/src/plan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,16 @@ pub struct GenerationPlan {

pub const DEFAULT_PARQUET_ROW_GROUP_BYTES: i64 = 128 * 1024 * 1024;

/// Buffer size, in bytes, used when writing Parquet output (32 MB).
///
/// This single constant sizes both:
/// - the `BufWriter` capacity for local-file and stdout output, and
/// - the part size for S3 multipart uploads.
///
/// 32 MB gives good throughput and is comfortably above the 5 MB
/// minimum part size AWS S3 requires for all but the final part of a
/// multipart upload.
pub const PARQUET_BUFFER_SIZE: usize = 32 * 1024 * 1024;

impl GenerationPlan {
/// Returns a GenerationPlan number of parts to generate
///
Expand Down Expand Up @@ -207,7 +217,7 @@ impl GenerationPlan {
})
}

/// Return the number of part(ititions) this plan will generate
/// Return the number of part(ition)s this plan will generate
pub fn chunk_count(&self) -> usize {
self.part_list.clone().count()
}
Expand Down
17 changes: 15 additions & 2 deletions spatialbench-cli/src/runner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use crate::csv::*;
use crate::generate::{generate_in_chunks, Source};
use crate::output_plan::{OutputLocation, OutputPlan};
use crate::parquet::generate_parquet;
use crate::s3_writer::S3Writer;
use crate::tbl::*;
use crate::{OutputFormat, Table, WriterSink};
use log::{debug, info};
Expand All @@ -32,6 +33,7 @@ use spatialbench_arrow::{
};
use std::io;
use std::io::BufWriter;
use std::sync::Arc;
use tokio::task::{JoinError, JoinSet};

/// Runs multiple [`OutputPlan`]s in parallel, managing the number of threads
Expand Down Expand Up @@ -218,6 +220,12 @@ where
})?;
Ok(())
}
OutputLocation::S3 { uri, path, client } => {
info!("Writing to S3: {}", uri);
let s3_writer = S3Writer::with_client(Arc::clone(client), path);
let sink = WriterSink::new(s3_writer);
generate_in_chunks(sink, sources, num_threads).await
}
}
}

Expand All @@ -228,7 +236,7 @@ where
{
match plan.output_location() {
OutputLocation::Stdout => {
let writer = BufWriter::with_capacity(32 * 1024 * 1024, io::stdout()); // 32MB buffer
let writer = BufWriter::with_capacity(crate::plan::PARQUET_BUFFER_SIZE, io::stdout());
generate_parquet(writer, sources, num_threads, plan.parquet_compression()).await
}
OutputLocation::File(path) => {
Expand All @@ -242,7 +250,7 @@ where
let file = std::fs::File::create(&temp_path).map_err(|err| {
io::Error::other(format!("Failed to create {temp_path:?}: {err}"))
})?;
let writer = BufWriter::with_capacity(32 * 1024 * 1024, file); // 32MB buffer
let writer = BufWriter::with_capacity(crate::plan::PARQUET_BUFFER_SIZE, file);
generate_parquet(writer, sources, num_threads, plan.parquet_compression()).await?;
// rename the temp file to the final path
std::fs::rename(&temp_path, path).map_err(|e| {
Expand All @@ -252,6 +260,11 @@ where
})?;
Ok(())
}
OutputLocation::S3 { uri, path, client } => {
info!("Writing parquet to S3: {}", uri);
let s3_writer = S3Writer::with_client(Arc::clone(client), path);
generate_parquet(s3_writer, sources, num_threads, plan.parquet_compression()).await
}
}
}

Expand Down
Loading