Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,26 @@ spatialbench-cli --scale-factor 1 --mb-per-file 256 --output-dir sf1-parquet
spatialbench-cli --scale-factor 10 --mb-per-file 256 --output-dir sf10-parquet
```

#### Generate Data Directly to S3

You can generate data directly to Amazon S3 or S3-compatible storage by providing an S3 URI as the output directory:

```bash
# Set AWS credentials
export AWS_ACCESS_KEY_ID="your-access-key"
export AWS_SECRET_ACCESS_KEY="your-secret-key"
export AWS_REGION="us-west-2" # Must match your bucket's region

# Generate to S3
spatialbench-cli --scale-factor 10 --mb-per-file 256 --output-dir s3://my-bucket/spatialbench/sf10

# For S3-compatible services (MinIO, etc.)
export AWS_ENDPOINT="http://localhost:9000"
spatialbench-cli --scale-factor 1 --output-dir s3://my-bucket/data
```

The S3 writer uses a streaming multipart upload, buffering data in 32 MB chunks before uploading each part, so memory usage stays bounded even for large datasets. All output formats (Parquet, CSV, TBL) are supported, and the generated files are byte-for-byte identical to those produced by local generation.

#### Custom Spider Configuration

You can override these defaults at runtime by passing a YAML file via the `--config` flag:
Expand Down
3 changes: 2 additions & 1 deletion spatialbench-cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,11 @@ serde = { version = "1.0.219", features = ["derive"] }
anyhow = "1.0.99"
serde_yaml = "0.9.33"
datafusion = "50.2"
object_store = { version = "0.12.4", features = ["http"] }
object_store = { version = "0.12.4", features = ["http", "aws"] }
arrow-array = "56"
arrow-schema = "56"
url = "2.5.7"
bytes = "1.10.1"

[dev-dependencies]
assert_cmd = "2.0"
Expand Down
21 changes: 14 additions & 7 deletions spatialbench-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ mod output_plan;
mod parquet;
mod plan;
mod runner;
mod s3_writer;
mod spatial_config_file;
mod statistics;
mod tbl;
Expand Down Expand Up @@ -252,8 +253,9 @@ impl Cli {
debug!("Logging configured from environment variables");
}

// Create output directory if it doesn't exist and we are not writing to stdout.
if !self.stdout {
// Create output directory if it doesn't exist and we are not writing to stdout
// or to S3 (where local directories are meaningless).
if !self.stdout && !self.output_dir.to_string_lossy().starts_with("s3://") {
fs::create_dir_all(&self.output_dir)?;
}

Expand Down Expand Up @@ -386,21 +388,26 @@ impl Cli {
}
}

impl IntoSize for BufWriter<Stdout> {
fn into_size(self) -> Result<usize, io::Error> {
// we can't get the size of stdout, so just return 0
impl AsyncFinalize for BufWriter<Stdout> {
async fn finalize(self) -> Result<usize, io::Error> {
Ok(0)
}
}

impl IntoSize for BufWriter<File> {
fn into_size(self) -> Result<usize, io::Error> {
impl AsyncFinalize for BufWriter<File> {
async fn finalize(self) -> Result<usize, io::Error> {
let file = self.into_inner()?;
let metadata = file.metadata()?;
Ok(metadata.len() as usize)
}
}

/// Finalize the streaming S3 writer and report the total bytes written.
///
/// This runs in the async context (outside `spawn_blocking`), so the
/// upload can be `.await`ed without tying up a blocking thread — see
/// the `AsyncFinalize` trait docs for why that matters under
/// concurrent plans.
impl AsyncFinalize for s3_writer::S3Writer {
async fn finalize(self) -> Result<usize, io::Error> {
// `finish()` presumably flushes any buffered data and completes the
// multipart upload, resolving to the byte count — confirm in s3_writer.
self.finish().await
}
}

/// Wrapper around a buffer writer that counts the number of buffers and bytes written
struct WriterSink<W: Write> {
statistics: WriteStatistics,
Expand Down
69 changes: 58 additions & 11 deletions spatialbench-cli/src/output_plan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,33 @@
//! * [`OutputPlanGenerator`]: plans the output files to be generated

use crate::plan::GenerationPlan;
use crate::s3_writer::{build_s3_client, parse_s3_uri};
use crate::{OutputFormat, Table};
use log::debug;
use object_store::ObjectStore;
use parquet::basic::Compression;
use std::collections::HashSet;
use std::fmt::{Display, Formatter};
use std::io;
use std::path::PathBuf;
use std::sync::Arc;

/// Where a partition will be output
#[derive(Debug, Clone, PartialEq)]
#[derive(Debug, Clone)]
pub enum OutputLocation {
/// Output to a file
File(PathBuf),
/// Output to stdout
Stdout,
/// Output to S3 with a shared client
S3 {
/// The full S3 URI for this object (e.g. `s3://bucket/path/to/file.parquet`)
uri: String,
/// The object path within the bucket (e.g. `path/to/file.parquet`)
path: String,
/// Shared S3 client for the bucket
client: Arc<dyn ObjectStore>,
},
}

impl Display for OutputLocation {
Expand All @@ -48,12 +60,13 @@ impl Display for OutputLocation {
write!(f, "{}", file.to_string_lossy())
}
OutputLocation::Stdout => write!(f, "Stdout"),
OutputLocation::S3 { uri, .. } => write!(f, "{}", uri),
}
}
}

/// Describes an output partition (file) that will be generated
#[derive(Debug, Clone, PartialEq)]
#[derive(Debug, Clone)]
pub struct OutputPlan {
/// The table
table: Table,
Expand Down Expand Up @@ -151,6 +164,8 @@ pub struct OutputPlanGenerator {
/// Output directories that have been created so far
/// (used to avoid creating the same directory multiple times)
created_directories: HashSet<PathBuf>,
/// Shared S3 client, lazily created on first S3 output location
s3_client: Option<Arc<dyn ObjectStore>>,
}

impl OutputPlanGenerator {
Expand All @@ -171,6 +186,7 @@ impl OutputPlanGenerator {
output_dir,
output_plans: Vec::new(),
created_directories: HashSet::new(),
s3_client: None,
}
}

Expand Down Expand Up @@ -282,17 +298,48 @@ impl OutputPlanGenerator {
OutputFormat::Parquet => "parquet",
};

let mut output_path = self.output_dir.clone();
if let Some(part) = part {
// If a partition is specified, create a subdirectory for it
output_path.push(table.to_string());
self.ensure_directory_exists(&output_path)?;
output_path.push(format!("{table}.{part}.{extension}"));
// Check if output_dir is an S3 URI
let output_dir_str = self.output_dir.to_string_lossy();
if output_dir_str.starts_with("s3://") {
// Handle S3 path
let base_uri = output_dir_str.trim_end_matches('/');
let s3_uri = if let Some(part) = part {
format!("{base_uri}/{table}/{table}.{part}.{extension}")
} else {
format!("{base_uri}/{table}.{extension}")
};

// Lazily build the S3 client on first use, then reuse it
let client = if let Some(ref client) = self.s3_client {
Arc::clone(client)
} else {
let (bucket, _) = parse_s3_uri(&s3_uri)?;
let client = build_s3_client(&bucket)?;
self.s3_client = Some(Arc::clone(&client));
client
};

let (_, path) = parse_s3_uri(&s3_uri)?;

Ok(OutputLocation::S3 {
uri: s3_uri,
path,
client,
})
} else {
// No partition specified, output to a single file
output_path.push(format!("{table}.{extension}"));
// Handle local filesystem path
let mut output_path = self.output_dir.clone();
if let Some(part) = part {
// If a partition is specified, create a subdirectory for it
output_path.push(table.to_string());
self.ensure_directory_exists(&output_path)?;
output_path.push(format!("{table}.{part}.{extension}"));
} else {
// No partition specified, output to a single file
output_path.push(format!("{table}.{extension}"));
}
Ok(OutputLocation::File(output_path))
}
Ok(OutputLocation::File(output_path))
}
}

Expand Down
31 changes: 22 additions & 9 deletions spatialbench-cli/src/parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,17 @@ use std::io::Write;
use std::sync::Arc;
use tokio::sync::mpsc::{Receiver, Sender};

pub trait IntoSize {
/// Convert the object into a size
fn into_size(self) -> Result<usize, io::Error>;
/// Finalize a writer after all Parquet data has been written.
///
/// This is called from the async context (outside `spawn_blocking`) so
/// that implementations like [`S3Writer`](crate::s3_writer::S3Writer) can
/// `.await` their upload without competing with the tokio runtime for
/// threads — avoiding deadlocks under concurrent plans.
///
/// For local files and stdout the implementation is trivially synchronous.
pub trait AsyncFinalize: Write + Send + 'static {
/// Finalize the writer and return the total bytes written.
fn finalize(self) -> impl std::future::Future<Output = Result<usize, io::Error>> + Send;
}

/// Converts a set of RecordBatchIterators into a Parquet file
Expand All @@ -44,7 +52,7 @@ pub trait IntoSize {
///
/// Note the input is an iterator of [`RecordBatchIterator`]; The batches
/// produced by each iterator is encoded as its own row group.
pub async fn generate_parquet<W: Write + Send + IntoSize + 'static, I>(
pub async fn generate_parquet<W: AsyncFinalize, I>(
writer: W,
iter_iter: I,
num_threads: usize,
Expand Down Expand Up @@ -119,9 +127,8 @@ where
row_group_writer.close().unwrap();
statistics.increment_chunks(1);
}
let size = writer.into_inner()?.into_size()?;
statistics.increment_bytes(size);
Ok(()) as Result<(), io::Error>
let inner = writer.into_inner()?;
Ok((inner, statistics)) as Result<(W, WriteStatistics), io::Error>
});

// now, drive the input stream and send results to the writer task
Expand All @@ -135,8 +142,14 @@ where
// signal the writer task that we are done
drop(tx);

// Wait for the writer task to finish
writer_task.await??;
// Wait for the blocking writer task to return the underlying writer
let (inner, mut statistics) = writer_task.await??;

// Finalize in the async context so S3 uploads can .await without
// competing for tokio runtime threads (prevents deadlock under
// concurrent plans).
let size = inner.finalize().await?;
statistics.increment_bytes(size);

Ok(())
}
Expand Down
12 changes: 11 additions & 1 deletion spatialbench-cli/src/plan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,16 @@ pub struct GenerationPlan {

pub const DEFAULT_PARQUET_ROW_GROUP_BYTES: i64 = 128 * 1024 * 1024;

/// Buffer size, in bytes, used when writing Parquet output (32 MB).
///
/// This single constant sizes both:
/// - the `BufWriter` capacity for local-file and stdout output, and
/// - the part size for S3 multipart uploads.
///
/// 32 MB gives good throughput and is comfortably above the 5 MB
/// minimum part size AWS S3 requires for all but the final part of a
/// multipart upload.
pub const PARQUET_BUFFER_SIZE: usize = 32 * 1024 * 1024;

impl GenerationPlan {
/// Returns a GenerationPlan number of parts to generate
///
Expand Down Expand Up @@ -207,7 +217,7 @@ impl GenerationPlan {
})
}

/// Return the number of part(ititions) this plan will generate
/// Return the number of part(ition)s this plan will generate
pub fn chunk_count(&self) -> usize {
self.part_list.clone().count()
}
Expand Down
17 changes: 15 additions & 2 deletions spatialbench-cli/src/runner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use crate::csv::*;
use crate::generate::{generate_in_chunks, Source};
use crate::output_plan::{OutputLocation, OutputPlan};
use crate::parquet::generate_parquet;
use crate::s3_writer::S3Writer;
use crate::tbl::*;
use crate::{OutputFormat, Table, WriterSink};
use log::{debug, info};
Expand All @@ -32,6 +33,7 @@ use spatialbench_arrow::{
};
use std::io;
use std::io::BufWriter;
use std::sync::Arc;
use tokio::task::{JoinError, JoinSet};

/// Runs multiple [`OutputPlan`]s in parallel, managing the number of threads
Expand Down Expand Up @@ -218,6 +220,12 @@ where
})?;
Ok(())
}
OutputLocation::S3 { uri, path, client } => {
info!("Writing to S3: {}", uri);
let s3_writer = S3Writer::with_client(Arc::clone(client), path);
let sink = WriterSink::new(s3_writer);
generate_in_chunks(sink, sources, num_threads).await
}
}
}

Expand All @@ -228,7 +236,7 @@ where
{
match plan.output_location() {
OutputLocation::Stdout => {
let writer = BufWriter::with_capacity(32 * 1024 * 1024, io::stdout()); // 32MB buffer
let writer = BufWriter::with_capacity(crate::plan::PARQUET_BUFFER_SIZE, io::stdout());
generate_parquet(writer, sources, num_threads, plan.parquet_compression()).await
}
OutputLocation::File(path) => {
Expand All @@ -242,7 +250,7 @@ where
let file = std::fs::File::create(&temp_path).map_err(|err| {
io::Error::other(format!("Failed to create {temp_path:?}: {err}"))
})?;
let writer = BufWriter::with_capacity(32 * 1024 * 1024, file); // 32MB buffer
let writer = BufWriter::with_capacity(crate::plan::PARQUET_BUFFER_SIZE, file);
generate_parquet(writer, sources, num_threads, plan.parquet_compression()).await?;
// rename the temp file to the final path
std::fs::rename(&temp_path, path).map_err(|e| {
Expand All @@ -252,6 +260,11 @@ where
})?;
Ok(())
}
OutputLocation::S3 { uri, path, client } => {
info!("Writing parquet to S3: {}", uri);
let s3_writer = S3Writer::with_client(Arc::clone(client), path);
generate_parquet(s3_writer, sources, num_threads, plan.parquet_compression()).await
}
}
}

Expand Down
Loading