fix: use example sample for demo

clflushopt · clflushopt · commit 29054dfcab0d · 2025-02-01T16:29:59.000-05:00
diff --git a/glint/src/main/java/co/clflushopt/glint/App.java b/glint/src/main/java/co/clflushopt/glint/App.java
@@ -1,10 +1,14 @@
 package co.clflushopt.glint;
 
 import java.io.FileNotFoundException;
+import java.util.Arrays;
 import java.util.Iterator;
+import java.util.List;
+import java.util.Optional;
 
 import org.apache.arrow.vector.types.pojo.ArrowType;
 
+import co.clflushopt.glint.core.CsvReaderOptions;
 import co.clflushopt.glint.core.ExecutionContext;
 import co.clflushopt.glint.dataframe.DataFrame;
 import co.clflushopt.glint.query.logical.expr.AggregateExpr;
@@ -13,7 +17,10 @@
 import co.clflushopt.glint.query.logical.expr.LogicalExpr;
 import co.clflushopt.glint.query.logical.plan.LogicalPlan;
 import co.clflushopt.glint.query.optimizer.QueryOptimizer;
+import co.clflushopt.glint.types.ArrowTypes;
+import co.clflushopt.glint.types.Field;
 import co.clflushopt.glint.types.RecordBatch;
+import co.clflushopt.glint.types.Schema;
 
 /**
  * Hello world!
@@ -35,9 +42,32 @@ public static void nycTripsBenchmark(String[] args) throws FileNotFoundException
 
         long startTime = System.currentTimeMillis();
         try {
-
+            // Define the schema for the NYC Taxi dataset
+            Schema schema = new Schema(Arrays.asList(new Field("VendorID", ArrowTypes.Int32Type),
+                    new Field("tpep_pickup_datetime", ArrowTypes.StringType), // Could be Timestamp
+                    new Field("tpep_dropoff_datetime", ArrowTypes.StringType), // Could be Timestamp
+                    new Field("passenger_count", ArrowTypes.Int32Type),
+                    new Field("trip_distance", ArrowTypes.DoubleType),
+                    new Field("pickup_longitude", ArrowTypes.DoubleType),
+                    new Field("pickup_latitude", ArrowTypes.DoubleType),
+                    new Field("RatecodeID", ArrowTypes.Int32Type),
+                    new Field("store_and_fwd_flag", ArrowTypes.StringType),
+                    new Field("dropoff_longitude", ArrowTypes.DoubleType),
+                    new Field("dropoff_latitude", ArrowTypes.DoubleType),
+                    new Field("payment_type", ArrowTypes.Int32Type),
+                    new Field("fare_amount", ArrowTypes.DoubleType),
+                    new Field("extra", ArrowTypes.DoubleType),
+                    new Field("mta_tax", ArrowTypes.DoubleType),
+                    new Field("tip_amount", ArrowTypes.DoubleType),
+                    new Field("tolls_amount", ArrowTypes.DoubleType),
+                    new Field("improvement_surcharge", ArrowTypes.DoubleType),
+                    new Field("total_amount", ArrowTypes.DoubleType)));
             // Create DataFrame and apply transformations
-            DataFrame df = ctx.readParquet("./datasets/yellow_tripdata_2019-01.parquet", null);
+            DataFrame df = ctx
+                    .readCsv("./datasets/yellow_tripdata_example.csv", Optional.of(schema),
+                            CsvReaderOptions.builder().delimiter(',').hasHeader(true).build())
+                    .aggregate(List.of(col("passenger_count")),
+                            List.of(max(cast(col("fare_amount"), ArrowTypes.FloatType))));
 
             System.out.println("Logical Plan:\t" + LogicalPlan.format(df.getLogicalPlan()));
             System.out.println("Schema:\t" + df.getSchema());
diff --git a/glint/src/main/java/co/clflushopt/glint/query/physical/plan/HashAggregateOperator.java b/glint/src/main/java/co/clflushopt/glint/query/physical/plan/HashAggregateOperator.java
@@ -60,7 +60,6 @@ public Iterator<RecordBatch> execute() {
         Iterator<RecordBatch> inputIter = input.execute();
         while (inputIter.hasNext()) {
             RecordBatch batch = inputIter.next();
-
             // Evaluate grouping expressions
             List<ColumnVector> groupKeys = groupByExpr.stream().map(expr -> expr.eval(batch))
                     .collect(Collectors.toList());