realpython · brendaweles · Feb 19, 2025 · Feb 1, 2025 · Feb 1, 2025 · Feb 19, 2025
diff --git a/polars-lazyframe/README.md b/polars-lazyframe/README.md
@@ -0,0 +1,11 @@
+# How to Work With Polars LazyFrames
+
+This folder contains completed notebooks and other files used in the Real Python tutorial [How to Work With Polars LazyFrames](https://realpython.com/how-to-work-with-polars-lazyframe/). 
+
+**The following files are included:**
+
+- `tutorial_code.ipynb` is a Jupyter Notebook containing all the code used in the tutorial.
+- `rides.parquet` contains taxi fare data used throughout the tutorial.
+- `taxi_rides_nnGB.py` is a series of Python scripts (`taxi_rides_16GB.py`, `taxi_rides_24GB.py`, and `taxi_rides_32GB.py`), each of which generates a local copy of `2021_Yellow_Taxi_Trip_Data.csv` in a different size. These files are used to demonstrate data streaming.
+- `programming_languages.csv` is a small file used to illustrate the conversion between a DataFrame and a LazyFrame.
+- `dataframe_timer.py` and `lazyframe_timer.py` are two scripts used to compare how long the same analysis takes with a DataFrame and with a LazyFrame.
diff --git a/polars-lazyframe/dataframe_timer.py b/polars-lazyframe/dataframe_timer.py
@@ -0,0 +1,22 @@
+import time
+
+import polars as pl
+
+# Number of times the query is repeated; the reported figure is the mean
+# wall-clock time per repetition.
+NUM_RUNS = 10
+
+start = time.perf_counter()
+
+for _ in range(NUM_RUNS):
+    # Eager version: the Parquet file is read into a DataFrame first and
+    # each step of the query then runs on that in-memory data.
+    rides = pl.read_parquet("rides.parquet")
+    result = (
+        # Keep rides that start and end in the same place ...
+        rides.filter(pl.col("pick_up") == pl.col("drop_off"))
+        # ... average the fare per pick-up location ...
+        .group_by(pl.col("pick_up"))
+        .agg(pl.col("fare").mean())
+        # ... and keep only these four locations.
+        .filter(
+            pl.col("pick_up").is_in(
+                ["Brooklyn", "Bronx", "Queens", "Manhattan"]
+            )
+        )
+    )
+
+end = time.perf_counter()
+
+# A bare f-string expression is evaluated and discarded when the file runs
+# as a script, so the timing has to be printed explicitly.
+print(f"Code finished in {(end - start) / NUM_RUNS:0.4f} seconds.")
diff --git a/polars-lazyframe/lazyframe_timer.py b/polars-lazyframe/lazyframe_timer.py
@@ -0,0 +1,22 @@
+import time
+
+import polars as pl
+
+# Number of times the query is repeated; the reported figure is the mean
+# wall-clock time per repetition.
+NUM_RUNS = 10
+
+start = time.perf_counter()
+
+for _ in range(NUM_RUNS):
+    # Lazy version: scan_parquet only sets up the query; the work happens
+    # when .collect() is called at the end of the chain.
+    rides = pl.scan_parquet("rides.parquet")
+    result = (
+        # Keep rides that start and end in the same place ...
+        rides.filter(pl.col("pick_up") == pl.col("drop_off"))
+        # ... average the fare per pick-up location ...
+        .group_by(pl.col("pick_up"))
+        .agg(pl.col("fare").mean())
+        # ... and keep only these four locations.
+        .filter(
+            pl.col("pick_up").is_in(
+                ["Brooklyn", "Bronx", "Queens", "Manhattan"]
+            )
+        )
+    ).collect()
+
+end = time.perf_counter()
+
+# A bare f-string expression is evaluated and discarded when the file runs
+# as a script, so the timing has to be printed explicitly.
+print(f"Code finished in {(end - start) / NUM_RUNS:0.4f} seconds.")
diff --git a/polars-lazyframe/programming_languages.csv b/polars-lazyframe/programming_languages.csv
@@ -0,0 +1,6 @@
+language_id,language,creator,year
+0,Pascal,Niklaus Wirth,1970
+1,C,Dennis Ritchie,1973
+2,C++,Bjarne Stroustrup,1985
+3,Python,Guido van Rossum,1991
+4,Java,James Gosling,1995
diff --git a/polars-lazyframe/rides.parquet b/polars-lazyframe/rides.parquet
diff --git a/polars-lazyframe/taxi_rides_16GB.py b/polars-lazyframe/taxi_rides_16GB.py
@@ -0,0 +1,87 @@
+import csv
+import datetime
+import random
+
+# Seed the generator with a fixed value so that every run of this script
+# draws the same sequence of random numbers.
+random.seed(10)
+
+
+def generate_pu():
+    """Return a random pickup datetime in 2021, truncated to whole seconds.
+
+    A single ``random.random()`` draw selects a point between
+    2021-01-01 00:00:00 and 2021-12-31 23:59:59.
+    """
+    year_start = datetime.datetime(2021, 1, 1, 0, 0, 0)
+    year_end = datetime.datetime(2021, 12, 31, 23, 59, 59)
+    span = year_end - year_start
+    offset = random.random() * span
+    # Scaling the span by a float leaves a fractional second; drop it.
+    return (offset + year_start).replace(microsecond=0)
+
+
+def generate_do(pick_up):
+    """Return a drop-off datetime that falls after *pick_up*.
+
+    The trip lasts a random 10-120 minutes plus a random 0-59 seconds.
+    """
+    # Minutes are drawn before seconds; the order matters for reproducing
+    # the same data under a fixed seed.
+    extra_minutes = random.randint(10, 120)
+    extra_seconds = random.randint(0, 59)
+    trip_length = datetime.timedelta(
+        minutes=extra_minutes, seconds=extra_seconds
+    )
+    return pick_up + trip_length
+
+
+def generate_choice(*options):
+    """Return one of the positional arguments, picked at random.
+
+    Raises:
+        IndexError: If called with no arguments (raised by
+            ``random.choice`` on an empty sequence).
+    """
+    # ``options`` is the tuple of positional arguments; nothing here is
+    # passed by keyword.
+    return random.choice(options)
+
+
+def generate_file(path="2021_Yellow_Taxi_Trip_Data.csv", rows=170_000_000):
+    """Write a CSV file of randomly generated taxi rides.
+
+    Args:
+        path: Destination file. It is opened in ``"w"`` mode, so an
+            existing file is overwritten.
+        rows: Number of ride rows written after the header row.
+    """
+    with open(path, "w", newline="") as csvfile:
+        header = (
+            "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,"
+            "trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,"
+            "payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,"
+            "improvement_surcharge,total_amount,congestion_surcharge,airport_fee"
+        ).split(",")
+
+        writer = csv.writer(csvfile)
+        writer.writerow(header)
+
+        for _ in range(rows):
+            fare_amount = round(random.uniform(5.0, 100.0), 2)
+            extra = generate_choice(0, 0.5, 3)
+            mta_tax = generate_choice(0, 0.5)
+            tip_amount = round(random.uniform(0.0, 20.0), 2)
+            tolls_amount = generate_choice(0, 6.12)
+            improvement_surcharge = 0.3
+            congestion_surcharge = 2.5
+            # No airport fee is generated. csv.writer emits None as an empty
+            # field, which keeps every row as wide as the 19-column header.
+            airport_fee = None
+            # The total is the sum of the individual charges above.
+            total_amount = round(
+                fare_amount
+                + extra
+                + mta_tax
+                + tip_amount
+                + tolls_amount
+                + improvement_surcharge
+                + congestion_surcharge,
+                2,
+            )
+
+            pick_up = generate_pu()
+
+            # Values are listed in header order.
+            ride = [
+                random.randint(0, 6),  # VendorID
+                pick_up,  # tpep_pickup_datetime
+                generate_do(pick_up),  # tpep_dropoff_datetime
+                random.randint(0, 5),  # passenger_count
+                random.randint(0, 15),  # trip_distance
+                random.randint(0, 6),  # RatecodeID
+                generate_choice("Y", "N"),  # store_and_fwd_flag
+                random.randint(1, 265),  # PULocationID
+                random.randint(1, 265),  # DOLocationID
+                random.randint(0, 5),  # payment_type
+                fare_amount,  # fare_amount
+                extra,  # extra
+                mta_tax,  # mta_tax
+                tip_amount,  # tip_amount
+                tolls_amount,  # tolls_amount
+                improvement_surcharge,  # improvement_surcharge
+                total_amount,  # total_amount
+                congestion_surcharge,  # congestion_surcharge
+                airport_fee,  # airport_fee
+            ]
+
+            writer.writerow(ride)
+
+
+# Runs at module level: executing (or importing) this file writes the CSV.
+generate_file()
diff --git a/polars-lazyframe/taxi_rides_24GB.py b/polars-lazyframe/taxi_rides_24GB.py
@@ -0,0 +1,87 @@
+import csv
+import datetime
+import random
+
+# Seed the generator with a fixed value so that every run of this script
+# draws the same sequence of random numbers.
+random.seed(10)
+
+
+def generate_pu():
+    """Return a random pickup datetime in 2021, truncated to whole seconds.
+
+    A single ``random.random()`` draw selects a point between
+    2021-01-01 00:00:00 and 2021-12-31 23:59:59.
+    """
+    year_start = datetime.datetime(2021, 1, 1, 0, 0, 0)
+    year_end = datetime.datetime(2021, 12, 31, 23, 59, 59)
+    span = year_end - year_start
+    offset = random.random() * span
+    # Scaling the span by a float leaves a fractional second; drop it.
+    return (offset + year_start).replace(microsecond=0)
+
+
+def generate_do(pick_up):
+    """Return a drop-off datetime that falls after *pick_up*.
+
+    The trip lasts a random 10-120 minutes plus a random 0-59 seconds.
+    """
+    # Minutes are drawn before seconds; the order matters for reproducing
+    # the same data under a fixed seed.
+    extra_minutes = random.randint(10, 120)
+    extra_seconds = random.randint(0, 59)
+    trip_length = datetime.timedelta(
+        minutes=extra_minutes, seconds=extra_seconds
+    )
+    return pick_up + trip_length
+
+
+def generate_choice(*options):
+    """Return one of the positional arguments, picked at random.
+
+    Raises:
+        IndexError: If called with no arguments (raised by
+            ``random.choice`` on an empty sequence).
+    """
+    # ``options`` is the tuple of positional arguments; nothing here is
+    # passed by keyword.
+    return random.choice(options)
+
+
+def generate_file(path="2021_Yellow_Taxi_Trip_Data.csv", rows=255_000_000):
+    """Write a CSV file of randomly generated taxi rides.
+
+    Args:
+        path: Destination file. It is opened in ``"w"`` mode, so an
+            existing file is overwritten.
+        rows: Number of ride rows written after the header row.
+    """
+    with open(path, "w", newline="") as csvfile:
+        header = (
+            "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,"
+            "trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,"
+            "payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,"
+            "improvement_surcharge,total_amount,congestion_surcharge,airport_fee"
+        ).split(",")
+
+        writer = csv.writer(csvfile)
+        writer.writerow(header)
+
+        for _ in range(rows):
+            fare_amount = round(random.uniform(5.0, 100.0), 2)
+            extra = generate_choice(0, 0.5, 3)
+            mta_tax = generate_choice(0, 0.5)
+            tip_amount = round(random.uniform(0.0, 20.0), 2)
+            tolls_amount = generate_choice(0, 6.12)
+            improvement_surcharge = 0.3
+            congestion_surcharge = 2.5
+            # No airport fee is generated. csv.writer emits None as an empty
+            # field, which keeps every row as wide as the 19-column header.
+            airport_fee = None
+            # The total is the sum of the individual charges above.
+            total_amount = round(
+                fare_amount
+                + extra
+                + mta_tax
+                + tip_amount
+                + tolls_amount
+                + improvement_surcharge
+                + congestion_surcharge,
+                2,
+            )
+
+            pick_up = generate_pu()
+
+            # Values are listed in header order.
+            ride = [
+                random.randint(0, 6),  # VendorID
+                pick_up,  # tpep_pickup_datetime
+                generate_do(pick_up),  # tpep_dropoff_datetime
+                random.randint(0, 5),  # passenger_count
+                random.randint(0, 15),  # trip_distance
+                random.randint(0, 6),  # RatecodeID
+                generate_choice("Y", "N"),  # store_and_fwd_flag
+                random.randint(1, 265),  # PULocationID
+                random.randint(1, 265),  # DOLocationID
+                random.randint(0, 5),  # payment_type
+                fare_amount,  # fare_amount
+                extra,  # extra
+                mta_tax,  # mta_tax
+                tip_amount,  # tip_amount
+                tolls_amount,  # tolls_amount
+                improvement_surcharge,  # improvement_surcharge
+                total_amount,  # total_amount
+                congestion_surcharge,  # congestion_surcharge
+                airport_fee,  # airport_fee
+            ]
+
+            writer.writerow(ride)
+
+
+# Runs at module level: executing (or importing) this file writes the CSV.
+generate_file()
diff --git a/polars-lazyframe/taxi_rides_32GB.py b/polars-lazyframe/taxi_rides_32GB.py
@@ -0,0 +1,87 @@
+import csv
+import datetime
+import random
+
+# Seed the generator with a fixed value so that every run of this script
+# draws the same sequence of random numbers.
+random.seed(10)
+
+
+def generate_pu():
+    """Return a random pickup datetime in 2021, truncated to whole seconds.
+
+    A single ``random.random()`` draw selects a point between
+    2021-01-01 00:00:00 and 2021-12-31 23:59:59.
+    """
+    year_start = datetime.datetime(2021, 1, 1, 0, 0, 0)
+    year_end = datetime.datetime(2021, 12, 31, 23, 59, 59)
+    span = year_end - year_start
+    offset = random.random() * span
+    # Scaling the span by a float leaves a fractional second; drop it.
+    return (offset + year_start).replace(microsecond=0)
+
+
+def generate_do(pick_up):
+    """Return a drop-off datetime that falls after *pick_up*.
+
+    The trip lasts a random 10-120 minutes plus a random 0-59 seconds.
+    """
+    # Minutes are drawn before seconds; the order matters for reproducing
+    # the same data under a fixed seed.
+    extra_minutes = random.randint(10, 120)
+    extra_seconds = random.randint(0, 59)
+    trip_length = datetime.timedelta(
+        minutes=extra_minutes, seconds=extra_seconds
+    )
+    return pick_up + trip_length
+
+
+def generate_choice(*options):
+    """Return one of the positional arguments, picked at random.
+
+    Raises:
+        IndexError: If called with no arguments (raised by
+            ``random.choice`` on an empty sequence).
+    """
+    # ``options`` is the tuple of positional arguments; nothing here is
+    # passed by keyword.
+    return random.choice(options)
+
+
+def generate_file(path="2021_Yellow_Taxi_Trip_Data.csv", rows=340_000_000):
+    """Write a CSV file of randomly generated taxi rides.
+
+    Args:
+        path: Destination file. It is opened in ``"w"`` mode, so an
+            existing file is overwritten.
+        rows: Number of ride rows written after the header row.
+    """
+    with open(path, "w", newline="") as csvfile:
+        header = (
+            "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,"
+            "trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,"
+            "payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,"
+            "improvement_surcharge,total_amount,congestion_surcharge,airport_fee"
+        ).split(",")
+
+        writer = csv.writer(csvfile)
+        writer.writerow(header)
+
+        for _ in range(rows):
+            fare_amount = round(random.uniform(5.0, 100.0), 2)
+            extra = generate_choice(0, 0.5, 3)
+            mta_tax = generate_choice(0, 0.5)
+            tip_amount = round(random.uniform(0.0, 20.0), 2)
+            tolls_amount = generate_choice(0, 6.12)
+            improvement_surcharge = 0.3
+            congestion_surcharge = 2.5
+            # No airport fee is generated. csv.writer emits None as an empty
+            # field, which keeps every row as wide as the 19-column header.
+            airport_fee = None
+            # The total is the sum of the individual charges above.
+            total_amount = round(
+                fare_amount
+                + extra
+                + mta_tax
+                + tip_amount
+                + tolls_amount
+                + improvement_surcharge
+                + congestion_surcharge,
+                2,
+            )
+
+            pick_up = generate_pu()
+
+            # Values are listed in header order.
+            ride = [
+                random.randint(0, 6),  # VendorID
+                pick_up,  # tpep_pickup_datetime
+                generate_do(pick_up),  # tpep_dropoff_datetime
+                random.randint(0, 5),  # passenger_count
+                random.randint(0, 15),  # trip_distance
+                random.randint(0, 6),  # RatecodeID
+                generate_choice("Y", "N"),  # store_and_fwd_flag
+                random.randint(1, 265),  # PULocationID
+                random.randint(1, 265),  # DOLocationID
+                random.randint(0, 5),  # payment_type
+                fare_amount,  # fare_amount
+                extra,  # extra
+                mta_tax,  # mta_tax
+                tip_amount,  # tip_amount
+                tolls_amount,  # tolls_amount
+                improvement_surcharge,  # improvement_surcharge
+                total_amount,  # total_amount
+                congestion_surcharge,  # congestion_surcharge
+                airport_fee,  # airport_fee
+            ]
+
+            writer.writerow(ride)
+
+
+# Runs at module level: executing (or importing) this file writes the CSV.
+generate_file()