diff --git a/polars-lazyframe/README.md b/polars-lazyframe/README.md
new file mode 100644
index 0000000000..5a4e876aa5
--- /dev/null
+++ b/polars-lazyframe/README.md
@@ -0,0 +1,11 @@
+# How to Work With Polars LazyFrames
+
+This folder contains completed notebooks and other files used in the Real Python tutorial [How to Work With Polars LazyFrames](https://realpython.com/how-to-work-with-polars-lazyframe/).
+
+**The following files are included:**
+
+- `tutorial_code.ipynb` is a Jupyter Notebook containing all the code used in the tutorial.
+- `rides.parquet` contains taxi fare data used throughout the tutorial.
+- The `taxi_rides_nnGB.py` files are a series of Python scripts that each generate a local copy of `2021_Yellow_Taxi_Trip_Data.csv` in a different size. The generated file is used to demonstrate data streaming.
+- `programming_languages.csv` is a small file used to illustrate the conversion between a DataFrame and a LazyFrame.
+- `dataframe_timer.py` and `lazyframe_timer.py` are two scripts used to compare the speed of performing the same analysis with a DataFrame and with a LazyFrame.
diff --git a/polars-lazyframe/dataframe_timer.py b/polars-lazyframe/dataframe_timer.py
new file mode 100644
index 0000000000..17eef0c961
--- /dev/null
+++ b/polars-lazyframe/dataframe_timer.py
@@ -0,0 +1,22 @@
+import time
+
+import polars as pl
+
+start = time.perf_counter()
+
+for _ in range(10):  # repeat the query so the figure below is a per-run average
+    rides = pl.read_parquet("rides.parquet")
+    result = (
+        rides.filter(pl.col("pick_up") == pl.col("drop_off"))
+        .group_by(pl.col("pick_up"))
+        .agg(pl.col("fare").mean())
+        .filter(
+            pl.col("pick_up").is_in(
+                ["Brooklyn", "Bronx", "Queens", "Manhattan"]
+            )
+        )
+    )
+
+end = time.perf_counter()
+# Print explicitly: a bare f-string expression shows nothing when run as a script.
+print(f"Code finished in {(end - start)/10:0.4f} seconds.")
diff --git a/polars-lazyframe/lazyframe_timer.py b/polars-lazyframe/lazyframe_timer.py
new file mode 100644
index 0000000000..643b5f9517
--- /dev/null
+++ b/polars-lazyframe/lazyframe_timer.py
@@ -0,0 +1,22 @@
+import time
+
+import polars as pl
+
+start = time.perf_counter()
+
+for _ in range(10):  # repeat the query so the figure below is a per-run average
+    rides = pl.scan_parquet("rides.parquet")
+    result = (
+        rides.filter(pl.col("pick_up") == pl.col("drop_off"))
+        .group_by(pl.col("pick_up"))
+        .agg(pl.col("fare").mean())
+        .filter(
+            pl.col("pick_up").is_in(
+                ["Brooklyn", "Bronx", "Queens", "Manhattan"]
+            )
+        )
+    ).collect()
+
+end = time.perf_counter()
+# Print explicitly: a bare f-string expression shows nothing when run as a script.
+print(f"Code finished in {(end - start)/10:0.4f} seconds.")
diff --git a/polars-lazyframe/programming_languages.csv b/polars-lazyframe/programming_languages.csv
new file mode 100644
index 0000000000..9adf04219b
--- /dev/null
+++ b/polars-lazyframe/programming_languages.csv
@@ -0,0 +1,6 @@
+language_id,language,creator,year
+0,Pascal,Niklaus Wirth,1970
+1,C,Dennis Ritchie,1973
+2,C++,Bjarne Stroustrup,1985
+3,Python,Guido van Rossum,1991
+4,Java,James Gosling,1995
diff --git a/polars-lazyframe/rides.parquet b/polars-lazyframe/rides.parquet
new file mode 100644
index 0000000000..7bf6890b90
Binary files /dev/null and b/polars-lazyframe/rides.parquet differ
diff --git a/polars-lazyframe/taxi_rides_16GB.py b/polars-lazyframe/taxi_rides_16GB.py
new file mode 100644
index 0000000000..a2fac947fb
--- /dev/null
+++ b/polars-lazyframe/taxi_rides_16GB.py
@@ -0,0 +1,87 @@
+import csv
+import datetime
+import random
+
+random.seed(10)
+
+
+def generate_pu():
+    """Return a random pickup datetime within 2021, truncated to whole seconds."""
+    year_start = datetime.datetime(2021, 1, 1, 0, 0, 0)
+    year_end = datetime.datetime(2021, 12, 31, 23, 59, 59)
+    # Scale the whole-year span by a random fraction in [0.0, 1.0).
+    offset = random.random() * (year_end - year_start)
+    moment = offset + year_start
+    # Discard the sub-second part left over from the float scaling.
+    return moment.replace(microsecond=0)
+
+
+def generate_do(pick_up):
+    """Return a drop-off time 10-120 minutes plus 0-59 seconds after pick_up."""
+    minutes = random.randint(10, 120)  # minutes are drawn before seconds
+    seconds = random.randint(0, 59)
+    return pick_up + datetime.timedelta(minutes=minutes, seconds=seconds)
+
+
+def generate_choice(*options):  # accepts any number of positional candidates
+    return random.choice(options)  # hand back one of them at random
+
+
+def generate_file():
+    """Write a header row plus 170,000,000 random taxi rides to a CSV file."""
+    with open("2021_Yellow_Taxi_Trip_Data.csv", "w", newline="") as csvfile:
+        header = (
+            "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,"
+            "trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,"
+            "payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,"
+            "improvement_surcharge,total_amount,congestion_surcharge,airport_fee"
+        ).split(",")
+        writer = csv.writer(csvfile)
+        writer.writerow(header)
+
+        for _ in range(170_000_000):
+            fare_amount = round(random.uniform(5.0, 100.0), 2)
+            extra = generate_choice(0, 0.5, 3)
+            mta_tax = generate_choice(0, 0.5)
+            tip_amount = round(random.uniform(0.0, 20.0), 2)
+            tolls_amount = generate_choice(0, 6.12)
+            improvement_surcharge = 0.3
+            congestion_surcharge = 2.5
+            total_amount = round(
+                fare_amount
+                + extra
+                + mta_tax
+                + tip_amount
+                + tolls_amount
+                + improvement_surcharge
+                + congestion_surcharge,
+                2,
+            )
+            pick_up = generate_pu()
+
+            ride = [
+                random.randint(0, 6),  # VendorID
+                pick_up,  # tpep_pickup_datetime
+                generate_do(pick_up),  # tpep_dropoff_datetime
+                random.randint(0, 5),  # passenger_count
+                random.randint(0, 15),  # trip_distance
+                random.randint(0, 6),  # RatecodeID
+                generate_choice("Y", "N"),  # store_and_fwd_flag
+                random.randint(1, 265),  # PULocationID
+                random.randint(1, 265),  # DOLocationID
+                random.randint(0, 5),  # payment_type
+                fare_amount,  # fare_amount
+                extra,  # extra
+                mta_tax,  # mta_tax
+                tip_amount,  # tip_amount
+                tolls_amount,  # tolls_amount
+                improvement_surcharge,  # improvement_surcharge
+                total_amount,  # total_amount
+                congestion_surcharge,  # congestion_surcharge
+                None,  # airport_fee (left empty so each row has all 19 header fields)
+            ]
+
+            writer.writerow(ride)
+
+
+generate_file()
diff --git a/polars-lazyframe/taxi_rides_24GB.py b/polars-lazyframe/taxi_rides_24GB.py
new file mode 100644
index 0000000000..95efb7fc41
--- /dev/null
+++ b/polars-lazyframe/taxi_rides_24GB.py
@@ -0,0 +1,87 @@
+import csv
+import datetime
+import random
+
+random.seed(10)
+
+
+def generate_pu():
+    """Return a random pickup datetime within 2021, truncated to whole seconds."""
+    year_start = datetime.datetime(2021, 1, 1, 0, 0, 0)
+    year_end = datetime.datetime(2021, 12, 31, 23, 59, 59)
+    # Scale the whole-year span by a random fraction in [0.0, 1.0).
+    offset = random.random() * (year_end - year_start)
+    moment = offset + year_start
+    # Discard the sub-second part left over from the float scaling.
+    return moment.replace(microsecond=0)
+
+
+def generate_do(pick_up):
+    """Return a drop-off time 10-120 minutes plus 0-59 seconds after pick_up."""
+    minutes = random.randint(10, 120)  # minutes are drawn before seconds
+    seconds = random.randint(0, 59)
+    return pick_up + datetime.timedelta(minutes=minutes, seconds=seconds)
+
+
+def generate_choice(*options):  # accepts any number of positional candidates
+    return random.choice(options)  # hand back one of them at random
+
+
+def generate_file():
+    """Write a header row plus 255,000,000 random taxi rides to a CSV file."""
+    with open("2021_Yellow_Taxi_Trip_Data.csv", "w", newline="") as csvfile:
+        header = (
+            "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,"
+            "trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,"
+            "payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,"
+            "improvement_surcharge,total_amount,congestion_surcharge,airport_fee"
+        ).split(",")
+        writer = csv.writer(csvfile)
+        writer.writerow(header)
+
+        for _ in range(255_000_000):
+            fare_amount = round(random.uniform(5.0, 100.0), 2)
+            extra = generate_choice(0, 0.5, 3)
+            mta_tax = generate_choice(0, 0.5)
+            tip_amount = round(random.uniform(0.0, 20.0), 2)
+            tolls_amount = generate_choice(0, 6.12)
+            improvement_surcharge = 0.3
+            congestion_surcharge = 2.5
+            total_amount = round(
+                fare_amount
+                + extra
+                + mta_tax
+                + tip_amount
+                + tolls_amount
+                + improvement_surcharge
+                + congestion_surcharge,
+                2,
+            )
+            pick_up = generate_pu()
+
+            ride = [
+                random.randint(0, 6),  # VendorID
+                pick_up,  # tpep_pickup_datetime
+                generate_do(pick_up),  # tpep_dropoff_datetime
+                random.randint(0, 5),  # passenger_count
+                random.randint(0, 15),  # trip_distance
+                random.randint(0, 6),  # RatecodeID
+                generate_choice("Y", "N"),  # store_and_fwd_flag
+                random.randint(1, 265),  # PULocationID
+                random.randint(1, 265),  # DOLocationID
+                random.randint(0, 5),  # payment_type
+                fare_amount,  # fare_amount
+                extra,  # extra
+                mta_tax,  # mta_tax
+                tip_amount,  # tip_amount
+                tolls_amount,  # tolls_amount
+                improvement_surcharge,  # improvement_surcharge
+                total_amount,  # total_amount
+                congestion_surcharge,  # congestion_surcharge
+                None,  # airport_fee (left empty so each row has all 19 header fields)
+            ]
+
+            writer.writerow(ride)
+
+
+generate_file()
diff --git a/polars-lazyframe/taxi_rides_32GB.py b/polars-lazyframe/taxi_rides_32GB.py
new file mode 100644
index 0000000000..9d4f13ed02
--- /dev/null
+++ b/polars-lazyframe/taxi_rides_32GB.py
@@ -0,0 +1,87 @@
+import csv
+import datetime
+import random
+
+random.seed(10)
+
+
+def generate_pu():
+    """Return a random pickup datetime within 2021, truncated to whole seconds."""
+    year_start = datetime.datetime(2021, 1, 1, 0, 0, 0)
+    year_end = datetime.datetime(2021, 12, 31, 23, 59, 59)
+    # Scale the whole-year span by a random fraction in [0.0, 1.0).
+    offset = random.random() * (year_end - year_start)
+    moment = offset + year_start
+    # Discard the sub-second part left over from the float scaling.
+    return moment.replace(microsecond=0)
+
+
+def generate_do(pick_up):
+    """Return a drop-off time 10-120 minutes plus 0-59 seconds after pick_up."""
+    minutes = random.randint(10, 120)  # minutes are drawn before seconds
+    seconds = random.randint(0, 59)
+    return pick_up + datetime.timedelta(minutes=minutes, seconds=seconds)
+
+
+def generate_choice(*options):  # accepts any number of positional candidates
+    return random.choice(options)  # hand back one of them at random
+
+
+def generate_file():
+    """Write a header row plus 340,000,000 random taxi rides to a CSV file."""
+    with open("2021_Yellow_Taxi_Trip_Data.csv", "w", newline="") as csvfile:
+        header = (
+            "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,"
+            "trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,"
+            "payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,"
+            "improvement_surcharge,total_amount,congestion_surcharge,airport_fee"
+        ).split(",")
+        writer = csv.writer(csvfile)
+        writer.writerow(header)
+
+        for _ in range(340_000_000):
+            fare_amount = round(random.uniform(5.0, 100.0), 2)
+            extra = generate_choice(0, 0.5, 3)
+            mta_tax = generate_choice(0, 0.5)
+            tip_amount = round(random.uniform(0.0, 20.0), 2)
+            tolls_amount = generate_choice(0, 6.12)
+            improvement_surcharge = 0.3
+            congestion_surcharge = 2.5
+            total_amount = round(
+                fare_amount
+                + extra
+                + mta_tax
+                + tip_amount
+                + tolls_amount
+                + improvement_surcharge
+                + congestion_surcharge,
+                2,
+            )
+            pick_up = generate_pu()
+
+            ride = [
+                random.randint(0, 6),  # VendorID
+                pick_up,  # tpep_pickup_datetime
+                generate_do(pick_up),  # tpep_dropoff_datetime
+                random.randint(0, 5),  # passenger_count
+                random.randint(0, 15),  # trip_distance
+                random.randint(0, 6),  # RatecodeID
+                generate_choice("Y", "N"),  # store_and_fwd_flag
+                random.randint(1, 265),  # PULocationID
+                random.randint(1, 265),  # DOLocationID
+                random.randint(0, 5),  # payment_type
+                fare_amount,  # fare_amount
+                extra,  # extra
+                mta_tax,  # mta_tax
+                tip_amount,  # tip_amount
+                tolls_amount,  # tolls_amount
+                improvement_surcharge,  # improvement_surcharge
+                total_amount,  # total_amount
+                congestion_surcharge,  # congestion_surcharge
+                None,  # airport_fee (left empty so each row has all 19 header fields)
+            ]
+
+            writer.writerow(ride)
+
+
+generate_file()
diff --git a/polars-lazyframe/taxi_rides_48GB.py b/polars-lazyframe/taxi_rides_48GB.py
new file mode 100644
index 0000000000..b52ba09563
--- /dev/null
+++ b/polars-lazyframe/taxi_rides_48GB.py
@@ -0,0 +1,87 @@
+import csv
+import datetime
+import random
+
+random.seed(10)
+
+
+def generate_pu():
+    """Return a random pickup datetime within 2021, truncated to whole seconds."""
+    year_start = datetime.datetime(2021, 1, 1, 0, 0, 0)
+    year_end = datetime.datetime(2021, 12, 31, 23, 59, 59)
+    # Scale the whole-year span by a random fraction in [0.0, 1.0).
+    offset = random.random() * (year_end - year_start)
+    moment = offset + year_start
+    # Discard the sub-second part left over from the float scaling.
+    return moment.replace(microsecond=0)
+
+
+def generate_do(pick_up):
+    """Return a drop-off time 10-120 minutes plus 0-59 seconds after pick_up."""
+    minutes = random.randint(10, 120)  # minutes are drawn before seconds
+    seconds = random.randint(0, 59)
+    return pick_up + datetime.timedelta(minutes=minutes, seconds=seconds)
+
+
+def generate_choice(*options):  # accepts any number of positional candidates
+    return random.choice(options)  # hand back one of them at random
+
+
+def generate_file():
+    """Write a header row plus 510,000,000 random taxi rides to a CSV file."""
+    with open("2021_Yellow_Taxi_Trip_Data.csv", "w", newline="") as csvfile:
+        header = (
+            "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,"
+            "trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,"
+            "payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,"
+            "improvement_surcharge,total_amount,congestion_surcharge,airport_fee"
+        ).split(",")
+        writer = csv.writer(csvfile)
+        writer.writerow(header)
+
+        for _ in range(510_000_000):
+            fare_amount = round(random.uniform(5.0, 100.0), 2)
+            extra = generate_choice(0, 0.5, 3)
+            mta_tax = generate_choice(0, 0.5)
+            tip_amount = round(random.uniform(0.0, 20.0), 2)
+            tolls_amount = generate_choice(0, 6.12)
+            improvement_surcharge = 0.3
+            congestion_surcharge = 2.5
+            total_amount = round(
+                fare_amount
+                + extra
+                + mta_tax
+                + tip_amount
+                + tolls_amount
+                + improvement_surcharge
+                + congestion_surcharge,
+                2,
+            )
+            pick_up = generate_pu()
+
+            ride = [
+                random.randint(0, 6),  # VendorID
+                pick_up,  # tpep_pickup_datetime
+                generate_do(pick_up),  # tpep_dropoff_datetime
+                random.randint(0, 5),  # passenger_count
+                random.randint(0, 15),  # trip_distance
+                random.randint(0, 6),  # RatecodeID
+                generate_choice("Y", "N"),  # store_and_fwd_flag
+                random.randint(1, 265),  # PULocationID
+                random.randint(1, 265),  # DOLocationID
+                random.randint(0, 5),  # payment_type
+                fare_amount,  # fare_amount
+                extra,  # extra
+                mta_tax,  # mta_tax
+                tip_amount,  # tip_amount
+                tolls_amount,  # tolls_amount
+                improvement_surcharge,  # improvement_surcharge
+                total_amount,  # total_amount
+                congestion_surcharge,  # congestion_surcharge
+                None,  # airport_fee (left empty so each row has all 19 header fields)
+            ]
+
+            writer.writerow(ride)
+
+
+generate_file()
diff --git a/polars-lazyframe/taxi_rides_64GB.py b/polars-lazyframe/taxi_rides_64GB.py
new file mode 100644
index 0000000000..f625681269
--- /dev/null
+++ b/polars-lazyframe/taxi_rides_64GB.py
@@ -0,0 +1,87 @@
+import csv
+import datetime
+import random
+
+random.seed(10)
+
+
+def generate_pu():
+    """Return a random pickup datetime within 2021, truncated to whole seconds."""
+    year_start = datetime.datetime(2021, 1, 1, 0, 0, 0)
+    year_end = datetime.datetime(2021, 12, 31, 23, 59, 59)
+    # Scale the whole-year span by a random fraction in [0.0, 1.0).
+    offset = random.random() * (year_end - year_start)
+    moment = offset + year_start
+    # Discard the sub-second part left over from the float scaling.
+    return moment.replace(microsecond=0)
+
+
+def generate_do(pick_up):
+    """Return a drop-off time 10-120 minutes plus 0-59 seconds after pick_up."""
+    minutes = random.randint(10, 120)  # minutes are drawn before seconds
+    seconds = random.randint(0, 59)
+    return pick_up + datetime.timedelta(minutes=minutes, seconds=seconds)
+
+
+def generate_choice(*options):  # accepts any number of positional candidates
+    return random.choice(options)  # hand back one of them at random
+
+
+def generate_file():
+    """Write a header row plus 690,000,000 random taxi rides to a CSV file."""
+    with open("2021_Yellow_Taxi_Trip_Data.csv", "w", newline="") as csvfile:
+        header = (
+            "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,"
+            "trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,"
+            "payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,"
+            "improvement_surcharge,total_amount,congestion_surcharge,airport_fee"
+        ).split(",")
+        writer = csv.writer(csvfile)
+        writer.writerow(header)
+
+        for _ in range(690_000_000):
+            fare_amount = round(random.uniform(5.0, 100.0), 2)
+            extra = generate_choice(0, 0.5, 3)
+            mta_tax = generate_choice(0, 0.5)
+            tip_amount = round(random.uniform(0.0, 20.0), 2)
+            tolls_amount = generate_choice(0, 6.12)
+            improvement_surcharge = 0.3
+            congestion_surcharge = 2.5
+            total_amount = round(
+                fare_amount
+                + extra
+                + mta_tax
+                + tip_amount
+                + tolls_amount
+                + improvement_surcharge
+                + congestion_surcharge,
+                2,
+            )
+            pick_up = generate_pu()
+
+            ride = [
+                random.randint(0, 6),  # VendorID
+                pick_up,  # tpep_pickup_datetime
+                generate_do(pick_up),  # tpep_dropoff_datetime
+                random.randint(0, 5),  # passenger_count
+                random.randint(0, 15),  # trip_distance
+                random.randint(0, 6),  # RatecodeID
+                generate_choice("Y", "N"),  # store_and_fwd_flag
+                random.randint(1, 265),  # PULocationID
+                random.randint(1, 265),  # DOLocationID
+                random.randint(0, 5),  # payment_type
+                fare_amount,  # fare_amount
+                extra,  # extra
+                mta_tax,  # mta_tax
+                tip_amount,  # tip_amount
+                tolls_amount,  # tolls_amount
+                improvement_surcharge,  # improvement_surcharge
+                total_amount,  # total_amount
+                congestion_surcharge,  # congestion_surcharge
+                None,  # airport_fee (left empty so each row has all 19 header fields)
+            ]
+
+            writer.writerow(ride)
+
+
+generate_file()
diff --git a/polars-lazyframe/taxi_rides_80GB.py b/polars-lazyframe/taxi_rides_80GB.py
new file mode 100644
index 0000000000..4d9964e5f8
--- /dev/null
+++ b/polars-lazyframe/taxi_rides_80GB.py
@@ -0,0 +1,87 @@
+import csv
+import datetime
+import random
+
+random.seed(10)
+
+
+def generate_pu():
+    """Return a random pickup datetime within 2021, truncated to whole seconds."""
+    year_start = datetime.datetime(2021, 1, 1, 0, 0, 0)
+    year_end = datetime.datetime(2021, 12, 31, 23, 59, 59)
+    # Scale the whole-year span by a random fraction in [0.0, 1.0).
+    offset = random.random() * (year_end - year_start)
+    moment = offset + year_start
+    # Discard the sub-second part left over from the float scaling.
+    return moment.replace(microsecond=0)
+
+
+def generate_do(pick_up):
+    """Return a drop-off time 10-120 minutes plus 0-59 seconds after pick_up."""
+    minutes = random.randint(10, 120)  # minutes are drawn before seconds
+    seconds = random.randint(0, 59)
+    return pick_up + datetime.timedelta(minutes=minutes, seconds=seconds)
+
+
+def generate_choice(*options):  # accepts any number of positional candidates
+    return random.choice(options)  # hand back one of them at random
+
+
+def generate_file():
+    """Write a header row plus 860,000,000 random taxi rides to a CSV file."""
+    with open("2021_Yellow_Taxi_Trip_Data.csv", "w", newline="") as csvfile:
+        header = (
+            "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,"
+            "trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,"
+            "payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,"
+            "improvement_surcharge,total_amount,congestion_surcharge,airport_fee"
+        ).split(",")
+        writer = csv.writer(csvfile)
+        writer.writerow(header)
+
+        for _ in range(860_000_000):
+            fare_amount = round(random.uniform(5.0, 100.0), 2)
+            extra = generate_choice(0, 0.5, 3)
+            mta_tax = generate_choice(0, 0.5)
+            tip_amount = round(random.uniform(0.0, 20.0), 2)
+            tolls_amount = generate_choice(0, 6.12)
+            improvement_surcharge = 0.3
+            congestion_surcharge = 2.5
+            total_amount = round(
+                fare_amount
+                + extra
+                + mta_tax
+                + tip_amount
+                + tolls_amount
+                + improvement_surcharge
+                + congestion_surcharge,
+                2,
+            )
+            pick_up = generate_pu()
+
+            ride = [
+                random.randint(0, 6),  # VendorID
+                pick_up,  # tpep_pickup_datetime
+                generate_do(pick_up),  # tpep_dropoff_datetime
+                random.randint(0, 5),  # passenger_count
+                random.randint(0, 15),  # trip_distance
+                random.randint(0, 6),  # RatecodeID
+                generate_choice("Y", "N"),  # store_and_fwd_flag
+                random.randint(1, 265),  # PULocationID
+                random.randint(1, 265),  # DOLocationID
+                random.randint(0, 5),  # payment_type
+                fare_amount,  # fare_amount
+                extra,  # extra
+                mta_tax,  # mta_tax
+                tip_amount,  # tip_amount
+                tolls_amount,  # tolls_amount
+                improvement_surcharge,  # improvement_surcharge
+                total_amount,  # total_amount
+                congestion_surcharge,  # congestion_surcharge
+                None,  # airport_fee (left empty so each row has all 19 header fields)
+            ]
+
+            writer.writerow(ride)
+
+
+generate_file()
diff --git a/polars-lazyframe/tutorial_code.ipynb b/polars-lazyframe/tutorial_code.ipynb
new file mode 100644
index 0000000000..cf68dfd493
--- /dev/null
+++ b/polars-lazyframe/tutorial_code.ipynb
@@ -0,0 +1,896 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "c0a043d2-2fe3-40bd-bf61-b6e12a218a42",
+ "metadata": {},
+ "source": [
+ "# Introduction"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "554f934e-b66f-4633-bc93-8a51edaa25b0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!python -m pip install polars\n",
+ "!python -m pip install matplotlib"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "ec44a544-7828-461e-b735-536f3fe6fd16",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "
shape: (3_076_903, 5)| pick_up | drop_off | passengers | distance | fare |
|---|
| str | str | i32 | i32 | i32 |
| "Manhattan" | "Manhattan" | 1 | 3 | 24 |
| "Queens" | "Manhattan" | 1 | 19 | 75 |
| "Manhattan" | "Queens" | 1 | 1 | 16 |
| "Queens" | "Manhattan" | 0 | 9 | 60 |
| "Queens" | "Manhattan" | 1 | 17 | 90 |
| … | … | … | … | … |
| "Manhattan" | "Manhattan" | null | 5 | 27 |
| "Manhattan" | "Manhattan" | null | 4 | 26 |
| "Queens" | "Brooklyn" | null | 4 | 26 |
| "Manhattan" | "Manhattan" | null | 3 | 24 |
| "Manhattan" | "Manhattan" | null | 8 | 35 |
"
+ ],
+ "text/plain": [
+ "shape: (3_076_903, 5)\n",
+ "┌───────────┬───────────┬────────────┬──────────┬──────┐\n",
+ "│ pick_up ┆ drop_off ┆ passengers ┆ distance ┆ fare │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ str ┆ str ┆ i32 ┆ i32 ┆ i32 │\n",
+ "╞═══════════╪═══════════╪════════════╪══════════╪══════╡\n",
+ "│ Manhattan ┆ Manhattan ┆ 1 ┆ 3 ┆ 24 │\n",
+ "│ Queens ┆ Manhattan ┆ 1 ┆ 19 ┆ 75 │\n",
+ "│ Manhattan ┆ Queens ┆ 1 ┆ 1 ┆ 16 │\n",
+ "│ Queens ┆ Manhattan ┆ 0 ┆ 9 ┆ 60 │\n",
+ "│ Queens ┆ Manhattan ┆ 1 ┆ 17 ┆ 90 │\n",
+ "│ … ┆ … ┆ … ┆ … ┆ … │\n",
+ "│ Manhattan ┆ Manhattan ┆ null ┆ 5 ┆ 27 │\n",
+ "│ Manhattan ┆ Manhattan ┆ null ┆ 4 ┆ 26 │\n",
+ "│ Queens ┆ Brooklyn ┆ null ┆ 4 ┆ 26 │\n",
+ "│ Manhattan ┆ Manhattan ┆ null ┆ 3 ┆ 24 │\n",
+ "│ Manhattan ┆ Manhattan ┆ null ┆ 8 ┆ 35 │\n",
+ "└───────────┴───────────┴────────────┴──────────┴──────┘"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import polars as pl\n",
+ "\n",
+ "rides = pl.scan_parquet(\"rides.parquet\")\n",
+ "\n",
+ "rides.collect()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c4665acc-d207-4a4c-bdc6-10f436e32a99",
+ "metadata": {},
+ "source": [
+ "# How to Create a Polars LazyFrame"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "784b7616-643a-46bc-b449-0fbc5f390178",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (5, 4)| language_id | language | creator | year |
|---|
| i64 | str | str | i64 |
| 0 | "Pascal" | "Niklaus Wirth" | 1970 |
| 1 | "C" | "Dennis Ritchie" | 1973 |
| 2 | "C++" | "Bjarne Stroustrup" | 1985 |
| 3 | "Python" | "Guido van Rossum" | 1991 |
| 4 | "Java" | "James Gosling" | 1995 |
"
+ ],
+ "text/plain": [
+ "shape: (5, 4)\n",
+ "┌─────────────┬──────────┬───────────────────┬──────┐\n",
+ "│ language_id ┆ language ┆ creator ┆ year │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ i64 ┆ str ┆ str ┆ i64 │\n",
+ "╞═════════════╪══════════╪═══════════════════╪══════╡\n",
+ "│ 0 ┆ Pascal ┆ Niklaus Wirth ┆ 1970 │\n",
+ "│ 1 ┆ C ┆ Dennis Ritchie ┆ 1973 │\n",
+ "│ 2 ┆ C++ ┆ Bjarne Stroustrup ┆ 1985 │\n",
+ "│ 3 ┆ Python ┆ Guido van Rossum ┆ 1991 │\n",
+ "│ 4 ┆ Java ┆ James Gosling ┆ 1995 │\n",
+ "└─────────────┴──────────┴───────────────────┴──────┘"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import polars as pl\n",
+ "\n",
+ "programming_languages = {\n",
+ " \"language_id\": range(5),\n",
+ " \"language\": [\"Pascal\", \"C\", \"C++\", \"Python\", \"Java\"],\n",
+ " \"creator\": [\n",
+ " \"Niklaus Wirth\",\n",
+ " \"Dennis Ritchie\",\n",
+ " \"Bjarne Stroustrup\",\n",
+ " \"Guido van Rossum\",\n",
+ " \"James Gosling\",\n",
+ " ],\n",
+ " \"year\": [1970, 1973, 1985, 1991, 1995],\n",
+ "}\n",
+ "\n",
+ "lf = pl.LazyFrame(programming_languages)\n",
+ "lf.collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "168d4db9-87c0-4ce2-aaab-35f128a03788",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (5, 4)| language_id | language | creator | year |
|---|
| i32 | str | str | i32 |
| 0 | "Pascal" | "Niklaus Wirth" | 1970 |
| 1 | "C" | "Dennis Ritchie" | 1973 |
| 2 | "C++" | "Bjarne Stroustrup" | 1985 |
| 3 | "Python" | "Guido van Rossum" | 1991 |
| 4 | "Java" | "James Gosling" | 1995 |
"
+ ],
+ "text/plain": [
+ "shape: (5, 4)\n",
+ "┌─────────────┬──────────┬───────────────────┬──────┐\n",
+ "│ language_id ┆ language ┆ creator ┆ year │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ i32 ┆ str ┆ str ┆ i32 │\n",
+ "╞═════════════╪══════════╪═══════════════════╪══════╡\n",
+ "│ 0 ┆ Pascal ┆ Niklaus Wirth ┆ 1970 │\n",
+ "│ 1 ┆ C ┆ Dennis Ritchie ┆ 1973 │\n",
+ "│ 2 ┆ C++ ┆ Bjarne Stroustrup ┆ 1985 │\n",
+ "│ 3 ┆ Python ┆ Guido van Rossum ┆ 1991 │\n",
+ "│ 4 ┆ Java ┆ James Gosling ┆ 1995 │\n",
+ "└─────────────┴──────────┴───────────────────┴──────┘"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import polars as pl\n",
+ "\n",
+ "programming_languages = {\n",
+ " \"language_id\": range(5),\n",
+ " \"language\": [\"Pascal\", \"C\", \"C++\", \"Python\", \"Java\"],\n",
+ " \"creator\": [\n",
+ " \"Niklaus Wirth\",\n",
+ " \"Dennis Ritchie\",\n",
+ " \"Bjarne Stroustrup\",\n",
+ " \"Guido van Rossum\",\n",
+ " \"James Gosling\",\n",
+ " ],\n",
+ " \"year\": [1970, 1973, 1985, 1991, 1995],\n",
+ "}\n",
+ "\n",
+ "d_types = {\n",
+ " \"language_id\": pl.Int32,\n",
+ " \"language\": pl.String,\n",
+ " \"creator\": pl.String,\n",
+ " \"year\": pl.Int32,\n",
+ "}\n",
+ "\n",
+ "lf = pl.LazyFrame(programming_languages, schema=d_types)\n",
+ "lf.collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "8ee12257-f59c-4166-ab6e-3802a5a4d9b9",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Schema([('language_id', Int32),\n",
+ " ('language', String),\n",
+ " ('creator', String),\n",
+ " ('year', Int32)])"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "lf.collect_schema()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "66b4ea00-1afd-4220-92b8-2a99f81b5c30",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "languages_df = pl.read_csv(\"programming_languages.csv\")\n",
+ "type(languages_df)\n",
+ "\n",
+ "\n",
+ "languages_lf = languages_df.lazy()\n",
+ "type(languages_lf)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bb75309e-777b-48ec-b340-f9233586741c",
+ "metadata": {},
+ "source": [
+ "# How to Monitor LazyFrame Efficiency"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "06aa3446-86ca-43f7-b88c-1ff8d53b0597",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# dataframe_timer.py\n",
+ "\n",
+ "import polars as pl\n",
+ "import time\n",
+ "\n",
+ "start = time.perf_counter()\n",
+ "\n",
+ "for _ in range(10):\n",
+ " rides = pl.read_parquet(\"rides.parquet\")\n",
+ " result = (\n",
+ " rides.filter(pl.col(\"pick_up\") == pl.col(\"drop_off\"))\n",
+ " .group_by(pl.col(\"pick_up\"))\n",
+ " .agg(pl.col(\"fare\").mean())\n",
+ " .filter(\n",
+ " pl.col(\"pick_up\").is_in(\n",
+ " [\"Brooklyn\", \"Bronx\", \"Queens\", \"Manhattan\"]\n",
+ " )\n",
+ " )\n",
+ " )\n",
+ "\n",
+ "end = time.perf_counter()\n",
+ "\n",
+ "f\"Code finished in {(end - start)/10:0.4f} seconds.\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "62c57ae3-7eae-4f61-948a-50e31df28eb9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# lazyframe_timer.py\n",
+ "\n",
+ "import polars as pl\n",
+ "import time\n",
+ "\n",
+ "start = time.perf_counter()\n",
+ "\n",
+ "for _ in range(10):\n",
+ " rides = pl.scan_parquet(\"rides.parquet\")\n",
+ " result = (\n",
+ " rides.filter(pl.col(\"pick_up\") == pl.col(\"drop_off\"))\n",
+ " .group_by(pl.col(\"pick_up\"))\n",
+ " .agg(pl.col(\"fare\").mean())\n",
+ " .filter(\n",
+ " pl.col(\"pick_up\").is_in(\n",
+ " [\"Brooklyn\", \"Bronx\", \"Queens\", \"Manhattan\"]\n",
+ " )\n",
+ " )\n",
+ " ).collect()\n",
+ "\n",
+ "end = time.perf_counter()\n",
+ "\n",
+ "f\"Code finished in {(end - start)/10:0.4f} seconds.\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "723df3df-44e0-40d6-a3d6-22797c37b994",
+ "metadata": {},
+ "source": [
+ "# How LazyFrames Achieve Efficiency\n",
+ "## Investigating the Sub-Optimized Query Plan"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "48a7ae3f-d906-4c96-b8c1-7bf200a97c64",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "FILTER col(\"pick_up\").is_in([Series]) FROM\n",
+ " AGGREGATE\n",
+ " \t[col(\"fare\").mean()] BY [col(\"pick_up\")] FROM\n",
+ " FILTER [(col(\"pick_up\")) == (col(\"drop_off\"))] FROM\n",
+ " Parquet SCAN [rides.parquet]\n",
+ " PROJECT */5 COLUMNS\n"
+ ]
+ }
+ ],
+ "source": [
+ "import polars as pl\n",
+ "\n",
+ "rides = pl.scan_parquet(\"rides.parquet\")\n",
+ "\n",
+ "print(\n",
+ " (\n",
+ " rides.filter(pl.col(\"pick_up\") == pl.col(\"drop_off\"))\n",
+ " .group_by(pl.col(\"pick_up\"))\n",
+ " .agg(pl.col(\"fare\").mean())\n",
+ " .filter(\n",
+ " pl.col(\"pick_up\").is_in(\n",
+ " [\"Brooklyn\", \"Bronx\", \"Queens\", \"Manhattan\"]\n",
+ " )\n",
+ " )\n",
+ " ).explain(optimized=False)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "84df6a05-d10d-48bd-9216-d65a143c21dc",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ ""
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import polars as pl\n",
+ "\n",
+ "rides = pl.scan_parquet(\"rides.parquet\")\n",
+ "\n",
+ "(\n",
+ " rides.filter(pl.col(\"pick_up\") == pl.col(\"drop_off\"))\n",
+ " .group_by(pl.col(\"pick_up\"))\n",
+ " .agg(pl.col(\"fare\").mean())\n",
+ " .filter(\n",
+ " pl.col(\"pick_up\").is_in([\"Brooklyn\", \"Bronx\", \"Queens\", \"Manhattan\"])\n",
+ " )\n",
+ ").show_graph(optimized=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "56ca5932-0c2e-45d9-8df7-8ae9da6da548",
+ "metadata": {},
+ "source": [
+ "## Investigating the Optimized Query Plan"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "f773dc12-283e-4f6e-9d5b-162662421ab6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ ""
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "rides = pl.scan_parquet(\"rides.parquet\")\n",
+ "\n",
+ "(\n",
+ " rides.filter(pl.col(\"pick_up\") == pl.col(\"drop_off\"))\n",
+ " .group_by(pl.col(\"pick_up\"))\n",
+ " .agg(pl.col(\"fare\").mean())\n",
+ " .filter(\n",
+ " pl.col(\"pick_up\").is_in([\"Brooklyn\", \"Bronx\", \"Queens\", \"Manhattan\"])\n",
+ " )\n",
+ ").show_graph(optimized=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "dd581a09-b243-4141-8fec-1f01eac567f8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Schema([('pick_up', String),\n",
+ " ('drop_off', String),\n",
+ " ('passengers', Int32),\n",
+ " ('distance', Int32),\n",
+ " ('fare', Int32)])"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rides.collect_schema()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b5baec65-1fd9-48ea-b315-54fd49d8330e",
+ "metadata": {},
+ "source": [
+ "# How LazyFrames Cope With Large Volumes of Data\n",
+ "## Using Streaming to Deal With Large Volumes\n",
+ "This code may crash your computer. Please ensure you save everything before running it. Also, you should not run this code within a notebook. Notebooks have their own memory configuration that is separate from your computer."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "f8be883e-07e7-456c-9abf-4eb59d66220b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'Code finished in 0.2538 seconds.'"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import polars as pl\n",
+ "\n",
+ "rides = pl.scan_csv(\"2021_Yellow_Taxi_Trip_Data.csv\")\n",
+ "\n",
+ "print(\n",
+ " (\n",
+ " rides.group_by(pl.col(\"PULocationID\")).agg(\n",
+ " pl.col(\"total_amount\").sum()\n",
+ " )\n",
+ " ).collect(streaming=True)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3ca629d3-2abe-4ae9-b81e-be1f99e86a85",
+ "metadata": {},
+ "source": [
+ "## Deciding When Streaming Should Be Used"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "79f96a4c-87fa-4c02-88a5-3f2a3c201a8e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "STREAMING:\n",
+ " AGGREGATE\n",
+ " \t[col(\"drop_off\").mean()] BY [col(\"pick_up\")] FROM\n",
+ " Parquet SCAN [rides.parquet]\n",
+ " PROJECT 2/5 COLUMNS\n",
+ " SELECTION: [(col(\"pick_up\").is_in([Series])) & ([(col(\"pick_up\")) == (col(\"drop_off\"))])]\n"
+ ]
+ }
+ ],
+ "source": [
+ "import polars as pl\n",
+ "\n",
+ "rides = pl.scan_parquet(\"rides.parquet\")\n",
+ "\n",
+ "print(\n",
+ " (\n",
+ " rides.filter(pl.col(\"pick_up\") == pl.col(\"drop_off\"))\n",
+ " .group_by(pl.col(\"pick_up\"))\n",
+ " .agg(pl.col(\"drop_off\").mean())\n",
+ " .filter(\n",
+ " pl.col(\"pick_up\").is_in(\n",
+ " [\"Brooklyn\", \"Bronx\", \"Queens\", \"Manhattan\"]\n",
+ " )\n",
+ " )\n",
+ " ).explain(streaming=True, optimized=True)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "ec4a5be0-48a0-4657-b844-a2b2a53f2e27",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " WITH_COLUMNS:\n",
+ " [col(\"fare\").mean().over([col(\"pick_up\")]).alias(\"mean fare\")] \n",
+ " STREAMING:\n",
+ " Parquet SCAN [rides.parquet]\n",
+ " PROJECT */5 COLUMNS\n",
+ " SELECTION: [(col(\"pick_up\")) == (col(\"drop_off\"))]\n"
+ ]
+ }
+ ],
+ "source": [
+ "import polars as pl\n",
+ "\n",
+ "rides = pl.scan_parquet(\"rides.parquet\")\n",
+ "\n",
+ "print(\n",
+ " (\n",
+ " rides.filter(pl.col(\"pick_up\") == pl.col(\"drop_off\")).with_columns(\n",
+ " pl.col(\"fare\").mean().over(pl.col(\"pick_up\")).alias(\"mean fare\")\n",
+ " )\n",
+ " ).explain(streaming=True, optimized=True)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f3a9ef30-e20a-42d7-9bb6-2e5b225f40b8",
+ "metadata": {},
+ "source": [
+ "# How to Decide if LazyFrames Are Indeed Suitable"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "ca70f902-3aba-4459-ad9b-4560f1434d8f",
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "AttributeError",
+ "evalue": "'LazyFrame' object has no attribute 'pivot'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[13], line 5\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolars\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpl\u001b[39;00m\n\u001b[0;32m 3\u001b[0m rides \u001b[38;5;241m=\u001b[39m pl\u001b[38;5;241m.\u001b[39mscan_parquet(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrides.parquet\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m----> 5\u001b[0m \u001b[43mrides\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpivot\u001b[49m(\n\u001b[0;32m 6\u001b[0m on\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpick_up\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 7\u001b[0m index\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdrop_off\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 8\u001b[0m values\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpassengers\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 9\u001b[0m aggregate_function\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msum\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 10\u001b[0m )\n",
+ "\u001b[1;31mAttributeError\u001b[0m: 'LazyFrame' object has no attribute 'pivot'"
+ ]
+ }
+ ],
+ "source": [
+ "import polars as pl\n",
+ "\n",
+ "rides = pl.scan_parquet(\"rides.parquet\")\n",
+ "\n",
+ "rides.pivot(\n",
+ " on=\"pick_up\",\n",
+ " index=\"drop_off\",\n",
+ " values=\"passengers\",\n",
+ " aggregate_function=\"sum\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "40bbe881-9b18-4fca-a8aa-f65e6167f677",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (8, 9)| drop_off | Manhattan | Queens | Unknown | Brooklyn | N/A | Bronx | EWR | Staten Island |
|---|
| str | i32 | i32 | i32 | i32 | i32 | i32 | i32 | i32 |
| "Manhattan" | 3041648 | 256376 | 4686 | 9736 | 113 | 3749 | 14 | 11 |
| "Queens" | 111178 | 115640 | 629 | 3442 | 148 | 830 | 1 | 3 |
| "Brooklyn" | 68712 | 66116 | 333 | 18854 | 20 | 924 | 1 | 38 |
| "Bronx" | 12006 | 10396 | 59 | 862 | 23 | 1785 | null | null |
| "Unknown" | 9668 | 1016 | 7512 | 50 | 37 | 13 | 10 | null |
| "N/A" | 6312 | 17630 | 66 | 146 | 2974 | 82 | 25 | 5 |
| "Staten Island" | 346 | 718 | 2 | 90 | null | null | null | 45 |
| "EWR" | 12759 | 806 | 27 | 47 | 131 | 6 | 1034 | 3 |
"
+ ],
+ "text/plain": [
+ "shape: (8, 9)\n",
+ "┌───────────────┬───────────┬────────┬─────────┬───┬──────┬───────┬──────┬───────────────┐\n",
+ "│ drop_off ┆ Manhattan ┆ Queens ┆ Unknown ┆ … ┆ N/A ┆ Bronx ┆ EWR ┆ Staten Island │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ str ┆ i32 ┆ i32 ┆ i32 ┆ ┆ i32 ┆ i32 ┆ i32 ┆ i32 │\n",
+ "╞═══════════════╪═══════════╪════════╪═════════╪═══╪══════╪═══════╪══════╪═══════════════╡\n",
+ "│ Manhattan ┆ 3041648 ┆ 256376 ┆ 4686 ┆ … ┆ 113 ┆ 3749 ┆ 14 ┆ 11 │\n",
+ "│ Queens ┆ 111178 ┆ 115640 ┆ 629 ┆ … ┆ 148 ┆ 830 ┆ 1 ┆ 3 │\n",
+ "│ Brooklyn ┆ 68712 ┆ 66116 ┆ 333 ┆ … ┆ 20 ┆ 924 ┆ 1 ┆ 38 │\n",
+ "│ Bronx ┆ 12006 ┆ 10396 ┆ 59 ┆ … ┆ 23 ┆ 1785 ┆ null ┆ null │\n",
+ "│ Unknown ┆ 9668 ┆ 1016 ┆ 7512 ┆ … ┆ 37 ┆ 13 ┆ 10 ┆ null │\n",
+ "│ N/A ┆ 6312 ┆ 17630 ┆ 66 ┆ … ┆ 2974 ┆ 82 ┆ 25 ┆ 5 │\n",
+ "│ Staten Island ┆ 346 ┆ 718 ┆ 2 ┆ … ┆ null ┆ null ┆ null ┆ 45 │\n",
+ "│ EWR ┆ 12759 ┆ 806 ┆ 27 ┆ … ┆ 131 ┆ 6 ┆ 1034 ┆ 3 │\n",
+ "└───────────────┴───────────┴────────┴─────────┴───┴──────┴───────┴──────┴───────────────┘"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " rides.collect().pivot(\n",
+ " on=\"pick_up\",\n",
+ " index=\"drop_off\",\n",
+ " values=\"passengers\",\n",
+ " aggregate_function=\"sum\",\n",
+ " )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "d8797e95-1071-4b71-84a6-3c0fe1063cc5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (3, 4)| drop_off | Queens | Brooklyn | Bronx |
|---|
| str | i32 | i32 | i32 |
| "Brooklyn" | 66116 | 18854 | 924 |
| "Queens" | 115640 | 3442 | 830 |
| "Bronx" | 10396 | 862 | 1785 |
"
+ ],
+ "text/plain": [
+ "shape: (3, 4)\n",
+ "┌──────────┬────────┬──────────┬───────┐\n",
+ "│ drop_off ┆ Queens ┆ Brooklyn ┆ Bronx │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ str ┆ i32 ┆ i32 ┆ i32 │\n",
+ "╞══════════╪════════╪══════════╪═══════╡\n",
+ "│ Brooklyn ┆ 66116 ┆ 18854 ┆ 924 │\n",
+ "│ Queens ┆ 115640 ┆ 3442 ┆ 830 │\n",
+ "│ Bronx ┆ 10396 ┆ 862 ┆ 1785 │\n",
+ "└──────────┴────────┴──────────┴───────┘"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " rides.filter(pl.col(\"pick_up\").is_in([\"Brooklyn\", \"Bronx\", \"Queens\"]))\n",
+ " .filter(pl.col(\"drop_off\").is_in([\"Brooklyn\", \"Bronx\", \"Queens\"]))\n",
+ " .collect()\n",
+ " .pivot(\n",
+ " on=\"pick_up\",\n",
+ " index=\"drop_off\",\n",
+ " values=\"passengers\",\n",
+ " aggregate_function=\"sum\",\n",
+ " )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "0f17622f-c69d-4b3c-8113-0a6aec912089",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (1, 4)| drop_off | Queens | Brooklyn | Bronx |
|---|
| str | i32 | i32 | i32 |
| "Queens" | 115640 | 18854 | 1785 |
"
+ ],
+ "text/plain": [
+ "shape: (1, 4)\n",
+ "┌──────────┬────────┬──────────┬───────┐\n",
+ "│ drop_off ┆ Queens ┆ Brooklyn ┆ Bronx │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ str ┆ i32 ┆ i32 ┆ i32 │\n",
+ "╞══════════╪════════╪══════════╪═══════╡\n",
+ "│ Queens ┆ 115640 ┆ 18854 ┆ 1785 │\n",
+ "└──────────┴────────┴──────────┴───────┘"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " rides.filter(pl.col(\"pick_up\").is_in([\"Brooklyn\", \"Bronx\", \"Queens\"]))\n",
+ " .filter(pl.col(\"drop_off\").is_in([\"Brooklyn\", \"Bronx\", \"Queens\"]))\n",
+ " .collect()\n",
+ " .pivot(\n",
+ " on=\"pick_up\",\n",
+ " index=\"drop_off\",\n",
+ " values=\"passengers\",\n",
+ " aggregate_function=\"sum\",\n",
+ " )\n",
+ " .lazy()\n",
+ " .select(pl.max(\"*\"))\n",
+ ").collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "eb0bdf0d-7547-40dd-8ad5-9410ec34b9ed",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['corr', 'drop_in_place', 'equals', 'estimated_size', 'extend', 'flags', 'fold', 'get_column', 'get_column_index', 'get_columns', 'glimpse', 'hash_rows', 'height', 'hstack', 'insert_column', 'is_duplicated', 'is_empty', 'is_unique', 'item', 'iter_columns', 'iter_rows', 'iter_slices', 'map_rows', 'max_horizontal', 'mean_horizontal', 'min_horizontal', 'n_chunks', 'n_unique', 'partition_by', 'pivot', 'plot', 'product', 'rechunk', 'replace_column', 'row', 'rows', 'rows_by_key', 'sample', 'shape', 'shrink_to_fit', 'style', 'sum_horizontal', 'to_arrow', 'to_dict', 'to_dicts', 'to_dummies', 'to_init_repr', 'to_jax', 'to_numpy', 'to_pandas', 'to_series', 'to_struct', 'to_torch', 'transpose', 'unstack', 'upsample', 'vstack', 'write_avro', 'write_clipboard', 'write_csv', 'write_database', 'write_delta', 'write_excel', 'write_ipc', 'write_ipc_stream', 'write_json', 'write_ndjson', 'write_parquet']\n"
+ ]
+ }
+ ],
+ "source": [
+ "df_ops = set(x for x in dir(pl.DataFrame()) if not x.startswith(\"_\"))\n",
+ "lazy_ops = set(x for x in dir(pl.LazyFrame()) if not x.startswith(\"_\"))\n",
+ "print(sorted(df_ops - lazy_ops))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "98f141b2-3ed1-4dd0-91db-0573896153dc",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}