diff --git a/polars-lazyframe/README.md b/polars-lazyframe/README.md new file mode 100644 index 0000000000..5a4e876aa5 --- /dev/null +++ b/polars-lazyframe/README.md @@ -0,0 +1,11 @@ +# How to Work With Polars LazyFrames + +This folder contains completed notebooks and other files used in the Real Python tutorial [How to Work With Polars LazyFrames](https://realpython.com/how-to-work-with-polars-lazyframe/). + +**The following files are included:** + +- `tutorial_code.ipynb` is a Jupyter Notebook containing all the code used in the tutorial. +- `rides.parquet` contains taxi fare data used throughout the tutorial. [See below] +- `taxi_rides_nnGB.py` is a series of Python scripts that generate local copies of `2021_Yellow_Taxi_Trip_Data.csv` in different sizes. These files are provided to help demonstrate data streaming. +- `programming_languages.csv` is a small file used to illustrate the conversion between a DataFrame and a LazyFrame. +- `dataframe_timer.py` and `lazyframe_timer.py` are two scripts used to compare the speed of performing the same analysis with a DataFrame and with a LazyFrame. diff --git a/polars-lazyframe/dataframe_timer.py b/polars-lazyframe/dataframe_timer.py new file mode 100644 index 0000000000..17eef0c961 --- /dev/null +++ b/polars-lazyframe/dataframe_timer.py @@ -0,0 +1,22 @@ +import time + +import polars as pl + +start = time.perf_counter() + +for _ in range(10): + rides = pl.read_parquet("rides.parquet") + result = ( + rides.filter(pl.col("pick_up") == pl.col("drop_off")) + .group_by(pl.col("pick_up")) + .agg(pl.col("fare").mean()) + .filter( + pl.col("pick_up").is_in( + ["Brooklyn", "Bronx", "Queens", "Manhattan"] + ) + ) + ) + +end = time.perf_counter() + +print(f"Code finished in {(end - start)/10:0.4f} seconds.")
diff --git a/polars-lazyframe/lazyframe_timer.py b/polars-lazyframe/lazyframe_timer.py new file mode 100644 index 0000000000..643b5f9517 --- /dev/null +++ b/polars-lazyframe/lazyframe_timer.py @@ -0,0 +1,22 @@ +import time + +import polars as pl + +start = time.perf_counter() + +for _ in range(10): + rides = pl.scan_parquet("rides.parquet") + result = ( + rides.filter(pl.col("pick_up") == pl.col("drop_off")) + .group_by(pl.col("pick_up")) + .agg(pl.col("fare").mean()) + .filter( + pl.col("pick_up").is_in( + ["Brooklyn", "Bronx", "Queens", "Manhattan"] + ) + ) + ).collect() + +end = time.perf_counter() + +print(f"Code finished in {(end - start)/10:0.4f} seconds.") diff --git a/polars-lazyframe/programming_languages.csv b/polars-lazyframe/programming_languages.csv new file mode 100644 index 0000000000..9adf04219b --- /dev/null +++ b/polars-lazyframe/programming_languages.csv @@ -0,0 +1,6 @@ +language_id,language,creator,year +0,Pascal,Niklaus Wirth,1970 +1,C,Dennis Ritchie,1973 +2,C++,Bjarne Stroustrup,1985 +3,Python,Guido van Rossum,1991 +4,Java,James Gosling,1995 diff --git a/polars-lazyframe/rides.parquet b/polars-lazyframe/rides.parquet new file mode 100644 index 0000000000..7bf6890b90 Binary files /dev/null and b/polars-lazyframe/rides.parquet differ diff --git a/polars-lazyframe/taxi_rides_16GB.py b/polars-lazyframe/taxi_rides_16GB.py new file mode 100644 index 0000000000..a2fac947fb --- /dev/null +++ b/polars-lazyframe/taxi_rides_16GB.py @@ -0,0 +1,87 @@ +import csv +import datetime +import random + +random.seed(10) + + +def generate_pu(): + lower_bound = datetime.datetime( + year=2021, month=1, day=1, hour=0, minute=0, second=0 + ) + upper_bound = datetime.datetime( + year=2021, month=12, day=31, hour=23, minute=59, second=59 + ) + random_pickup = random.random() * (upper_bound - lower_bound) + lower_bound + return random_pickup.replace(microsecond=0) + + +def generate_do(pick_up): + random_dropoff = pick_up + datetime.timedelta( + minutes=random.randint(10, 120), 
seconds=random.randint(0, 59) + ) + return random_dropoff + + +def generate_choice(*kwargs): + return random.choice(kwargs) + + +def generate_file(): + with open("2021_Yellow_Taxi_Trip_Data.csv", "w", newline="") as csvfile: + header = ( + "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count," + "trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID," + "payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," + "improvement_surcharge,total_amount,congestion_surcharge,airport_fee" + ).split(",") + + writer = csv.writer(csvfile) + writer.writerow(header) + + for _ in range(170_000_000): + fare_amount = round(random.uniform(5.0, 100.0), 2) + extra = generate_choice(0, 0.5, 3) + mta_tax = generate_choice(0, 0.5) + tip_amount = round(random.uniform(0.0, 20.0), 2) + tolls_amount = generate_choice(0, 6.12) + improvement_surcharge = 0.3 + congestion_surcharge = 2.5 + total_amount = round( + fare_amount + + extra + + mta_tax + + tip_amount + + tolls_amount + + improvement_surcharge + + congestion_surcharge, + 2, + ) + + pick_up = generate_pu() + + ride = [ + random.randint(0, 6), # VendorID + pick_up, # tpep_pickup_datetime + generate_do(pick_up), # tpep_dropoff_datetime + random.randint(0, 5), # passenger_count + random.randint(0, 15), # trip_distance + random.randint(0, 6), # RatecodeID + generate_choice("Y", "N"), # store_and_fwd_flag + random.randint(1, 265), # PULocationID + random.randint(1, 265), # DOLocationID + random.randint(0, 5), # payment_type + fare_amount, # fare_amount + extra, # extra + mta_tax, # mta_tax + tip_amount, # tip_amount + tolls_amount, # tolls_amount + improvement_surcharge, # improvement_surcharge + total_amount, # total_amount + congestion_surcharge, # congestion_surcharge + ] + + writer.writerow(ride) + + +generate_file() diff --git a/polars-lazyframe/taxi_rides_24GB.py b/polars-lazyframe/taxi_rides_24GB.py new file mode 100644 index 0000000000..95efb7fc41 --- /dev/null +++ 
b/polars-lazyframe/taxi_rides_24GB.py @@ -0,0 +1,87 @@ +import csv +import datetime +import random + +random.seed(10) + + +def generate_pu(): + lower_bound = datetime.datetime( + year=2021, month=1, day=1, hour=0, minute=0, second=0 + ) + upper_bound = datetime.datetime( + year=2021, month=12, day=31, hour=23, minute=59, second=59 + ) + random_pickup = random.random() * (upper_bound - lower_bound) + lower_bound + return random_pickup.replace(microsecond=0) + + +def generate_do(pick_up): + random_dropoff = pick_up + datetime.timedelta( + minutes=random.randint(10, 120), seconds=random.randint(0, 59) + ) + return random_dropoff + + +def generate_choice(*kwargs): + return random.choice(kwargs) + + +def generate_file(): + with open("2021_Yellow_Taxi_Trip_Data.csv", "w", newline="") as csvfile: + header = ( + "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count," + "trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID," + "payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," + "improvement_surcharge,total_amount,congestion_surcharge,airport_fee" + ).split(",") + + writer = csv.writer(csvfile) + writer.writerow(header) + + for _ in range(255_000_000): + fare_amount = round(random.uniform(5.0, 100.0), 2) + extra = generate_choice(0, 0.5, 3) + mta_tax = generate_choice(0, 0.5) + tip_amount = round(random.uniform(0.0, 20.0), 2) + tolls_amount = generate_choice(0, 6.12) + improvement_surcharge = 0.3 + congestion_surcharge = 2.5 + total_amount = round( + fare_amount + + extra + + mta_tax + + tip_amount + + tolls_amount + + improvement_surcharge + + congestion_surcharge, + 2, + ) + + pick_up = generate_pu() + + ride = [ + random.randint(0, 6), # VendorID + pick_up, # tpep_pickup_datetime + generate_do(pick_up), # tpep_dropoff_datetime + random.randint(0, 5), # passenger_count + random.randint(0, 15), # trip_distance + random.randint(0, 6), # RatecodeID + generate_choice("Y", "N"), # store_and_fwd_flag + random.randint(1, 265), # 
PULocationID + random.randint(1, 265), # DOLocationID + random.randint(0, 5), # payment_type + fare_amount, # fare_amount + extra, # extra + mta_tax, # mta_tax + tip_amount, # tip_amount + tolls_amount, # tolls_amount + improvement_surcharge, # improvement_surcharge + total_amount, # total_amount + congestion_surcharge, # congestion_surcharge + ] + + writer.writerow(ride) + + +generate_file() diff --git a/polars-lazyframe/taxi_rides_32GB.py b/polars-lazyframe/taxi_rides_32GB.py new file mode 100644 index 0000000000..9d4f13ed02 --- /dev/null +++ b/polars-lazyframe/taxi_rides_32GB.py @@ -0,0 +1,87 @@ +import csv +import datetime +import random + +random.seed(10) + + +def generate_pu(): + lower_bound = datetime.datetime( + year=2021, month=1, day=1, hour=0, minute=0, second=0 + ) + upper_bound = datetime.datetime( + year=2021, month=12, day=31, hour=23, minute=59, second=59 + ) + random_pickup = random.random() * (upper_bound - lower_bound) + lower_bound + return random_pickup.replace(microsecond=0) + + +def generate_do(pick_up): + random_dropoff = pick_up + datetime.timedelta( + minutes=random.randint(10, 120), seconds=random.randint(0, 59) + ) + return random_dropoff + + +def generate_choice(*kwargs): + return random.choice(kwargs) + + +def generate_file(): + with open("2021_Yellow_Taxi_Trip_Data.csv", "w", newline="") as csvfile: + header = ( + "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count," + "trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID," + "payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," + "improvement_surcharge,total_amount,congestion_surcharge,airport_fee" + ).split(",") + + writer = csv.writer(csvfile) + writer.writerow(header) + + for _ in range(340_000_000): + fare_amount = round(random.uniform(5.0, 100.0), 2) + extra = generate_choice(0, 0.5, 3) + mta_tax = generate_choice(0, 0.5) + tip_amount = round(random.uniform(0.0, 20.0), 2) + tolls_amount = generate_choice(0, 6.12) + 
improvement_surcharge = 0.3 + congestion_surcharge = 2.5 + total_amount = round( + fare_amount + + extra + + mta_tax + + tip_amount + + tolls_amount + + improvement_surcharge + + congestion_surcharge, + 2, + ) + + pick_up = generate_pu() + + ride = [ + random.randint(0, 6), # VendorID + pick_up, # tpep_pickup_datetime + generate_do(pick_up), # tpep_dropoff_datetime + random.randint(0, 5), # passenger_count + random.randint(0, 15), # trip_distance + random.randint(0, 6), # RatecodeID + generate_choice("Y", "N"), # store_and_fwd_flag + random.randint(1, 265), # PULocationID + random.randint(1, 265), # DOLocationID + random.randint(0, 5), # payment_type + fare_amount, # fare_amount + extra, # extra + mta_tax, # mta_tax + tip_amount, # tip_amount + tolls_amount, # tolls_amount + improvement_surcharge, # improvement_surcharge + total_amount, # total_amount + congestion_surcharge, # congestion_surcharge + ] + + writer.writerow(ride) + + +generate_file() diff --git a/polars-lazyframe/taxi_rides_48GB.py b/polars-lazyframe/taxi_rides_48GB.py new file mode 100644 index 0000000000..b52ba09563 --- /dev/null +++ b/polars-lazyframe/taxi_rides_48GB.py @@ -0,0 +1,87 @@ +import csv +import datetime +import random + +random.seed(10) + + +def generate_pu(): + lower_bound = datetime.datetime( + year=2021, month=1, day=1, hour=0, minute=0, second=0 + ) + upper_bound = datetime.datetime( + year=2021, month=12, day=31, hour=23, minute=59, second=59 + ) + random_pickup = random.random() * (upper_bound - lower_bound) + lower_bound + return random_pickup.replace(microsecond=0) + + +def generate_do(pick_up): + random_dropoff = pick_up + datetime.timedelta( + minutes=random.randint(10, 120), seconds=random.randint(0, 59) + ) + return random_dropoff + + +def generate_choice(*kwargs): + return random.choice(kwargs) + + +def generate_file(): + with open("2021_Yellow_Taxi_Trip_Data.csv", "w", newline="") as csvfile: + header = ( + 
"VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count," + "trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID," + "payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," + "improvement_surcharge,total_amount,congestion_surcharge,airport_fee" + ).split(",") + + writer = csv.writer(csvfile) + writer.writerow(header) + + for _ in range(510_000_000): + fare_amount = round(random.uniform(5.0, 100.0), 2) + extra = generate_choice(0, 0.5, 3) + mta_tax = generate_choice(0, 0.5) + tip_amount = round(random.uniform(0.0, 20.0), 2) + tolls_amount = generate_choice(0, 6.12) + improvement_surcharge = 0.3 + congestion_surcharge = 2.5 + total_amount = round( + fare_amount + + extra + + mta_tax + + tip_amount + + tolls_amount + + improvement_surcharge + + congestion_surcharge, + 2, + ) + + pick_up = generate_pu() + + ride = [ + random.randint(0, 6), # VendorID + pick_up, # tpep_pickup_datetime + generate_do(pick_up), # tpep_dropoff_datetime + random.randint(0, 5), # passenger_count + random.randint(0, 15), # trip_distance + random.randint(0, 6), # RatecodeID + generate_choice("Y", "N"), # store_and_fwd_flag + random.randint(1, 265), # PULocationID + random.randint(1, 265), # DOLocationID + random.randint(0, 5), # payment_type + fare_amount, # fare_amount + extra, # extra + mta_tax, # mta_tax + tip_amount, # tip_amount + tolls_amount, # tolls_amount + improvement_surcharge, # improvement_surcharge + total_amount, # total_amount + congestion_surcharge, # congestion_surcharge + ] + + writer.writerow(ride) + + +generate_file() diff --git a/polars-lazyframe/taxi_rides_64GB.py b/polars-lazyframe/taxi_rides_64GB.py new file mode 100644 index 0000000000..f625681269 --- /dev/null +++ b/polars-lazyframe/taxi_rides_64GB.py @@ -0,0 +1,87 @@ +import csv +import datetime +import random + +random.seed(10) + + +def generate_pu(): + lower_bound = datetime.datetime( + year=2021, month=1, day=1, hour=0, minute=0, second=0 + ) + upper_bound = 
datetime.datetime( + year=2021, month=12, day=31, hour=23, minute=59, second=59 + ) + random_pickup = random.random() * (upper_bound - lower_bound) + lower_bound + return random_pickup.replace(microsecond=0) + + +def generate_do(pick_up): + random_dropoff = pick_up + datetime.timedelta( + minutes=random.randint(10, 120), seconds=random.randint(0, 59) + ) + return random_dropoff + + +def generate_choice(*kwargs): + return random.choice(kwargs) + + +def generate_file(): + with open("2021_Yellow_Taxi_Trip_Data.csv", "w", newline="") as csvfile: + header = ( + "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count," + "trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID," + "payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," + "improvement_surcharge,total_amount,congestion_surcharge,airport_fee" + ).split(",") + + writer = csv.writer(csvfile) + writer.writerow(header) + + for _ in range(690_000_000): + fare_amount = round(random.uniform(5.0, 100.0), 2) + extra = generate_choice(0, 0.5, 3) + mta_tax = generate_choice(0, 0.5) + tip_amount = round(random.uniform(0.0, 20.0), 2) + tolls_amount = generate_choice(0, 6.12) + improvement_surcharge = 0.3 + congestion_surcharge = 2.5 + total_amount = round( + fare_amount + + extra + + mta_tax + + tip_amount + + tolls_amount + + improvement_surcharge + + congestion_surcharge, + 2, + ) + + pick_up = generate_pu() + + ride = [ + random.randint(0, 6), # VendorID + pick_up, # tpep_pickup_datetime + generate_do(pick_up), # tpep_dropoff_datetime + random.randint(0, 5), # passenger_count + random.randint(0, 15), # trip_distance + random.randint(0, 6), # RatecodeID + generate_choice("Y", "N"), # store_and_fwd_flag + random.randint(1, 265), # PULocationID + random.randint(1, 265), # DOLocationID + random.randint(0, 5), # payment_type + fare_amount, # fare_amount + extra, # extra + mta_tax, # mta_tax + tip_amount, # tip_amount + tolls_amount, # tolls_amount + improvement_surcharge, # 
improvement_surcharge + total_amount, # total_amount + congestion_surcharge, # congestion_surcharge + ] + + writer.writerow(ride) + + +generate_file() diff --git a/polars-lazyframe/taxi_rides_80GB.py b/polars-lazyframe/taxi_rides_80GB.py new file mode 100644 index 0000000000..4d9964e5f8 --- /dev/null +++ b/polars-lazyframe/taxi_rides_80GB.py @@ -0,0 +1,87 @@ +import csv +import datetime +import random + +random.seed(10) + + +def generate_pu(): + lower_bound = datetime.datetime( + year=2021, month=1, day=1, hour=0, minute=0, second=0 + ) + upper_bound = datetime.datetime( + year=2021, month=12, day=31, hour=23, minute=59, second=59 + ) + random_pickup = random.random() * (upper_bound - lower_bound) + lower_bound + return random_pickup.replace(microsecond=0) + + +def generate_do(pick_up): + random_dropoff = pick_up + datetime.timedelta( + minutes=random.randint(10, 120), seconds=random.randint(0, 59) + ) + return random_dropoff + + +def generate_choice(*kwargs): + return random.choice(kwargs) + + +def generate_file(): + with open("2021_Yellow_Taxi_Trip_Data.csv", "w", newline="") as csvfile: + header = ( + "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count," + "trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID," + "payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," + "improvement_surcharge,total_amount,congestion_surcharge,airport_fee" + ).split(",") + + writer = csv.writer(csvfile) + writer.writerow(header) + + for _ in range(860_000_000): + fare_amount = round(random.uniform(5.0, 100.0), 2) + extra = generate_choice(0, 0.5, 3) + mta_tax = generate_choice(0, 0.5) + tip_amount = round(random.uniform(0.0, 20.0), 2) + tolls_amount = generate_choice(0, 6.12) + improvement_surcharge = 0.3 + congestion_surcharge = 2.5 + total_amount = round( + fare_amount + + extra + + mta_tax + + tip_amount + + tolls_amount + + improvement_surcharge + + congestion_surcharge, + 2, + ) + + pick_up = generate_pu() + + ride = [ + 
random.randint(0, 6), # VendorID + pick_up, # tpep_pickup_datetime + generate_do(pick_up), # tpep_dropoff_datetime + random.randint(0, 5), # passenger_count + random.randint(0, 15), # trip_distance + random.randint(0, 6), # RatecodeID + generate_choice("Y", "N"), # store_and_fwd_flag + random.randint(1, 265), # PULocationID + random.randint(1, 265), # DOLocationID + random.randint(0, 5), # payment_type + fare_amount, # fare_amount + extra, # extra + mta_tax, # mta_tax + tip_amount, # tip_amount + tolls_amount, # tolls_amount + improvement_surcharge, # improvement_surcharge + total_amount, # total_amount + congestion_surcharge, # congestion_surcharge + ] + + writer.writerow(ride) + + +generate_file() diff --git a/polars-lazyframe/tutorial_code.ipynb b/polars-lazyframe/tutorial_code.ipynb new file mode 100644 index 0000000000..cf68dfd493 --- /dev/null +++ b/polars-lazyframe/tutorial_code.ipynb @@ -0,0 +1,896 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c0a043d2-2fe3-40bd-bf61-b6e12a218a42", + "metadata": {}, + "source": [ + "# Introduction" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "554f934e-b66f-4633-bc93-8a51edaa25b0", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install polars\n", + "!python -m pip install matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ec44a544-7828-461e-b735-536f3fe6fd16", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (3_076_903, 5)
pick_updrop_offpassengersdistancefare
strstri32i32i32
"Manhattan""Manhattan"1324
"Queens""Manhattan"11975
"Manhattan""Queens"1116
"Queens""Manhattan"0960
"Queens""Manhattan"11790
"Manhattan""Manhattan"null527
"Manhattan""Manhattan"null426
"Queens""Brooklyn"null426
"Manhattan""Manhattan"null324
"Manhattan""Manhattan"null835
" + ], + "text/plain": [ + "shape: (3_076_903, 5)\n", + "┌───────────┬───────────┬────────────┬──────────┬──────┐\n", + "│ pick_up ┆ drop_off ┆ passengers ┆ distance ┆ fare │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ i32 ┆ i32 ┆ i32 │\n", + "╞═══════════╪═══════════╪════════════╪══════════╪══════╡\n", + "│ Manhattan ┆ Manhattan ┆ 1 ┆ 3 ┆ 24 │\n", + "│ Queens ┆ Manhattan ┆ 1 ┆ 19 ┆ 75 │\n", + "│ Manhattan ┆ Queens ┆ 1 ┆ 1 ┆ 16 │\n", + "│ Queens ┆ Manhattan ┆ 0 ┆ 9 ┆ 60 │\n", + "│ Queens ┆ Manhattan ┆ 1 ┆ 17 ┆ 90 │\n", + "│ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ Manhattan ┆ Manhattan ┆ null ┆ 5 ┆ 27 │\n", + "│ Manhattan ┆ Manhattan ┆ null ┆ 4 ┆ 26 │\n", + "│ Queens ┆ Brooklyn ┆ null ┆ 4 ┆ 26 │\n", + "│ Manhattan ┆ Manhattan ┆ null ┆ 3 ┆ 24 │\n", + "│ Manhattan ┆ Manhattan ┆ null ┆ 8 ┆ 35 │\n", + "└───────────┴───────────┴────────────┴──────────┴──────┘" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import polars as pl\n", + "\n", + "rides = pl.scan_parquet(\"rides.parquet\")\n", + "\n", + "rides.collect()" + ] + }, + { + "cell_type": "markdown", + "id": "c4665acc-d207-4a4c-bdc6-10f436e32a99", + "metadata": {}, + "source": [ + "# How to Create a Polars LazyFrame" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "784b7616-643a-46bc-b449-0fbc5f390178", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 4)
language_idlanguagecreatoryear
i64strstri64
0"Pascal""Niklaus Wirth"1970
1"C""Dennis Ritchie"1973
2"C++""Bjarne Stroustrup"1985
3"Python""Guido van Rossum"1991
4"Java""James Gosling"1995
" + ], + "text/plain": [ + "shape: (5, 4)\n", + "┌─────────────┬──────────┬───────────────────┬──────┐\n", + "│ language_id ┆ language ┆ creator ┆ year │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ str ┆ str ┆ i64 │\n", + "╞═════════════╪══════════╪═══════════════════╪══════╡\n", + "│ 0 ┆ Pascal ┆ Niklaus Wirth ┆ 1970 │\n", + "│ 1 ┆ C ┆ Dennis Ritchie ┆ 1973 │\n", + "│ 2 ┆ C++ ┆ Bjarne Stroustrup ┆ 1985 │\n", + "│ 3 ┆ Python ┆ Guido van Rossum ┆ 1991 │\n", + "│ 4 ┆ Java ┆ James Gosling ┆ 1995 │\n", + "└─────────────┴──────────┴───────────────────┴──────┘" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import polars as pl\n", + "\n", + "programming_languages = {\n", + " \"language_id\": range(5),\n", + " \"language\": [\"Pascal\", \"C\", \"C++\", \"Python\", \"Java\"],\n", + " \"creator\": [\n", + " \"Niklaus Wirth\",\n", + " \"Dennis Ritchie\",\n", + " \"Bjarne Stroustrup\",\n", + " \"Guido van Rossum\",\n", + " \"James Gosling\",\n", + " ],\n", + " \"year\": [1970, 1973, 1985, 1991, 1995],\n", + "}\n", + "\n", + "lf = pl.LazyFrame(programming_languages)\n", + "lf.collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "168d4db9-87c0-4ce2-aaab-35f128a03788", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 4)
language_idlanguagecreatoryear
i32strstri32
0"Pascal""Niklaus Wirth"1970
1"C""Dennis Ritchie"1973
2"C++""Bjarne Stroustrup"1985
3"Python""Guido van Rossum"1991
4"Java""James Gosling"1995
" + ], + "text/plain": [ + "shape: (5, 4)\n", + "┌─────────────┬──────────┬───────────────────┬──────┐\n", + "│ language_id ┆ language ┆ creator ┆ year │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i32 ┆ str ┆ str ┆ i32 │\n", + "╞═════════════╪══════════╪═══════════════════╪══════╡\n", + "│ 0 ┆ Pascal ┆ Niklaus Wirth ┆ 1970 │\n", + "│ 1 ┆ C ┆ Dennis Ritchie ┆ 1973 │\n", + "│ 2 ┆ C++ ┆ Bjarne Stroustrup ┆ 1985 │\n", + "│ 3 ┆ Python ┆ Guido van Rossum ┆ 1991 │\n", + "│ 4 ┆ Java ┆ James Gosling ┆ 1995 │\n", + "└─────────────┴──────────┴───────────────────┴──────┘" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import polars as pl\n", + "\n", + "programming_languages = {\n", + " \"language_id\": range(5),\n", + " \"language\": [\"Pascal\", \"C\", \"C++\", \"Python\", \"Java\"],\n", + " \"creator\": [\n", + " \"Niklaus Wirth\",\n", + " \"Dennis Ritchie\",\n", + " \"Bjarne Stroustrup\",\n", + " \"Guido van Rossum\",\n", + " \"James Gosling\",\n", + " ],\n", + " \"year\": [1970, 1973, 1985, 1991, 1995],\n", + "}\n", + "\n", + "d_types = {\n", + " \"language_id\": pl.Int32,\n", + " \"language\": pl.String,\n", + " \"creator\": pl.String,\n", + " \"year\": pl.Int32,\n", + "}\n", + "\n", + "lf = pl.LazyFrame(programming_languages, schema=d_types)\n", + "lf.collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8ee12257-f59c-4166-ab6e-3802a5a4d9b9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Schema([('language_id', Int32),\n", + " ('language', String),\n", + " ('creator', String),\n", + " ('year', Int32)])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "languages.collect_schema()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66b4ea00-1afd-4220-92b8-2a99f81b5c30", + "metadata": {}, + "outputs": [], + "source": [ + "languages_df = pl.read_csv(\"programming_languages.csv\")\n", 
+ "type(languages_df)\n", + "\n", + "\n", + "languages_lf = languages_df.lazy()\n", + "type(languages_lf)" + ] + }, + { + "cell_type": "markdown", + "id": "bb75309e-777b-48ec-b340-f9233586741c", + "metadata": {}, + "source": [ + "# How to Monitor LazyFrame Efficiency" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06aa3446-86ca-43f7-b88c-1ff8d53b0597", + "metadata": {}, + "outputs": [], + "source": [ + "# dataframe_timer.py\n", + "\n", + "import polars as pl\n", + "import time\n", + "\n", + "start = time.perf_counter()\n", + "\n", + "for _ in range(10):\n", + " rides = pl.read_parquet(\"rides.parquet\")\n", + " result = (\n", + " rides.filter(pl.col(\"pick_up\") == pl.col(\"drop_off\"))\n", + " .group_by(pl.col(\"pick_up\"))\n", + " .agg(pl.col(\"fare\").mean())\n", + " .filter(\n", + " pl.col(\"pick_up\").is_in(\n", + " [\"Brooklyn\", \"Bronx\", \"Queens\", \"Manhattan\"]\n", + " )\n", + " )\n", + " )\n", + "\n", + "end = time.perf_counter()\n", + "\n", + "f\"Code finished in {(end - start)/10:0.4f} seconds.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62c57ae3-7eae-4f61-948a-50e31df28eb9", + "metadata": {}, + "outputs": [], + "source": [ + "# lazyframe_timer.py\n", + "\n", + "import polars as pl\n", + "import time\n", + "\n", + "start = time.perf_counter()\n", + "\n", + "for _ in range(10):\n", + " rides = pl.scan_parquet(\"rides.parquet\")\n", + " result = (\n", + " rides.filter(pl.col(\"pick_up\") == pl.col(\"drop_off\"))\n", + " .group_by(pl.col(\"pick_up\"))\n", + " .agg(pl.col(\"fare\").mean())\n", + " .filter(\n", + " pl.col(\"pick_up\").is_in(\n", + " [\"Brooklyn\", \"Bronx\", \"Queens\", \"Manhattan\"]\n", + " )\n", + " )\n", + " ).collect()\n", + "\n", + "end = time.perf_counter()\n", + "\n", + "f\"Code finished in {(end - start)/10:0.4f} seconds.\"" + ] + }, + { + "cell_type": "markdown", + "id": "723df3df-44e0-40d6-a3d6-22797c37b994", + "metadata": {}, + "source": [ + "# How LazyFrames Achieve 
Efficiency\n", + "## Investigating the Sub-Optimized Query Plan" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "48a7ae3f-d906-4c96-b8c1-7bf200a97c64", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FILTER col(\"pick_up\").is_in([Series]) FROM\n", + " AGGREGATE\n", + " \t[col(\"fare\").mean()] BY [col(\"pick_up\")] FROM\n", + " FILTER [(col(\"pick_up\")) == (col(\"drop_off\"))] FROM\n", + " Parquet SCAN [rides.parquet]\n", + " PROJECT */5 COLUMNS\n" + ] + } + ], + "source": [ + "import polars as pl\n", + "\n", + "rides = pl.scan_parquet(\"rides.parquet\")\n", + "\n", + "print(\n", + " (\n", + " rides.filter(pl.col(\"pick_up\") == pl.col(\"drop_off\"))\n", + " .group_by(pl.col(\"pick_up\"))\n", + " .agg(pl.col(\"fare\").mean())\n", + " .filter(\n", + " pl.col(\"pick_up\").is_in(\n", + " [\"Brooklyn\", \"Bronx\", \"Queens\", \"Manhattan\"]\n", + " )\n", + " )\n", + " ).explain(optimized=False)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "84df6a05-d10d-48bd-9216-d65a143c21dc", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "polars_query\n", + "\n", + "\n", + "\n", + "p1\n", + "\n", + "FILTER BY col(\"pick_up\").is_in([Series])\n", + "\n", + "\n", + "\n", + "p2\n", + "\n", + "AGG [col(\"fare\").mean()]\n", + "BY\n", + "[col(\"pick_up\")]\n", + "\n", + "\n", + "\n", + "p1--p2\n", + "\n", + "\n", + "\n", + "\n", + "p3\n", + "\n", + "FILTER BY [(col(\"pick_up\")) == (col(\"drop_off\"))]\n", + "\n", + "\n", + "\n", + "p2--p3\n", + "\n", + "\n", + "\n", + "\n", + "p4\n", + "\n", + "Parquet SCAN [rides.parquet]\n", + "π */5;\n", + "\n", + "\n", + "\n", + "p3--p4\n", + "\n", + "\n", + "\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import polars as pl\n", + "\n", + "rides = pl.scan_parquet(\"rides.parquet\")\n", + "\n", + "(\n", + " 
rides.filter(pl.col(\"pick_up\") == pl.col(\"drop_off\"))\n", + " .group_by(pl.col(\"pick_up\"))\n", + " .agg(pl.col(\"fare\").mean())\n", + " .filter(\n", + " pl.col(\"pick_up\").is_in([\"Brooklyn\", \"Bronx\", \"Queens\", \"Manhattan\"])\n", + " )\n", + ").show_graph(optimized=False)" + ] + }, + { + "cell_type": "markdown", + "id": "56ca5932-0c2e-45d9-8df7-8ae9da6da548", + "metadata": {}, + "source": [ + "## Investigating the Optimized Query Plan" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f773dc12-283e-4f6e-9d5b-162662421ab6", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "polars_query\n", + "\n", + "\n", + "\n", + "p1\n", + "\n", + "AGG [col(\"fare\").mean()]\n", + "BY\n", + "[col(\"pick_up\")]\n", + "\n", + "\n", + "\n", + "p2\n", + "\n", + "Parquet SCAN [rides.parquet]\n", + "π 3/5;\n", + "σ [(col(\"pick_up\").is_in([Series])) & ([(col(\"pick_up\")) == (col(\"drop_off\"))])]\n", + "\n", + "\n", + "\n", + "p1--p2\n", + "\n", + "\n", + "\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "rides = pl.scan_parquet(\"rides.parquet\")\n", + "\n", + "(\n", + " rides.filter(pl.col(\"pick_up\") == pl.col(\"drop_off\"))\n", + " .group_by(pl.col(\"pick_up\"))\n", + " .agg(pl.col(\"fare\").mean())\n", + " .filter(\n", + " pl.col(\"pick_up\").is_in([\"Brooklyn\", \"Bronx\", \"Queens\", \"Manhattan\"])\n", + " )\n", + ").show_graph(optimized=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "dd581a09-b243-4141-8fec-1f01eac567f8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Schema([('pick_up', String),\n", + " ('drop_off', String),\n", + " ('passengers', Int32),\n", + " ('distance', Int32),\n", + " ('fare', Int32)])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rides.collect_schema()" + ] + }, + { + "cell_type": 
"markdown", + "id": "b5baec65-1fd9-48ea-b315-54fd49d8330e", + "metadata": {}, + "source": [ + "# How LazyFrames Cope With Large Volumes of Data\n", + "## Using Streaming to Deal With Large Volumes\n", + "This code may crash your computer. Please ensure you save everything before running it. Also, you should not run this code within a notebook. Notebooks have their own memory configuration that is separate from your computer." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f8be883e-07e7-456c-9abf-4eb59d66220b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Code finished in 0.2538 seconds.'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import polars as pl\n", + "\n", + "rides = pl.scan_csv(\"2021_Yellow_Taxi_Trip_Data.csv\")\n", + "\n", + "print(\n", + " (\n", + " rides.group_by(pl.col(\"PULocationID\")).agg(\n", + " pl.col(\"total_amount\").sum()\n", + " )\n", + " ).collect(streaming=True)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3ca629d3-2abe-4ae9-b81e-be1f99e86a85", + "metadata": {}, + "source": [ + "## Deciding When Streaming Should Be Used" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "79f96a4c-87fa-4c02-88a5-3f2a3c201a8e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "STREAMING:\n", + " AGGREGATE\n", + " \t[col(\"drop_off\").mean()] BY [col(\"pick_up\")] FROM\n", + " Parquet SCAN [rides.parquet]\n", + " PROJECT 2/5 COLUMNS\n", + " SELECTION: [(col(\"pick_up\").is_in([Series])) & ([(col(\"pick_up\")) == (col(\"drop_off\"))])]\n" + ] + } + ], + "source": [ + "import polars as pl\n", + "\n", + "rides = pl.scan_parquet(\"rides.parquet\")\n", + "\n", + "print(\n", + " (\n", + " rides.filter(pl.col(\"pick_up\") == pl.col(\"drop_off\"))\n", + " .group_by(pl.col(\"pick_up\"))\n", + " .agg(pl.col(\"drop_off\").mean())\n", + " .filter(\n", + " 
pl.col(\"pick_up\").is_in(\n", + " [\"Brooklyn\", \"Bronx\", \"Queens\", \"Manhattan\"]\n", + " )\n", + " )\n", + " ).explain(streaming=True, optimized=True)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ec4a5be0-48a0-4657-b844-a2b2a53f2e27", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " WITH_COLUMNS:\n", + " [col(\"fare\").mean().over([col(\"pick_up\")]).alias(\"mean fare\")] \n", + " STREAMING:\n", + " Parquet SCAN [rides.parquet]\n", + " PROJECT */5 COLUMNS\n", + " SELECTION: [(col(\"pick_up\")) == (col(\"drop_off\"))]\n" + ] + } + ], + "source": [ + "import polars as pl\n", + "\n", + "rides = pl.scan_parquet(\"rides.parquet\")\n", + "\n", + "print(\n", + " (\n", + " rides.filter(pl.col(\"pick_up\") == pl.col(\"drop_off\")).with_columns(\n", + " pl.col(\"fare\").mean().over(pl.col(\"pick_up\")).alias(\"mean fare\")\n", + " )\n", + " ).explain(streaming=True, optimized=True)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f3a9ef30-e20a-42d7-9bb6-2e5b225f40b8", + "metadata": {}, + "source": [ + "# How to Decide if LazyFrames Are Indeed Suitable" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "ca70f902-3aba-4459-ad9b-4560f1434d8f", + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'LazyFrame' object has no attribute 'pivot'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[13], line 5\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolars\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpl\u001b[39;00m\n\u001b[0;32m 3\u001b[0m rides \u001b[38;5;241m=\u001b[39m 
pl\u001b[38;5;241m.\u001b[39mscan_parquet(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrides.parquet\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m----> 5\u001b[0m \u001b[43mrides\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpivot\u001b[49m(\n\u001b[0;32m 6\u001b[0m on\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpick_up\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 7\u001b[0m index\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdrop_off\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 8\u001b[0m values\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpassengers\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 9\u001b[0m aggregate_function\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msum\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 10\u001b[0m )\n", + "\u001b[1;31mAttributeError\u001b[0m: 'LazyFrame' object has no attribute 'pivot'" + ] + } + ], + "source": [ + "import polars as pl\n", + "\n", + "rides = pl.scan_parquet(\"rides.parquet\")\n", + "\n", + "rides.pivot(\n", + " on=\"pick_up\",\n", + " index=\"drop_off\",\n", + " values=\"passengers\",\n", + " aggregate_function=\"sum\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "40bbe881-9b18-4fca-a8aa-f65e6167f677", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (8, 9)
drop_offManhattanQueensUnknownBrooklynN/ABronxEWRStaten Island
stri32i32i32i32i32i32i32i32
"Manhattan"30416482563764686973611337491411
"Queens"111178115640629344214883013
"Brooklyn"68712661163331885420924138
"Bronx"120061039659862231785nullnull
"Unknown"96681016751250371310null
"N/A"63121763066146297482255
"Staten Island"346718290nullnullnull45
"EWR"127598062747131610343
" + ], + "text/plain": [ + "shape: (8, 9)\n", + "┌───────────────┬───────────┬────────┬─────────┬───┬──────┬───────┬──────┬───────────────┐\n", + "│ drop_off ┆ Manhattan ┆ Queens ┆ Unknown ┆ … ┆ N/A ┆ Bronx ┆ EWR ┆ Staten Island │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ i32 ┆ i32 ┆ i32 ┆ ┆ i32 ┆ i32 ┆ i32 ┆ i32 │\n", + "╞═══════════════╪═══════════╪════════╪═════════╪═══╪══════╪═══════╪══════╪═══════════════╡\n", + "│ Manhattan ┆ 3041648 ┆ 256376 ┆ 4686 ┆ … ┆ 113 ┆ 3749 ┆ 14 ┆ 11 │\n", + "│ Queens ┆ 111178 ┆ 115640 ┆ 629 ┆ … ┆ 148 ┆ 830 ┆ 1 ┆ 3 │\n", + "│ Brooklyn ┆ 68712 ┆ 66116 ┆ 333 ┆ … ┆ 20 ┆ 924 ┆ 1 ┆ 38 │\n", + "│ Bronx ┆ 12006 ┆ 10396 ┆ 59 ┆ … ┆ 23 ┆ 1785 ┆ null ┆ null │\n", + "│ Unknown ┆ 9668 ┆ 1016 ┆ 7512 ┆ … ┆ 37 ┆ 13 ┆ 10 ┆ null │\n", + "│ N/A ┆ 6312 ┆ 17630 ┆ 66 ┆ … ┆ 2974 ┆ 82 ┆ 25 ┆ 5 │\n", + "│ Staten Island ┆ 346 ┆ 718 ┆ 2 ┆ … ┆ null ┆ null ┆ null ┆ 45 │\n", + "│ EWR ┆ 12759 ┆ 806 ┆ 27 ┆ … ┆ 131 ┆ 6 ┆ 1034 ┆ 3 │\n", + "└───────────────┴───────────┴────────┴─────────┴───┴──────┴───────┴──────┴───────────────┘" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " rides.collect().pivot(\n", + " on=\"pick_up\",\n", + " index=\"drop_off\",\n", + " values=\"passengers\",\n", + " aggregate_function=\"sum\",\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d8797e95-1071-4b71-84a6-3c0fe1063cc5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (3, 4)
drop_offQueensBrooklynBronx
stri32i32i32
"Brooklyn"6611618854924
"Queens"1156403442830
"Bronx"103968621785
" + ], + "text/plain": [ + "shape: (3, 4)\n", + "┌──────────┬────────┬──────────┬───────┐\n", + "│ drop_off ┆ Queens ┆ Brooklyn ┆ Bronx │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ i32 ┆ i32 ┆ i32 │\n", + "╞══════════╪════════╪══════════╪═══════╡\n", + "│ Brooklyn ┆ 66116 ┆ 18854 ┆ 924 │\n", + "│ Queens ┆ 115640 ┆ 3442 ┆ 830 │\n", + "│ Bronx ┆ 10396 ┆ 862 ┆ 1785 │\n", + "└──────────┴────────┴──────────┴───────┘" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " rides.filter(pl.col(\"pick_up\").is_in([\"Brooklyn\", \"Bronx\", \"Queens\"]))\n", + " .filter(pl.col(\"drop_off\").is_in([\"Brooklyn\", \"Bronx\", \"Queens\"]))\n", + " .collect()\n", + " .pivot(\n", + " on=\"pick_up\",\n", + " index=\"drop_off\",\n", + " values=\"passengers\",\n", + " aggregate_function=\"sum\",\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0f17622f-c69d-4b3c-8113-0a6aec912089", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (1, 4)
drop_offQueensBrooklynBronx
stri32i32i32
"Queens"115640188541785
" + ], + "text/plain": [ + "shape: (1, 4)\n", + "┌──────────┬────────┬──────────┬───────┐\n", + "│ drop_off ┆ Queens ┆ Brooklyn ┆ Bronx │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ i32 ┆ i32 ┆ i32 │\n", + "╞══════════╪════════╪══════════╪═══════╡\n", + "│ Queens ┆ 115640 ┆ 18854 ┆ 1785 │\n", + "└──────────┴────────┴──────────┴───────┘" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " rides.filter(pl.col(\"pick_up\").is_in([\"Brooklyn\", \"Bronx\", \"Queens\"]))\n", + " .filter(pl.col(\"drop_off\").is_in([\"Brooklyn\", \"Bronx\", \"Queens\"]))\n", + " .collect()\n", + " .pivot(\n", + " on=\"pick_up\",\n", + " index=\"drop_off\",\n", + " values=\"passengers\",\n", + " aggregate_function=\"sum\",\n", + " )\n", + " .lazy()\n", + " .select(pl.max(\"*\"))\n", + ").collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "eb0bdf0d-7547-40dd-8ad5-9410ec34b9ed", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['corr', 'drop_in_place', 'equals', 'estimated_size', 'extend', 'flags', 'fold', 'get_column', 'get_column_index', 'get_columns', 'glimpse', 'hash_rows', 'height', 'hstack', 'insert_column', 'is_duplicated', 'is_empty', 'is_unique', 'item', 'iter_columns', 'iter_rows', 'iter_slices', 'map_rows', 'max_horizontal', 'mean_horizontal', 'min_horizontal', 'n_chunks', 'n_unique', 'partition_by', 'pivot', 'plot', 'product', 'rechunk', 'replace_column', 'row', 'rows', 'rows_by_key', 'sample', 'shape', 'shrink_to_fit', 'style', 'sum_horizontal', 'to_arrow', 'to_dict', 'to_dicts', 'to_dummies', 'to_init_repr', 'to_jax', 'to_numpy', 'to_pandas', 'to_series', 'to_struct', 'to_torch', 'transpose', 'unstack', 'upsample', 'vstack', 'write_avro', 'write_clipboard', 'write_csv', 'write_database', 'write_delta', 'write_excel', 'write_ipc', 'write_ipc_stream', 'write_json', 'write_ndjson', 'write_parquet']\n" + ] + } + ], + "source": [ + 
"df_ops = set(x for x in dir(pl.DataFrame()) if not x.startswith(\"_\"))\n", + "lazy_ops = set(x for x in dir(pl.LazyFrame()) if not x.startswith(\"_\"))\n", + "print(sorted(df_ops - lazy_ops))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98f141b2-3ed1-4dd0-91db-0573896153dc", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}