Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions polars-lazyframe/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# How to Work With Polars LazyFrames

This folder contains completed notebooks and other files used in the Real Python tutorial [How to Work With Polars LazyFrames](https://realpython.com/how-to-work-with-polars-lazyframe/).

**The following files are included:**

- `tutorial_code.ipynb` is a Jupyter Notebook containing all the code used in the tutorial.
- `rides.parquet` contains taxi fare data used throughout the tutorial. [See below]
- `taxi_rides_nnGB.py` is a series of Python scripts (`taxi_rides_16GB.py`, `taxi_rides_24GB.py`, and `taxi_rides_32GB.py`) that each generate a local copy of `2021_Yellow_Taxi_Trip_Data.csv` at a different size. The generated file is used to illustrate data streaming.
- `programming_languages.csv` is a small file used to illustrate the conversion between a DataFrame and a LazyFrame.
- `dataframe_timer.py` and `lazyframe_timer.py` are two scripts used to compare how long the same analysis takes with a DataFrame and with a LazyFrame.
22 changes: 22 additions & 0 deletions polars-lazyframe/dataframe_timer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import time

import polars as pl

# Time the DataFrame version of the analysis, averaged over several runs.
RUNS = 10  # number of repetitions the total time is averaged over

start = time.perf_counter()

for _ in range(RUNS):
    # The file is re-read inside the loop so each run includes the load cost.
    rides = pl.read_parquet("rides.parquet")
    result = (
        rides.filter(pl.col("pick_up") == pl.col("drop_off"))
        .group_by(pl.col("pick_up"))
        .agg(pl.col("fare").mean())
        .filter(
            pl.col("pick_up").is_in(
                ["Brooklyn", "Bronx", "Queens", "Manhattan"]
            )
        )
    )

end = time.perf_counter()

# A bare f-string is evaluated and discarded when run as a script,
# so the timing has to be printed explicitly.
print(f"Code finished in {(end - start)/RUNS:0.4f} seconds.")
22 changes: 22 additions & 0 deletions polars-lazyframe/lazyframe_timer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import time

import polars as pl

# Time the LazyFrame version of the analysis, averaged over several runs.
RUNS = 10  # number of repetitions the total time is averaged over

start = time.perf_counter()

for _ in range(RUNS):
    # The scan is set up inside the loop so each run starts from the file.
    rides = pl.scan_parquet("rides.parquet")
    result = (
        rides.filter(pl.col("pick_up") == pl.col("drop_off"))
        .group_by(pl.col("pick_up"))
        .agg(pl.col("fare").mean())
        .filter(
            pl.col("pick_up").is_in(
                ["Brooklyn", "Bronx", "Queens", "Manhattan"]
            )
        )
    ).collect()  # collect() executes the query built above

end = time.perf_counter()

# A bare f-string is evaluated and discarded when run as a script,
# so the timing has to be printed explicitly.
print(f"Code finished in {(end - start)/RUNS:0.4f} seconds.")
6 changes: 6 additions & 0 deletions polars-lazyframe/programming_languages.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
language_id,language,creator,year
0,Pascal,Niklaus Wirth,1970
1,C,Dennis Ritchie,1973
2,C++,Bjarne Stroustrup,1985
3,Python,Guido van Rossum,1991
4,Java,James Gosling,1995
Binary file added polars-lazyframe/rides.parquet
Binary file not shown.
87 changes: 87 additions & 0 deletions polars-lazyframe/taxi_rides_16GB.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import csv
import datetime
import random

# Fixed seed so that repeated runs draw the same sequence of random values.
random.seed(10)


def generate_pu():
    """Return a random pickup datetime within 2021, truncated to seconds."""
    year_start = datetime.datetime(2021, 1, 1, 0, 0, 0)
    year_end = datetime.datetime(2021, 12, 31, 23, 59, 59)
    # Scale the whole span by a uniform fraction in [0, 1) to pick a moment.
    offset = random.random() * (year_end - year_start)
    moment = offset + year_start
    return moment.replace(microsecond=0)


def generate_do(pick_up):
    """Return a drop-off datetime a random trip length after *pick_up*.

    The trip length is 10 to 120 whole minutes plus 0 to 59 seconds.
    """
    # Minutes are drawn before seconds; with a fixed seed the order matters.
    extra_minutes = random.randint(10, 120)
    extra_seconds = random.randint(0, 59)
    trip_length = datetime.timedelta(
        minutes=extra_minutes, seconds=extra_seconds
    )
    return pick_up + trip_length


def generate_choice(*options):
    """Return one of the positional arguments, picked at random."""
    picked = random.choice(options)
    return picked


def generate_file(rows=170_000_000, path="2021_Yellow_Taxi_Trip_Data.csv"):
    """Write a CSV of randomly generated yellow-taxi trip records.

    Args:
        rows: Number of data rows to write after the header. The default
            is the row count this script was originally hard-coded to.
        path: Destination file; an existing file is overwritten.

    Every row carries one value per header column. ``airport_fee`` is a
    constant ``0.0``: the header always declared the column but the rows
    used to omit it, which left the file ragged. Using a constant keeps
    the sequence of random draws, and so every other column, unchanged.
    """
    header = (
        "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,"
        "trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,"
        "payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,"
        "improvement_surcharge,total_amount,congestion_surcharge,airport_fee"
    ).split(",")

    # Fixed charges do not vary per ride, so define them once.
    improvement_surcharge = 0.3
    congestion_surcharge = 2.5
    airport_fee = 0.0

    with open(path, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)

        for _ in range(rows):
            # The order of the random draws below is significant: with a
            # fixed seed it determines the exact content of the file.
            fare_amount = round(random.uniform(5.0, 100.0), 2)
            extra = generate_choice(0, 0.5, 3)
            mta_tax = generate_choice(0, 0.5)
            tip_amount = round(random.uniform(0.0, 20.0), 2)
            tolls_amount = generate_choice(0, 6.12)
            total_amount = round(
                fare_amount
                + extra
                + mta_tax
                + tip_amount
                + tolls_amount
                + improvement_surcharge
                + congestion_surcharge,
                2,
            )

            pick_up = generate_pu()

            ride = [
                random.randint(0, 6),  # VendorID
                pick_up,  # tpep_pickup_datetime
                generate_do(pick_up),  # tpep_dropoff_datetime
                random.randint(0, 5),  # passenger_count
                random.randint(0, 15),  # trip_distance
                random.randint(0, 6),  # RatecodeID
                generate_choice("Y", "N"),  # store_and_fwd_flag
                random.randint(1, 265),  # PULocationID
                random.randint(1, 265),  # DOLocationID
                random.randint(0, 5),  # payment_type
                fare_amount,  # fare_amount
                extra,  # extra
                mta_tax,  # mta_tax
                tip_amount,  # tip_amount
                tolls_amount,  # tolls_amount
                improvement_surcharge,  # improvement_surcharge
                total_amount,  # total_amount
                congestion_surcharge,  # congestion_surcharge
                airport_fee,  # airport_fee
            ]

            writer.writerow(ride)


if __name__ == "__main__":
    # Only write the (very large) CSV when run as a script, not on import.
    generate_file()
87 changes: 87 additions & 0 deletions polars-lazyframe/taxi_rides_24GB.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import csv
import datetime
import random

# Fixed seed so that repeated runs draw the same sequence of random values.
random.seed(10)


def generate_pu():
    """Return a random pickup datetime within 2021, truncated to seconds."""
    year_start = datetime.datetime(2021, 1, 1, 0, 0, 0)
    year_end = datetime.datetime(2021, 12, 31, 23, 59, 59)
    # Scale the whole span by a uniform fraction in [0, 1) to pick a moment.
    offset = random.random() * (year_end - year_start)
    moment = offset + year_start
    return moment.replace(microsecond=0)


def generate_do(pick_up):
    """Return a drop-off datetime a random trip length after *pick_up*.

    The trip length is 10 to 120 whole minutes plus 0 to 59 seconds.
    """
    # Minutes are drawn before seconds; with a fixed seed the order matters.
    extra_minutes = random.randint(10, 120)
    extra_seconds = random.randint(0, 59)
    trip_length = datetime.timedelta(
        minutes=extra_minutes, seconds=extra_seconds
    )
    return pick_up + trip_length


def generate_choice(*options):
    """Return one of the positional arguments, picked at random."""
    picked = random.choice(options)
    return picked


def generate_file(rows=255_000_000, path="2021_Yellow_Taxi_Trip_Data.csv"):
    """Write a CSV of randomly generated yellow-taxi trip records.

    Args:
        rows: Number of data rows to write after the header. The default
            is the row count this script was originally hard-coded to.
        path: Destination file; an existing file is overwritten.

    Every row carries one value per header column. ``airport_fee`` is a
    constant ``0.0``: the header always declared the column but the rows
    used to omit it, which left the file ragged. Using a constant keeps
    the sequence of random draws, and so every other column, unchanged.
    """
    header = (
        "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,"
        "trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,"
        "payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,"
        "improvement_surcharge,total_amount,congestion_surcharge,airport_fee"
    ).split(",")

    # Fixed charges do not vary per ride, so define them once.
    improvement_surcharge = 0.3
    congestion_surcharge = 2.5
    airport_fee = 0.0

    with open(path, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)

        for _ in range(rows):
            # The order of the random draws below is significant: with a
            # fixed seed it determines the exact content of the file.
            fare_amount = round(random.uniform(5.0, 100.0), 2)
            extra = generate_choice(0, 0.5, 3)
            mta_tax = generate_choice(0, 0.5)
            tip_amount = round(random.uniform(0.0, 20.0), 2)
            tolls_amount = generate_choice(0, 6.12)
            total_amount = round(
                fare_amount
                + extra
                + mta_tax
                + tip_amount
                + tolls_amount
                + improvement_surcharge
                + congestion_surcharge,
                2,
            )

            pick_up = generate_pu()

            ride = [
                random.randint(0, 6),  # VendorID
                pick_up,  # tpep_pickup_datetime
                generate_do(pick_up),  # tpep_dropoff_datetime
                random.randint(0, 5),  # passenger_count
                random.randint(0, 15),  # trip_distance
                random.randint(0, 6),  # RatecodeID
                generate_choice("Y", "N"),  # store_and_fwd_flag
                random.randint(1, 265),  # PULocationID
                random.randint(1, 265),  # DOLocationID
                random.randint(0, 5),  # payment_type
                fare_amount,  # fare_amount
                extra,  # extra
                mta_tax,  # mta_tax
                tip_amount,  # tip_amount
                tolls_amount,  # tolls_amount
                improvement_surcharge,  # improvement_surcharge
                total_amount,  # total_amount
                congestion_surcharge,  # congestion_surcharge
                airport_fee,  # airport_fee
            ]

            writer.writerow(ride)


if __name__ == "__main__":
    # Only write the (very large) CSV when run as a script, not on import.
    generate_file()
87 changes: 87 additions & 0 deletions polars-lazyframe/taxi_rides_32GB.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import csv
import datetime
import random

# Fixed seed so that repeated runs draw the same sequence of random values.
random.seed(10)


def generate_pu():
    """Return a random pickup datetime within 2021, truncated to seconds."""
    year_start = datetime.datetime(2021, 1, 1, 0, 0, 0)
    year_end = datetime.datetime(2021, 12, 31, 23, 59, 59)
    # Scale the whole span by a uniform fraction in [0, 1) to pick a moment.
    offset = random.random() * (year_end - year_start)
    moment = offset + year_start
    return moment.replace(microsecond=0)


def generate_do(pick_up):
    """Return a drop-off datetime a random trip length after *pick_up*.

    The trip length is 10 to 120 whole minutes plus 0 to 59 seconds.
    """
    # Minutes are drawn before seconds; with a fixed seed the order matters.
    extra_minutes = random.randint(10, 120)
    extra_seconds = random.randint(0, 59)
    trip_length = datetime.timedelta(
        minutes=extra_minutes, seconds=extra_seconds
    )
    return pick_up + trip_length


def generate_choice(*options):
    """Return one of the positional arguments, picked at random."""
    picked = random.choice(options)
    return picked


def generate_file(rows=340_000_000, path="2021_Yellow_Taxi_Trip_Data.csv"):
    """Write a CSV of randomly generated yellow-taxi trip records.

    Args:
        rows: Number of data rows to write after the header. The default
            is the row count this script was originally hard-coded to.
        path: Destination file; an existing file is overwritten.

    Every row carries one value per header column. ``airport_fee`` is a
    constant ``0.0``: the header always declared the column but the rows
    used to omit it, which left the file ragged. Using a constant keeps
    the sequence of random draws, and so every other column, unchanged.
    """
    header = (
        "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,"
        "trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,"
        "payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,"
        "improvement_surcharge,total_amount,congestion_surcharge,airport_fee"
    ).split(",")

    # Fixed charges do not vary per ride, so define them once.
    improvement_surcharge = 0.3
    congestion_surcharge = 2.5
    airport_fee = 0.0

    with open(path, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)

        for _ in range(rows):
            # The order of the random draws below is significant: with a
            # fixed seed it determines the exact content of the file.
            fare_amount = round(random.uniform(5.0, 100.0), 2)
            extra = generate_choice(0, 0.5, 3)
            mta_tax = generate_choice(0, 0.5)
            tip_amount = round(random.uniform(0.0, 20.0), 2)
            tolls_amount = generate_choice(0, 6.12)
            total_amount = round(
                fare_amount
                + extra
                + mta_tax
                + tip_amount
                + tolls_amount
                + improvement_surcharge
                + congestion_surcharge,
                2,
            )

            pick_up = generate_pu()

            ride = [
                random.randint(0, 6),  # VendorID
                pick_up,  # tpep_pickup_datetime
                generate_do(pick_up),  # tpep_dropoff_datetime
                random.randint(0, 5),  # passenger_count
                random.randint(0, 15),  # trip_distance
                random.randint(0, 6),  # RatecodeID
                generate_choice("Y", "N"),  # store_and_fwd_flag
                random.randint(1, 265),  # PULocationID
                random.randint(1, 265),  # DOLocationID
                random.randint(0, 5),  # payment_type
                fare_amount,  # fare_amount
                extra,  # extra
                mta_tax,  # mta_tax
                tip_amount,  # tip_amount
                tolls_amount,  # tolls_amount
                improvement_surcharge,  # improvement_surcharge
                total_amount,  # total_amount
                congestion_surcharge,  # congestion_surcharge
                airport_fee,  # airport_fee
            ]

            writer.writerow(ride)


if __name__ == "__main__":
    # Only write the (very large) CSV when run as a script, not on import.
    generate_file()
Loading
Loading