Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions polars-vs-pandas/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Polars vs pandas: What's the Difference?

The materials contained in this folder are designed to complement the Real Python tutorial [Polars vs pandas: What's the Difference?](https://realpython.com/polars-vs-pandas/).

Your download bundle contains the following files:

| File | Description |
|-----------------------------------------|------------------------------------------------------------------------------------------------------------|
| `benchmark.py` | This script performs time tests for DataFrames and a LazyFrame. |
| `conversions.py` | This file contains the code used to convert between pandas and Polars DataFrames, plus a Narwhals example. |
| `data_generation.py` | This script contains the `generate_data()` function used to generate different quantities of data. |
| `online_retail.parquet` | This Parquet file contains retail data used in some of the queries. |
| `pandas_polars_demo.py` | This file contains the code used to illustrate the differences between pandas and Polars syntax. |
| `plots.ipynb` | This Jupyter Notebook file contains the plotting code to demonstrate default plotting capabilities. |
| `streaming_test.py` | This script performs time tests for a LazyFrame with streaming enabled. |
103 changes: 103 additions & 0 deletions polars-vs-pandas/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""
Running:
$ python benchmark.py 500
"""

import functools
import sys
from timeit import Timer

import pandas as pd
import polars as pl
from data_generation import generate_data


def create_pandas_dataframe(test_data):
return pd.DataFrame(test_data).convert_dtypes(dtype_backend="pyarrow")


def create_polars_dataframe(test_data):
    """Build an eager Polars DataFrame from column-oriented data.

    Args:
        test_data: Data accepted by ``pl.DataFrame``; in this script it is
            the dictionary returned by ``generate_data()``.

    Returns:
        The resulting ``pl.DataFrame``.
    """
    return pl.DataFrame(test_data)


def create_polars_lazyframe(test_data):
    """Build a Polars LazyFrame from column-oriented data.

    Args:
        test_data: Data accepted by ``pl.LazyFrame``; in this script it is
            the dictionary returned by ``generate_data()``.

    Returns:
        The resulting ``pl.LazyFrame``. Queries built on it are executed
        only when ``.collect()`` is called (see
        ``analyze_polars_lazyframe``).
    """
    return pl.LazyFrame(test_data)


def analyze_pandas_dataframe(pandas_df):
    """Total the sales income per region, product, and salesperson.

    Args:
        pandas_df: A pandas DataFrame with ``region``, ``product``,
            ``sales_person``, and ``sales_income`` columns.

    Returns:
        A pandas Series named ``sales_income`` holding the summed income,
        indexed by a (region, product, sales_person) MultiIndex.
    """
    group_columns = ["region", "product", "sales_person"]
    return pandas_df.groupby(group_columns)["sales_income"].sum()


def analyze_polars_dataframe(polars_df):
    """Total the sales income per region, product, and salesperson.

    Args:
        polars_df: A Polars DataFrame with ``region``, ``product``,
            ``sales_person``, and ``sales_income`` columns.

    Returns:
        A Polars DataFrame with the three grouping columns plus a
        ``total_sales`` column holding the summed ``sales_income``.
    """
    group_columns = ["region", "product", "sales_person"]
    return polars_df.group_by(group_columns).agg(
        total_sales=pl.col("sales_income").sum()
    )


def analyze_polars_lazyframe(polars_lf):
    """Total the sales income per region, product, and salesperson.

    Args:
        polars_lf: A Polars LazyFrame with ``region``, ``product``,
            ``sales_person``, and ``sales_income`` columns.

    Returns:
        The collected result of the lazy query: the three grouping columns
        plus a ``total_sales`` column holding the summed ``sales_income``.
    """
    group_columns = ["region", "product", "sales_person"]
    lazy_totals = polars_lf.group_by(group_columns).agg(
        total_sales=pl.col("sales_income").sum()
    )
    # Execution is part of the measured work: collect() runs the query.
    return lazy_totals.collect()


# The row count is the script's only command-line argument. Parse and
# validate it once so that a missing or non-numeric value produces a short
# usage message instead of a traceback, and so every label below reuses
# the same value.
if len(sys.argv) < 2:
    sys.exit("Usage: python benchmark.py <number_of_rows>")
try:
    number_of_rows = int(sys.argv[1])
except ValueError:
    sys.exit(f"number_of_rows must be an integer, got {sys.argv[1]!r}")

# How many times timeit executes each callable. Each printed figure is the
# total elapsed time for all of these runs.
TIMING_RUNS = 100

print("Creating DataFrames...")

test_data = generate_data(number_of_rows)

print(f"Pandas dataframe creation time for {number_of_rows:,} rows:")
print(
    Timer(functools.partial(create_pandas_dataframe, test_data)).timeit(
        TIMING_RUNS
    )
)
print(f"\nPolars dataframe creation time for {number_of_rows:,} rows:")
print(
    Timer(functools.partial(create_polars_dataframe, test_data)).timeit(
        TIMING_RUNS
    )
)
print(f"\nPolars lazyframe creation time for {number_of_rows:,} rows:")
print(
    Timer(functools.partial(create_polars_lazyframe, test_data)).timeit(
        TIMING_RUNS
    )
)

print("-" * 50)
print("Analyzing DataFrames...")

# Build each frame once, outside the timers, so that only the analysis
# itself is measured below.
pandas_df = create_pandas_dataframe(test_data)
polars_df = create_polars_dataframe(test_data)
polars_lf = create_polars_lazyframe(test_data)

print(f"Pandas dataframe analysis time for {number_of_rows:,} rows:")
print(
    Timer(functools.partial(analyze_pandas_dataframe, pandas_df)).timeit(
        TIMING_RUNS
    )
)

print()
print(f"Polars dataframe analysis time for {number_of_rows:,} rows:")
print(
    Timer(functools.partial(analyze_polars_dataframe, polars_df)).timeit(
        TIMING_RUNS
    )
)

print()
print(f"Polars lazyframe analysis time for {number_of_rows:,} rows:")
print(
    Timer(functools.partial(analyze_polars_lazyframe, polars_lf)).timeit(
        TIMING_RUNS
    )
)

# Show one slice of each result so the three analyses can be compared.
print("\nShow Boots sales in the East region for pandas DataFrame")
print(analyze_pandas_dataframe(pandas_df)["East"]["Boots"])

print("\nShow Boots sales in the East region for Polars DataFrame")
print(
    analyze_polars_dataframe(polars_df).filter(
        pl.col("region") == "East",
        pl.col("product") == "Boots",
    )
)

print("\nShow Boots sales in the East region for Polars LazyFrame")
print(
    analyze_polars_lazyframe(polars_lf).filter(
        pl.col("region") == "East",
        pl.col("product") == "Boots",
    )
)
33 changes: 33 additions & 0 deletions polars-vs-pandas/conversions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import narwhals as nw
import polars as pl
from data_generation import generate_data


def universal_groupby(df):
    """Sum ``sales_income`` per region through the Narwhals API.

    Args:
        df: A dataframe that ``nw.from_native`` accepts; this script
            passes both a pandas and a Polars DataFrame. It must have
            ``region`` and ``sales_income`` columns.

    Returns:
        The per-region totals, sorted by ``region`` and converted back
        with ``to_native()``.
    """
    return (
        nw.from_native(df)
        .group_by("region")
        .agg(nw.col("sales_income").sum())
        .sort("region")
        .to_native()
    )


# Start from a small Polars DataFrame of generated sales data.
polars_df = pl.DataFrame(generate_data(4))
print(polars_df)

# Round trip: Polars -> pandas -> Polars, showing the type at each step.
print("\nPolars to pandas:")
pandas_df = polars_df.to_pandas()
print(type(pandas_df))
print(pandas_df)

print("\npandas to Polars:")
polars_df = pl.from_pandas(pandas_df)
print(type(polars_df))
print(polars_df)

# The same Narwhals-based function is applied to both dataframe libraries.
for library_name, native_frame in (("pandas", pandas_df), ("Polars", polars_df)):
    print(f"\nNarwhals with {library_name}:")
    print(universal_groupby(native_frame))
19 changes: 19 additions & 0 deletions polars-vs-pandas/data_generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import numpy as np


def generate_data(number_of_rows, seed=None):
    """Generate random column-oriented sales data.

    Args:
        number_of_rows: Number of rows (values per column) to generate.
        seed: Optional seed passed to ``np.random.default_rng``. Leave as
            ``None`` (the default) for fresh random data on every call, or
            pass a value to get reproducible data.

    Returns:
        A dictionary mapping column names to equally sized columns:
        ``order_id`` (``range`` from 1 to ``number_of_rows``), ``region``,
        ``sales_person``, ``product`` (arrays of randomly chosen labels),
        and ``sales_income`` (random integers from 1 to 5000 inclusive).
    """
    rng = np.random.default_rng(seed)

    return {
        "order_id": range(1, number_of_rows + 1),
        "region": rng.choice(
            ["North", "South", "East", "West"], size=number_of_rows
        ),
        "sales_person": rng.choice(
            ["Armstrong", "Aldrin", "Collins"], size=number_of_rows
        ),
        "product": rng.choice(
            ["Helmet", "Oxygen", "Boots", "Gloves"], size=number_of_rows
        ),
        # The upper bound of integers() is exclusive, so 5001 allows 5000.
        "sales_income": rng.integers(1, 5001, size=number_of_rows),
    }
Binary file added polars-vs-pandas/online_retail.parquet
Binary file not shown.
38 changes: 38 additions & 0 deletions polars-vs-pandas/pandas_polars_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import pandas as pd
import polars as pl

# --- pandas: index-based style ---------------------------------------
print("Index-Based syntax in pandas:")
orders_pandas = pd.read_parquet("online_retail.parquet")
orders_pandas["Total"] = orders_pandas["Quantity"] * orders_pandas["UnitPrice"]
report_columns = ["InvoiceNo", "Quantity", "UnitPrice", "Total"]
over_100 = orders_pandas["Total"] > 100
print(orders_pandas[report_columns][over_100].head(3))

print()

# --- pandas: method-chaining style -----------------------------------
print("Method-chaining syntax in pandas:")
# Reload the file so this example starts without the "Total" column that
# was added in place above.
orders_pandas = pd.read_parquet("online_retail.parquet")
line_totals = orders_pandas["Quantity"] * orders_pandas["UnitPrice"]
chained_pandas = (
    orders_pandas.assign(Total=line_totals)
    .filter(["InvoiceNo", "Quantity", "UnitPrice", "Total"])
    .query("Total > 100")
    .head(3)
)
print(chained_pandas)

print()

# --- Polars: method-chaining style -----------------------------------
print("Method-chaining syntax in Polars:")
orders_polars = pl.read_parquet("online_retail.parquet")
with_totals = orders_polars.select(
    pl.col(["InvoiceNo", "Quantity", "UnitPrice"]),
    Total=pl.col("Quantity") * pl.col("UnitPrice"),
)
print(with_totals.filter(pl.col("Total") > 100).head(3))
102 changes: 102 additions & 0 deletions polars-vs-pandas/plots.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "8ced8243-d770-437e-a90d-f794ffa57fc0",
"metadata": {},
"source": [
"# DataFrame Plots"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1cf56fd3-605c-4449-8a5e-d0fd94b49080",
"metadata": {},
"outputs": [],
"source": [
"from data_generation import generate_data\n",
"\n",
"sales_data = generate_data(50)"
]
},
{
"cell_type": "markdown",
"id": "913c18ed-373b-400e-ba27-38ca8e7b70b9",
"metadata": {},
"source": [
"## Polars Plotting"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "014e9e56-8fff-45ab-85ff-eb51840f2bc7",
"metadata": {},
"outputs": [],
"source": [
"import polars as pl\n",
"\n",
"orders_polars = pl.DataFrame(sales_data)\n",
"\n",
"(\n",
" orders_polars.group_by(\"region\")\n",
" .agg(total_sales=pl.col(\"sales_income\").sum())\n",
" .plot.bar(x=\"region\", y=\"total_sales\")\n",
" .properties(width=200, height=200, title=\"Total Sales per Region ($)\")\n",
")"
]
},
{
"cell_type": "markdown",
"id": "a62335eb-4763-4c46-adb2-d7386914f56b",
"metadata": {},
"source": [
"## Pandas Plotting"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "85929590-e514-4497-b396-58cfe26e59d3",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"orders_pandas = pd.DataFrame(sales_data)\n",
"\n",
"(\n",
" orders_pandas.groupby(\n",
" [\n",
" \"region\",\n",
" ]\n",
" )[\"sales_income\"]\n",
" .sum()\n",
" .plot(kind=\"bar\", title=\"Total Sales per Region ($)\", ylabel=\"total_sales\")\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading