diff --git a/polars-vs-pandas/README.md b/polars-vs-pandas/README.md new file mode 100644 index 0000000000..bea017ae09 --- /dev/null +++ b/polars-vs-pandas/README.md @@ -0,0 +1,15 @@ +# Polars vs pandas: What's the Difference? + +The materials contained in this folder are designed to complement the Real Python tutorial [Polars vs pandas: What's the Difference?](https://realpython.com/polars-vs-pandas/). + +Your download bundle contains the following files: + +| File | Description | +|-----------------------------------------|------------------------------------------------------------------------------------------------------------| +| `benchmark.py` | This script performs time tests for DataFrames and a LazyFrame. | +| `conversions.py` | This file contains the code used to convert between pandas and Polars DataFrames, plus a Narwhals example. | +| `data_generation.py` | This script contains the `generate_data()` function used to generate different quantities of data. | +| `online_retail.parquet` | This Parquet file contains retail data used in some of the queries. | +| `pandas_polars_demo.py` | This file contains the code used to illustrate the differences between pandas and Polars syntax. | +| `plots.ipynb` | This Jupyter Notebook file contains the plotting code to demonstrate default plotting capabilities. | +| `streaming_test.py` | This script performs time tests for a LazyFrame with streaming enabled. 
"""
Time DataFrame creation and a grouped aggregation in pandas and Polars.

Running:
$ python benchmark.py 500
"""

import functools
import sys
from timeit import Timer

import pandas as pd
import polars as pl
from data_generation import generate_data

# How many times timeit executes each timed callable.
NUMBER_OF_RUNS = 100


def create_pandas_dataframe(test_data):
    """Build a pandas DataFrame from test_data using the pyarrow backend."""
    return pd.DataFrame(test_data).convert_dtypes(dtype_backend="pyarrow")


def create_polars_dataframe(test_data):
    """Build a Polars DataFrame from test_data."""
    return pl.DataFrame(test_data)


def create_polars_lazyframe(test_data):
    """Build a Polars LazyFrame from test_data."""
    return pl.LazyFrame(test_data)


def analyze_pandas_dataframe(pandas_df):
    """Sum sales_income per region, product and sales_person in pandas."""
    return pandas_df.groupby(["region", "product", "sales_person"])[
        "sales_income"
    ].sum()


def analyze_polars_dataframe(polars_df):
    """Sum sales_income per region, product and sales_person in Polars.

    The summed column is named total_sales.
    """
    return polars_df.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    )


def analyze_polars_lazyframe(polars_lf):
    """Run the same grouped sum on a LazyFrame and collect the result."""
    return (
        polars_lf.group_by(["region", "product", "sales_person"])
        .agg(total_sales=pl.col("sales_income").sum())
        .collect()
    )


def _parse_row_count(argv):
    """Return the row count passed as the first command-line argument.

    Exits with a usage message instead of a traceback when the argument
    is missing or is not an integer.
    """
    try:
        return int(argv[1])
    except (IndexError, ValueError):
        raise SystemExit(
            "Usage: python benchmark.py <number_of_rows>"
        ) from None


def _time_call(function, argument):
    """Return the time timeit reports for NUMBER_OF_RUNS calls."""
    timer = Timer(functools.partial(function, argument))
    return timer.timeit(NUMBER_OF_RUNS)


def main():
    """Time frame creation and analysis, then show a sample of each result."""
    # Parse once, before any output, so a bad argument fails cleanly.
    number_of_rows = _parse_row_count(sys.argv)

    print("Creating DataFrames...")

    test_data = generate_data(number_of_rows)

    print(f"Pandas dataframe creation time for {number_of_rows:,} rows:")
    print(_time_call(create_pandas_dataframe, test_data))
    print(f"\nPolars dataframe creation time for {number_of_rows:,} rows:")
    print(_time_call(create_polars_dataframe, test_data))
    print(f"\nPolars lazyframe creation time for {number_of_rows:,} rows:")
    print(_time_call(create_polars_lazyframe, test_data))

    print("-" * 50)
    print("Analyzing DataFrames...")

    # Build each frame once so the analysis timings exclude creation cost.
    pandas_df = create_pandas_dataframe(test_data)
    polars_df = create_polars_dataframe(test_data)
    polars_lf = create_polars_lazyframe(test_data)

    print(f"Pandas dataframe analysis time for {number_of_rows:,} rows:")
    print(_time_call(analyze_pandas_dataframe, pandas_df))

    print()
    print(f"Polars dataframe analysis time for {number_of_rows:,} rows:")
    print(_time_call(analyze_polars_dataframe, polars_df))

    print()
    print(f"Polars lazyframe analysis time for {number_of_rows:,} rows:")
    print(_time_call(analyze_polars_lazyframe, polars_lf))

    print("\nShow Boots sales in the East region for pandas DataFrame")
    print(analyze_pandas_dataframe(pandas_df)["East"]["Boots"])

    print("\nShow Boots sales in the East region for Polars DataFrame")
    print(
        analyze_polars_dataframe(polars_df).filter(
            pl.col("region") == "East",
            pl.col("product") == "Boots",
        )
    )

    print("\nShow Boots sales in the East region for Polars LazyFrame")
    print(
        analyze_polars_lazyframe(polars_lf).filter(
            pl.col("region") == "East",
            pl.col("product") == "Boots",
        )
    )


if __name__ == "__main__":
    main()
import numpy as np


def generate_data(number_of_rows, *, seed=None):
    """Generate random sales records as a dictionary of columns.

    Args:
        number_of_rows: Number of records to generate.
        seed: Optional seed passed to ``np.random.default_rng``. Supplying
            the same seed yields the same data, which keeps separate
            benchmark runs comparable. The default ``None`` leaves the
            generator unseeded.

    Returns:
        A dict with the keys ``order_id`` (a ``range`` starting at 1),
        ``region``, ``sales_person``, ``product`` (arrays drawn from fixed
        name lists) and ``sales_income`` (integers from 1 to 5000
        inclusive). Every column has ``number_of_rows`` entries.
    """
    rng = np.random.default_rng(seed)

    return {
        "order_id": range(1, number_of_rows + 1),
        "region": rng.choice(
            ["North", "South", "East", "West"], size=number_of_rows
        ),
        "sales_person": rng.choice(
            ["Armstrong", "Aldrin", "Collins"], size=number_of_rows
        ),
        "product": rng.choice(
            ["Helmet", "Oxygen", "Boots", "Gloves"], size=number_of_rows
        ),
        # The upper bound of rng.integers is exclusive, so 5001 allows 5000.
        "sales_income": rng.integers(1, 5001, size=number_of_rows),
    }
b/polars-vs-pandas/plots.ipynb new file mode 100644 index 0000000000..0ef32c58dc --- /dev/null +++ b/polars-vs-pandas/plots.ipynb @@ -0,0 +1,102 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8ced8243-d770-437e-a90d-f794ffa57fc0", + "metadata": {}, + "source": [ + "# DataFrame Plots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1cf56fd3-605c-4449-8a5e-d0fd94b49080", + "metadata": {}, + "outputs": [], + "source": [ + "from data_generation import generate_data\n", + "\n", + "sales_data = generate_data(50)" + ] + }, + { + "cell_type": "markdown", + "id": "913c18ed-373b-400e-ba27-38ca8e7b70b9", + "metadata": {}, + "source": [ + "## Polars Plotting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "014e9e56-8fff-45ab-85ff-eb51840f2bc7", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "\n", + "orders_polars = pl.DataFrame(sales_data)\n", + "\n", + "(\n", + " orders_polars.group_by(\"region\")\n", + " .agg(total_sales=pl.col(\"sales_income\").sum())\n", + " .plot.bar(x=\"region\", y=\"total_sales\")\n", + " .properties(width=200, height=200, title=\"Total Sales per Region ($)\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a62335eb-4763-4c46-adb2-d7386914f56b", + "metadata": {}, + "source": [ + "## pandas Plotting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85929590-e514-4497-b396-58cfe26e59d3", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "orders_pandas = pd.DataFrame(sales_data)\n", + "\n", + "(\n", + " orders_pandas.groupby(\n", + " [\n", + " \"region\",\n", + " ]\n", + " )[\"sales_income\"]\n", + " .sum()\n", + " .plot(kind=\"bar\", title=\"Total Sales per Region ($)\", ylabel=\"total_sales\")\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": 
"ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/polars-vs-pandas/requirements.txt b/polars-vs-pandas/requirements.txt new file mode 100644 index 0000000000..d2b0874d8f --- /dev/null +++ b/polars-vs-pandas/requirements.txt @@ -0,0 +1,107 @@ +altair==5.5.0 +anyio==4.10.0 +argon2-cffi==25.1.0 +argon2-cffi-bindings==25.1.0 +arrow==1.3.0 +asttokens==3.0.0 +async-lru==2.0.5 +attrs==25.3.0 +babel==2.17.0 +beautifulsoup4==4.13.5 +bleach==6.2.0 +certifi==2025.8.3 +cffi==2.0.0 +charset-normalizer==3.4.3 +comm==0.2.3 +contourpy==1.3.3 +cycler==0.12.1 +debugpy==1.8.17 +decorator==5.2.1 +defusedxml==0.7.1 +executing==2.2.1 +fastjsonschema==2.21.2 +fonttools==4.60.0 +fqdn==1.5.1 +h11==0.16.0 +httpcore==1.0.9 +httpx==0.28.1 +idna==3.10 +ipykernel==6.30.1 +ipython==9.5.0 +ipython_pygments_lexers==1.1.1 +isoduration==20.11.0 +jedi==0.19.2 +Jinja2==3.1.6 +json5==0.12.1 +jsonpointer==3.0.0 +jsonschema==4.25.1 +jsonschema-specifications==2025.9.1 +jupyter-events==0.12.0 +jupyter-lsp==2.3.0 +jupyter_client==8.6.3 +jupyter_core==5.8.1 +jupyter_server==2.17.0 +jupyter_server_terminals==0.5.3 +jupyterlab==4.4.7 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +kiwisolver==1.4.9 +lark==1.2.2 +MarkupSafe==3.0.2 +matplotlib==3.10.6 +matplotlib-inline==0.1.7 +mistune==3.1.4 +narwhals==2.5.0 +nbclient==0.10.2 +nbconvert==7.16.6 +nbformat==5.10.4 +nest-asyncio==1.6.0 +notebook_shim==0.2.4 +numpy==2.3.3 +packaging==25.0 +pandas==2.3.2 +pandocfilters==1.5.1 +parso==0.8.5 +pexpect==4.9.0 +pillow==11.3.0 +platformdirs==4.4.0 +polars==1.33.1 +prometheus_client==0.23.1 +prompt_toolkit==3.0.52 +psutil==7.1.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pyarrow==21.0.0 +pycparser==2.23 +Pygments==2.19.2 +pyparsing==3.2.5 +python-dateutil==2.9.0.post0 +python-json-logger==3.3.0 +pytz==2025.2 
"""
Time a grouped aggregation on a Polars LazyFrame, with and without streaming.

Running:
$ python streaming_test.py 500
"""

import functools
import sys
from timeit import Timer

import polars as pl
from data_generation import generate_data

# How many times timeit executes each timed callable.
NUMBER_OF_RUNS = 100


def create_polars_lazyframe(test_data):
    """Build a Polars LazyFrame from test_data."""
    return pl.LazyFrame(test_data)


def analyze_polars_lazyframe(polars_lf):
    """Sum sales_income per region, product and sales_person.

    Collects with the default collect() call and returns the result.
    """
    return (
        polars_lf.group_by(["region", "product", "sales_person"])
        .agg(total_sales=pl.col("sales_income").sum())
        .collect()
    )


def analyze_polars_streaming(polars_lf):
    """Run the same grouped sum, collecting with engine="streaming".

    Returns the collected result.
    """
    return (
        polars_lf.group_by(["region", "product", "sales_person"])
        .agg(total_sales=pl.col("sales_income").sum())
        .collect(engine="streaming")
    )


def _parse_row_count(argv):
    """Return the row count passed as the first command-line argument.

    Exits with a usage message instead of a traceback when the argument
    is missing or is not an integer.
    """
    try:
        return int(argv[1])
    except (IndexError, ValueError):
        raise SystemExit(
            "Usage: python streaming_test.py <number_of_rows>"
        ) from None


def _time_call(function, argument):
    """Return the time timeit reports for NUMBER_OF_RUNS calls."""
    timer = Timer(functools.partial(function, argument))
    return timer.timeit(NUMBER_OF_RUNS)


def main():
    """Time the LazyFrame analysis with and without engine="streaming"."""
    # Parse once, before any work, so a bad argument fails cleanly.
    number_of_rows = _parse_row_count(sys.argv)

    test_data = generate_data(number_of_rows)

    polars_lf = create_polars_lazyframe(test_data)

    print(f"Polars lazyframe analysis time for {number_of_rows:,} rows:")
    print(_time_call(analyze_polars_lazyframe, polars_lf))

    print(f"\nPolars streaming analysis time for {number_of_rows:,} rows:")
    print(_time_call(analyze_polars_streaming, polars_lf))


if __name__ == "__main__":
    main()