diff --git a/duckdb/Code Download.ipynb b/duckdb/Code Download.ipynb new file mode 100644 index 0000000000..eb5d98c0ec --- /dev/null +++ b/duckdb/Code Download.ipynb @@ -0,0 +1,505 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "d9515c08-c3cc-4b67-8298-cb5e6da8fae8", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install duckdb" + ] + }, + { + "cell_type": "markdown", + "id": "de891a8f-ef46-418e-81f1-bb400a20ffe0", + "metadata": {}, + "source": [ + "# Getting Started With DuckDB" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "542e9da3-c459-45de-84d6-7b6287422e4e", + "metadata": {}, + "outputs": [], + "source": [ + "import duckdb\n", + "\n", + "duckdb.sql(\"SELECT 'whistling_duck' AS waterfowl, 'whistle' AS call\")" + ] + }, + { + "cell_type": "markdown", + "id": "3c9020b1-a8b7-4dde-a70d-7ddc35d23dac", + "metadata": {}, + "source": [ + "## Creating a Database From a Data Source" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ed567c9-06e3-4793-a43e-a6a9a985b4a8", + "metadata": {}, + "outputs": [], + "source": [ + "import duckdb\n", + "\n", + "conn = duckdb.connect(database=\"presidents.db\")\n", + "\n", + "presidents_relation = conn.read_parquet(\"presidents.parquet\")\n", + "\n", + "conn.sql(\n", + " \"\"\"\n", + " SELECT sequence, last_name, first_name\n", + " FROM presidents_relation\n", + " WHERE sequence <= 2\n", + " \"\"\"\n", + ").show()\n", + "\n", + "presidents_relation.to_table(\"presidents\")\n", + "\n", + "conn.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88f82cd9-9056-4cdc-9ff1-9827d0900792", + "metadata": {}, + "outputs": [], + "source": [ + "with duckdb.connect(database=\"presidents.db\") as conn:\n", + " conn.sql(\n", + " \"\"\"\n", + " SELECT last_name, first_name\n", + " FROM presidents\n", + " WHERE last_name = 'Adams' \n", + " \"\"\"\n", + " ).show()" + ] + }, + { + "cell_type": "markdown", + "id": "a4d1b371-f16e-4a81-85fe-37411b3b2824", + "metadata": {}, + "source": [ + "**This code won't work.**\n", + "\n", + "with duckdb.connect(database=\"presidents.db\") as conn:\n", + " conn.sql(\"SELECT * FROM presidents_relation\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "d87920b4-cf7b-462b-8f89-7ab40d6ba8fa", + "metadata": {}, + "source": [ + "## Correcting Your Data Import Errors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "491262f1-8841-459a-9fff-4d9b23caa4bb", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import duckdb\n", + "\n", + "with duckdb.connect(database=\"presidents.db\") as conn:\n", + " presidents_relation = conn.read_csv(\"presidents.csv\")\n", + " print(presidents_relation.limit(2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1b48835-4f78-4be1-93b9-4b563a91238b", + "metadata": {}, + "outputs": [], + "source": [ + "import duckdb\n", + "\n", + "with duckdb.connect(database=\"presidents.db\") as conn:\n", + " presidents_relation = conn.read_csv(\n", + " \"presidents.csv\", date_format=\"%B %d %Y\"\n", + " )\n", + " print(presidents_relation.dtypes)" + ] + }, + { + "cell_type": "markdown", + "id": "281fc954-cbe8-4e13-bc35-13ead7d28fcb", + "metadata": {}, + "source": [ + "# Querying the Database\n", + "## Querying Tables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8342da7-a20e-49f7-a156-18b875734f52", + "metadata": {}, + "outputs": [], + "source": [ + "import duckdb\n", + "\n", + "with duckdb.connect(database=\"presidents.db\") as conn:\n", + " conn.read_json(\"parties.json\").to_table(\"parties\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b795c6e-df4b-4fd7-b6cd-aff308740c1c", + "metadata": {}, + "outputs": [], + "source": [ + "with duckdb.connect(\"presidents.db\") as conn:\n", + " print(\n", + " conn.sql(\n", + " \"\"\"\n", + " SELECT first_name, last_name, party_name\n", + " FROM presidents\n", + " JOIN parties\n", + " ON presidents.party_id = parties.party_id\n", + " WHERE party_name = 'Whig'\n", + " ORDER BY last_name DESC\n", + " \"\"\"\n", + " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "5e7da17a-6653-4c6b-85e9-61bbaad49ce5", + "metadata": {}, + "source": [ + "## Querying Relations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11437a3f-c64e-4ba1-ab14-6d98f808c5f1", + "metadata": {}, + "outputs": [], + "source": [ + "import duckdb\n", + "\n", + "presidents = duckdb.read_parquet(\"presidents.parquet\")\n", + "parties = duckdb.read_json(\"parties.json\")\n", + "\n", + "duckdb.sql(\n", + " \"\"\"\n", + " SELECT first_name, last_name, party_name\n", + " FROM presidents\n", + " JOIN parties\n", + " ON presidents.party_id = parties.party_id\n", + " WHERE party_name = 'Whig'\n", + " ORDER BY last_name DESC\n", + " \"\"\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2223996-ed7f-49cb-8aa7-34c4e4cf45c6", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import duckdb\n", + "\n", + "presidents = duckdb.read_parquet(\"presidents.parquet\").set_alias(\"presidents\")\n", + "parties = duckdb.read_json(\"parties.json\").set_alias(\"parties\")\n", + "\n", + "(\n", + " presidents.join(parties, \"presidents.party_id = parties.party_id\")\n", + " .select(\"first_name\", \"last_name\", \"party_name\")\n", + " .filter(\"party_name = 'Whig'\")\n", + " .order(\"last_name DESC\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fffdaf5-447c-4918-9809-0a8f50620b05", + "metadata": {}, + "outputs": [], + "source": [ + "leaders = duckdb.read_parquet(\"presidents.parquet\").set_alias(\"usa_presidents\")\n", + "faction = duckdb.read_json(\"parties.json\").set_alias(\"political_parties\")\n", + "\n", + "(\n", + " leaders.join(\n", + " faction, \"usa_presidents.party_id = political_parties.party_id\"\n", + " )\n", + " .select(\"first_name\", \"last_name\", \"party_name\")\n", + " .filter(\"party_name = 'Whig'\")\n", + " .order(\"last_name DESC\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8c300c6e-1b5e-41e6-bbdf-04ec2ce04b8c", + "metadata": {}, + "source": [ + "# Using Concurrency\n", + "## Performing Concurrent Reads" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03637669-43fb-4593-91e6-c44b425399b3", + "metadata": {}, + "outputs": [], + "source": [ + "from concurrent.futures import ThreadPoolExecutor\n", + "import duckdb\n", + "\n", + "\n", + "def read_data(thread_id):\n", + " print(f\"Thread {thread_id} starting its read.\")\n", + " with duckdb.connect(\"presidents.db\") as conn:\n", + " conn.sql(\n", + " \"\"\"\n", + " SELECT first_name, last_name\n", + " FROM presidents\n", + " WHERE sequence = 1\n", + " \"\"\"\n", + " ).show()\n", + " print(f\"Thread {thread_id} ending its read.\")\n", + "\n", + "\n", + "def concurrent_read():\n", + " with ThreadPoolExecutor(max_workers=3) as executor:\n", + " executor.map(read_data, range(3))\n", + "\n", + "\n", + "concurrent_read()" + ] + }, + { + "cell_type": "markdown", + "id": "c231f1c1-8aa4-4efe-b54e-4c0ff53d1d7e", + "metadata": {}, + "source": [ + "## Performing Concurrent Writes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65384b11-a04d-456a-956d-49ccc5057bf4", + "metadata": {}, + "outputs": [], + "source": [ + "from concurrent.futures import ThreadPoolExecutor\n", + "import duckdb\n", + "\n", + "\n", + "def update_data(thread_id):\n", + " new_name = f\"George ({thread_id})\"\n", + " with duckdb.connect(\"presidents.db\") as conn:\n", + " print(f\"Thread {thread_id} starting its update.\")\n", + " conn.sql(\n", + " f\"\"\"\n", + " UPDATE presidents\n", + " SET first_name = '{new_name}'\n", + " WHERE sequence = 1\n", + " \"\"\"\n", + " )\n", + " print(f\"Thread {thread_id} ending its update.\")\n", + "\n", + "\n", + "def concurrent_update():\n", + " with ThreadPoolExecutor(max_workers=3) as executor:\n", + " executor.map(update_data, range(3))\n", + "\n", + "\n", + "concurrent_update()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "892a7a41-89b2-4ceb-8e6c-221d417520d0", + "metadata": {}, + "outputs": [], + "source": [ + "with duckdb.connect(\"presidents.db\") as conn:\n", + " print(\n", + " conn.sql(\n", + " \"\"\"\n", + " SELECT last_name, first_name\n", + " FROM presidents\n", + " WHERE sequence = 1\n", + " \"\"\"\n", + " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "bd5f57cf-b8a0-4833-9675-bcc7a6907124", + "metadata": {}, + "source": [ + "# Integrating DuckDB Within Your Python Environment\n", + "## Creating Python Functions for DuckDB to Use" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ecd1c32-608a-43a1-a3fd-25519585a4e5", + "metadata": {}, + "outputs": [], + "source": [ + "import duckdb\n", + "\n", + "\n", + "def short_name(first_name: str, last_name: str) -> str:\n", + " return f\"{first_name[0]}. {last_name}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49f50c03-2557-4637-98c9-e114a47de8f9", + "metadata": {}, + "outputs": [], + "source": [ + "short_name(\"Abraham\", \"Lincoln\")" + ] + }, + { + "cell_type": "markdown", + "id": "b2772cd3-0f3e-48ab-99c2-949468cb57e8", + "metadata": {}, + "source": [ + "**Only run this code if you need to update your existing `short_name()` function**\n", + "\n", + "duckdb.remove_function(\"short_name\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e001fe4a-498a-496e-901d-871b922e4231", + "metadata": {}, + "outputs": [], + "source": [ + "duckdb.create_function(\"short_name\", short_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6a94bdc-4a89-41b3-921a-8142b43dcebe", + "metadata": {}, + "outputs": [], + "source": [ + "presidents = duckdb.read_parquet(\"presidents.parquet\")\n", + "\n", + "duckdb.sql(\n", + " \"\"\" \n", + " SELECT short_name(first_name::VARCHAR, last_name::VARCHAR) AS name,\n", + " (term_end - term_start) AS \"days in office\"\n", + " FROM presidents\n", + " \"\"\"\n", + ").limit(3)" + ] + }, + { + "cell_type": "markdown", + "id": "2181faed-0d54-470c-9174-f8f0a681069f", + "metadata": {}, + "source": [ + "# Using Polars and pandas With DuckDB" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3934e0e3-7875-40ad-9882-a06c87030c6f", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install pandas polars pyarrow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77a11ef6-308e-4279-94f1-a38b1b046874", + "metadata": {}, + "outputs": [], + "source": [ + "import duckdb\n", + "\n", + "with duckdb.connect(\"presidents.db\") as conn:\n", + " pandas_df = conn.sql(\n", + " \"\"\"\n", + " SELECT last_name, first_name\n", + " FROM presidents\n", + " WHERE sequence BETWEEN 2 AND 5\n", + " \"\"\"\n", + " ).df()\n", + "\n", + "pandas_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d348d5dc-ff6a-4a61-9924-24ff345a63bd", + "metadata": {}, + "outputs": [], + "source": [ + "import duckdb\n", + "\n", + "presidents = duckdb.read_parquet(\"presidents.parquet\").set_alias(\"presidents\")\n", + "parties = duckdb.read_json(\"parties.json\").set_alias(\"parties\")\n", + "\n", + "(\n", + " presidents.join(parties, \"presidents.party_id = parties.party_id\")\n", + " .select(\"first_name\", \"last_name\", \"party_name\")\n", + " .order(\"last_name DESC\")\n", + ").pl().head(3)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/duckdb/README.md b/duckdb/README.md new file mode 100644 index 0000000000..6663e007c5 --- /dev/null +++ b/duckdb/README.md @@ -0,0 +1,9 @@ +These files will allow you to work along with the [Introducing DuckDB](https://realpython.com/introducing-duckdb/) tutorial. + +The files are: + +Code Download.ipynb - Contains the code you see in the tutorial. +parties.json - JSON file containing political party information. +presidents.csv - CSV file containing president information. +presidents.parquet - Parquet file containing president information. + \ No newline at end of file diff --git a/duckdb/parties.json b/duckdb/parties.json new file mode 100644 index 0000000000..b572389988 --- /dev/null +++ b/duckdb/parties.json @@ -0,0 +1,6 @@ +{"party_id":10,"party_name":"Unaffiliated"} +{"party_id":20,"party_name":"Federalist"} +{"party_id":30,"party_name":"Democratic Republican"} +{"party_id":40,"party_name":"Democratic"} +{"party_id":50,"party_name":"Whig"} +{"party_id":60,"party_name":"Republican"} \ No newline at end of file diff --git a/duckdb/presidents.csv b/duckdb/presidents.csv new file mode 100644 index 0000000000..313ff1342a --- /dev/null +++ b/duckdb/presidents.csv @@ -0,0 +1,48 @@ +sequence,last_name,first_name,term_start,term_end,party_id +1,Washington,George,April 30 1789,March 04 1797,10 +2,Adams,John,March 04 1797,March 04 1801,20 +3,Jefferson,Thomas,March 04 1801,March 04 1809,30 +4,Madison,James,March 04 1809,March 04 1817,30 +5,Monroe,James,March 04 1817,March 04 1825,30 +6,Adams,John Quincy,March 04 1825,March 04 1829,30 +7,Jackson,Andrew,March 04 1829,March 04 1837,40 +8,Van Buren,Martin,March 04 1837,March 04 1841,40 +9,Harrison,William Henry,March 04 1841,April 04 1841,50 +10,Tyler,John,April 04 1841,March 04 1845,50 +11,Polk,James K.,March 04 1845,March 04 1849,40 +12,Taylor,Zachary,March 04 1849,July 09 1850,50 +13,Fillmore,Millard,July 09 1850,March 04 1853,50 +14,Pierce,Franklin,March 04 1853,March 04 1857,40 +15,Buchanan,James,March 04 1857,March 04 1861,40 +16,Lincoln,Abraham,March 04 1861,April 15 1865,60 +17,Johnson,Andrew,April 15 1865,March 04 1869,40 +18,Grant,Ulysses S,March 04 1869,March 04 1877,60 +19,Hayes,Rutherford B,March 04 1877,March 04 1881,60 +20,Garfield,James,March 04 1881,September 19 1881,60 +21,Arthur,Chester A,September 19 1881,March 04 1885,60 +22,Cleveland,Grover,March 04 1885,March 04 1889,40 +23,Harrison,Benjamin,March 04 1889,March 04 1893,60 +24,Cleveland,Grover,March 04 1893,March 04 1897,40 +25,McKinlay,William,March 04 1897,September 14 1901,60 +26,Roosevelt,Theodore,September 14 1901,March 04 1909,60 +27,Taft,Willam H.,March 04 1909,March 04 1913,60 +28,Wilson,Woodrow,March 04 1913,March 04 1921,40 +29,Harding,Warren G.,March 04 1921,August 02 1923,60 +30,Coolidge,Calvin,August 02 1923,March 04 1929,60 +31,Hoover,Herbert,March 04 1929,March 04 1933,60 +32,Roosevelt,Franklin D.,March 04 1933,April 12 1945,40 +33,Truman,Harry S,April 12 1945,January 20 1953,40 +34,Eisenhower,Dwight D.,January 20 1953,January 20 1961,60 +35,Kennedy,John F.,January 20 1961,November 22 1963,40 +36,Johnson,Lyndon,November 22 1963,January 20 1969,40 +37,Nixon,Richard,January 20 1969,August 09 1974,60 +38,Ford,Gerald,August 09 1974,January 20 1977,60 +39,Carter,Jimmy,January 20 1977,January 20 1981,40 +40,Reagan,Ronald,January 20 1981,January 20 1989,60 +41,Bush,George H. W.,January 20 1989,January 20 1993,60 +42,Clinton,Bill,January 20 1993,January 20 2001,40 +43,Bush,George W.,January 20 2001,January 20 2009,60 +44,Obama,Barack,January 20 2009,January 20 2017,40 +45,Trump,Donald,January 20 2017,January 20 2021,60 +46,Biden,Joe,January 20 2021,January 20 2025,40 +47,Trump,Donald,January 20 2025,January 20 2029,60 diff --git a/duckdb/presidents.parquet b/duckdb/presidents.parquet new file mode 100644 index 0000000000..aa749a36e4 Binary files /dev/null and b/duckdb/presidents.parquet differ