Skip to content

Commit 6eb9ce6

Browse files
committed
[add] pandas group and categorical data type
1 parent 808d877 commit 6eb9ce6

File tree

2 files changed

+327
-1
lines changed

2 files changed

+327
-1
lines changed

README.md

+16-1
Original file line numberDiff line numberDiff line change
@@ -1658,14 +1658,29 @@ s.str.split("-").str.get(0)
16581658
## [Concat and Merge](pandas/concat_and_merge.ipynb)
16591659

16601660
``` py
1661+
# Concat rows
16611662
pd.concat([df[:3], df.iloc[7:, :2]])
16621663

1664+
# Merge two DataFrames
16631665
pd.merge(df, df2, on="name", how="right")
16641666
```
16651667

1666-
## Grouping and Categorical
1668+
## [Grouping and Categorical Data Type](pandas/grouping_categorical.ipynb)
16671669

1670+
``` py
1671+
# Groupby
1672+
df.groupby("col_A").sum()
1673+
df.groupby(["col_A", "col_B"]).max()
1674+
1675+
# Categorical - discrete
1676+
df["grade"] = df["grade"].astype("category")
1677+
# Setting `.cat.categories` directly was removed in pandas 2.0; rename_categories returns a new Series
df["grade"] = df["grade"].cat.rename_categories(["Bad", "Good", "Excellent"])
1678+
df.sort_values(by="grade")
1679+
df.groupby("grade").size()
16681680

1681+
# Categorical - continuous
1682+
df["grade-labels"] = pd.cut(df["score"], bins=range(0, 120, 20), labels=list("EDCBA"))
1683+
```
16691684

16701685
# TODOs
16711686

pandas/grouping_categorical.ipynb

+311
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,311 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Grouping and Categorical Data Type"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": 40,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
16+
"import numpy as np\n",
17+
"import pandas as pd"
18+
]
19+
},
20+
{
21+
"cell_type": "markdown",
22+
"metadata": {},
23+
"source": [
24+
"## Grouping"
25+
]
26+
},
27+
{
28+
"cell_type": "code",
29+
"execution_count": 41,
30+
"metadata": {},
31+
"outputs": [
32+
{
33+
"data": {
34+
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>A</th>\n <th>B</th>\n <th>C</th>\n <th>D</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>foo</td>\n <td>1</td>\n <td>0.773956</td>\n <td>0.652299</td>\n </tr>\n <tr>\n <th>1</th>\n <td>foo</td>\n <td>2</td>\n <td>0.438878</td>\n <td>0.043775</td>\n </tr>\n <tr>\n <th>2</th>\n <td>foo</td>\n <td>1</td>\n <td>0.858598</td>\n <td>0.020030</td>\n </tr>\n <tr>\n <th>3</th>\n <td>foo</td>\n <td>2</td>\n <td>0.697368</td>\n <td>0.839213</td>\n </tr>\n <tr>\n <th>4</th>\n <td>bar</td>\n <td>1</td>\n <td>0.094177</td>\n <td>0.587143</td>\n </tr>\n <tr>\n <th>5</th>\n <td>bar</td>\n <td>2</td>\n <td>0.975622</td>\n <td>0.224705</td>\n </tr>\n <tr>\n <th>6</th>\n <td>bar</td>\n <td>3</td>\n <td>0.761140</td>\n <td>0.751792</td>\n </tr>\n <tr>\n <th>7</th>\n <td>bar</td>\n <td>2</td>\n <td>0.786064</td>\n <td>0.263692</td>\n </tr>\n </tbody>\n</table>\n</div>",
35+
"text/plain": " A B C D\n0 foo 1 0.773956 0.652299\n1 foo 2 0.438878 0.043775\n2 foo 1 0.858598 0.020030\n3 foo 2 0.697368 0.839213\n4 bar 1 0.094177 0.587143\n5 bar 2 0.975622 0.224705\n6 bar 3 0.761140 0.751792\n7 bar 2 0.786064 0.263692"
36+
},
37+
"execution_count": 41,
38+
"metadata": {},
39+
"output_type": "execute_result"
40+
}
41+
],
42+
"source": [
43+
"df = pd.DataFrame(\n",
44+
" {\n",
45+
" \"A\": [\"foo\"] * 4 + [\"bar\"] * 4,\n",
46+
" \"B\": [1, 2, 1, 2, 1, 2, 3, 2],\n",
47+
" \"C\": np.random.default_rng(42).random(8),\n",
48+
" \"D\": np.random.default_rng(43).random(8),\n",
49+
" }\n",
50+
")\n",
51+
"df"
52+
]
53+
},
54+
{
55+
"cell_type": "code",
56+
"execution_count": 42,
57+
"metadata": {},
58+
"outputs": [
59+
{
60+
"data": {
61+
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>B</th>\n <th>C</th>\n <th>D</th>\n </tr>\n <tr>\n <th>A</th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>bar</th>\n <td>8</td>\n <td>2.617004</td>\n <td>1.827333</td>\n </tr>\n <tr>\n <th>foo</th>\n <td>6</td>\n <td>2.768800</td>\n <td>1.555317</td>\n </tr>\n </tbody>\n</table>\n</div>",
62+
"text/plain": " B C D\nA \nbar 8 2.617004 1.827333\nfoo 6 2.768800 1.555317"
63+
},
64+
"execution_count": 42,
65+
"metadata": {},
66+
"output_type": "execute_result"
67+
}
68+
],
69+
"source": [
70+
"df.groupby(\"A\").sum()"
71+
]
72+
},
73+
{
74+
"cell_type": "code",
75+
"execution_count": 43,
76+
"metadata": {},
77+
"outputs": [
78+
{
79+
"data": {
80+
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>B</th>\n <th>C</th>\n <th>D</th>\n </tr>\n <tr>\n <th>A</th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>bar</th>\n <td>3</td>\n <td>0.975622</td>\n <td>0.751792</td>\n </tr>\n <tr>\n <th>foo</th>\n <td>2</td>\n <td>0.858598</td>\n <td>0.839213</td>\n </tr>\n </tbody>\n</table>\n</div>",
81+
"text/plain": " B C D\nA \nbar 3 0.975622 0.751792\nfoo 2 0.858598 0.839213"
82+
},
83+
"execution_count": 43,
84+
"metadata": {},
85+
"output_type": "execute_result"
86+
}
87+
],
88+
"source": [
89+
"df.groupby(\"A\").max()"
90+
]
91+
},
92+
{
93+
"cell_type": "code",
94+
"execution_count": 44,
95+
"metadata": {},
96+
"outputs": [
97+
{
98+
"data": {
99+
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th></th>\n <th>C</th>\n <th>D</th>\n </tr>\n <tr>\n <th>A</th>\n <th>B</th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th rowspan=\"3\" valign=\"top\">bar</th>\n <th>1</th>\n <td>0.094177</td>\n <td>0.587143</td>\n </tr>\n <tr>\n <th>2</th>\n <td>1.761687</td>\n <td>0.488397</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0.761140</td>\n <td>0.751792</td>\n </tr>\n <tr>\n <th rowspan=\"2\" valign=\"top\">foo</th>\n <th>1</th>\n <td>1.632554</td>\n <td>0.672329</td>\n </tr>\n <tr>\n <th>2</th>\n <td>1.136246</td>\n <td>0.882988</td>\n </tr>\n </tbody>\n</table>\n</div>",
100+
"text/plain": " C D\nA B \nbar 1 0.094177 0.587143\n 2 1.761687 0.488397\n 3 0.761140 0.751792\nfoo 1 1.632554 0.672329\n 2 1.136246 0.882988"
101+
},
102+
"execution_count": 44,
103+
"metadata": {},
104+
"output_type": "execute_result"
105+
}
106+
],
107+
"source": [
108+
"df.groupby([\"A\", \"B\"]).sum()"
109+
]
110+
},
111+
{
112+
"cell_type": "markdown",
113+
"metadata": {},
114+
"source": [
115+
"## Categorical Data Type"
116+
]
117+
},
118+
{
119+
"cell_type": "markdown",
120+
"metadata": {},
121+
"source": [
122+
"### Discrete Values"
123+
]
124+
},
125+
{
126+
"cell_type": "code",
127+
"execution_count": 45,
128+
"metadata": {},
129+
"outputs": [
130+
{
131+
"data": {
132+
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>grade</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>a</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>b</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>e</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4</td>\n <td>a</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5</td>\n <td>a</td>\n </tr>\n <tr>\n <th>5</th>\n <td>6</td>\n <td>c</td>\n </tr>\n </tbody>\n</table>\n</div>",
133+
"text/plain": " id grade\n0 1 a\n1 2 b\n2 3 e\n3 4 a\n4 5 a\n5 6 c"
134+
},
135+
"execution_count": 45,
136+
"metadata": {},
137+
"output_type": "execute_result"
138+
}
139+
],
140+
"source": [
141+
"df = pd.DataFrame(\r\n",
142+
" {\"id\": [1, 2, 3, 4, 5, 6], \r\n",
143+
" \"grade\": [\"a\", \"b\", \"e\", \"a\", \"a\", \"c\"]}\r\n",
144+
")\r\n",
145+
"\r\n",
146+
"df[\"grade\"] = df[\"grade\"].astype(\"category\")\r\n",
147+
"df"
148+
]
149+
},
150+
{
151+
"cell_type": "code",
152+
"execution_count": 46,
153+
"metadata": {},
154+
"outputs": [
155+
{
156+
"data": {
157+
"text/plain": "0 a\n1 b\n2 e\n3 a\n4 a\n5 c\nName: grade, dtype: category\nCategories (4, object): ['a', 'b', 'c', 'e']"
158+
},
159+
"execution_count": 46,
160+
"metadata": {},
161+
"output_type": "execute_result"
162+
}
163+
],
164+
"source": [
165+
"df[\"grade\"]"
166+
]
167+
},
168+
{
169+
"cell_type": "code",
170+
"execution_count": 47,
171+
"metadata": {},
172+
"outputs": [
173+
{
174+
"data": {
175+
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>grade</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>2</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>4</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4</td>\n <td>1</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5</td>\n <td>1</td>\n </tr>\n <tr>\n <th>5</th>\n <td>6</td>\n <td>3</td>\n </tr>\n </tbody>\n</table>\n</div>",
176+
"text/plain": " id grade\n0 1 1\n1 2 2\n2 3 4\n3 4 1\n4 5 1\n5 6 3"
177+
},
178+
"execution_count": 47,
179+
"metadata": {},
180+
"output_type": "execute_result"
181+
}
182+
],
183+
"source": [
184+
"df[\"grade\"] = df[\"grade\"].cat.rename_categories([1, 2, 3, 4])\r\n",
185+
"df"
186+
]
187+
},
188+
{
189+
"cell_type": "code",
190+
"execution_count": 48,
191+
"metadata": {},
192+
"outputs": [
193+
{
194+
"data": {
195+
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>grade</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4</td>\n <td>1</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>2</td>\n </tr>\n <tr>\n <th>5</th>\n <td>6</td>\n <td>3</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>4</td>\n </tr>\n </tbody>\n</table>\n</div>",
196+
"text/plain": " id grade\n0 1 1\n3 4 1\n4 5 1\n1 2 2\n5 6 3\n2 3 4"
197+
},
198+
"execution_count": 48,
199+
"metadata": {},
200+
"output_type": "execute_result"
201+
}
202+
],
203+
"source": [
204+
"df.sort_values(by=\"grade\")"
205+
]
206+
},
207+
{
208+
"cell_type": "code",
209+
"execution_count": 49,
210+
"metadata": {},
211+
"outputs": [
212+
{
213+
"data": {
214+
"text/plain": "grade\n1 3\n2 1\n3 1\n4 1\ndtype: int64"
215+
},
216+
"execution_count": 49,
217+
"metadata": {},
218+
"output_type": "execute_result"
219+
}
220+
],
221+
"source": [
222+
"df.groupby(\"grade\").size()"
223+
]
224+
},
225+
{
226+
"cell_type": "markdown",
227+
"metadata": {},
228+
"source": [
229+
"### Continuous Values"
230+
]
231+
},
232+
{
233+
"cell_type": "code",
234+
"execution_count": 50,
235+
"metadata": {},
236+
"outputs": [
237+
{
238+
"data": {
239+
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>score</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>8</td>\n </tr>\n <tr>\n <th>1</th>\n <td>77</td>\n </tr>\n <tr>\n <th>2</th>\n <td>65</td>\n </tr>\n <tr>\n <th>3</th>\n <td>43</td>\n </tr>\n <tr>\n <th>4</th>\n <td>43</td>\n </tr>\n <tr>\n <th>5</th>\n <td>85</td>\n </tr>\n <tr>\n <th>6</th>\n <td>8</td>\n </tr>\n <tr>\n <th>7</th>\n <td>69</td>\n </tr>\n </tbody>\n</table>\n</div>",
240+
"text/plain": " score\n0 8\n1 77\n2 65\n3 43\n4 43\n5 85\n6 8\n7 69"
241+
},
242+
"execution_count": 50,
243+
"metadata": {},
244+
"output_type": "execute_result"
245+
}
246+
],
247+
"source": [
248+
"df = pd.DataFrame({\r\n",
249+
" \"score\": np.random.default_rng(42).integers(0, 100, 8)\r\n",
250+
"})\r\n",
251+
"df"
252+
]
253+
},
254+
{
255+
"cell_type": "code",
256+
"execution_count": 51,
257+
"metadata": {},
258+
"outputs": [
259+
{
260+
"data": {
261+
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>score</th>\n <th>grade-nolabels</th>\n <th>grade-labels</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>8</td>\n <td>(0, 20]</td>\n <td>E</td>\n </tr>\n <tr>\n <th>1</th>\n <td>77</td>\n <td>(60, 80]</td>\n <td>B</td>\n </tr>\n <tr>\n <th>2</th>\n <td>65</td>\n <td>(60, 80]</td>\n <td>B</td>\n </tr>\n <tr>\n <th>3</th>\n <td>43</td>\n <td>(40, 60]</td>\n <td>C</td>\n </tr>\n <tr>\n <th>4</th>\n <td>43</td>\n <td>(40, 60]</td>\n <td>C</td>\n </tr>\n <tr>\n <th>5</th>\n <td>85</td>\n <td>(80, 100]</td>\n <td>A</td>\n </tr>\n <tr>\n <th>6</th>\n <td>8</td>\n <td>(0, 20]</td>\n <td>E</td>\n </tr>\n <tr>\n <th>7</th>\n <td>69</td>\n <td>(60, 80]</td>\n <td>B</td>\n </tr>\n </tbody>\n</table>\n</div>",
262+
"text/plain": " score grade-nolabels grade-labels\n0 8 (0, 20] E\n1 77 (60, 80] B\n2 65 (60, 80] B\n3 43 (40, 60] C\n4 43 (40, 60] C\n5 85 (80, 100] A\n6 8 (0, 20] E\n7 69 (60, 80] B"
263+
},
264+
"execution_count": 51,
265+
"metadata": {},
266+
"output_type": "execute_result"
267+
}
268+
],
269+
"source": [
270+
"labels = list(\"EDCBA\")  # one label per right-closed score bin: (0, 20], (20, 40], (40, 60], (60, 80], (80, 100]\r\n",
271+
"df[\"grade-nolabels\"] = pd.cut(df[\"score\"], bins=range(0, 120, 20))\r\n",
272+
"df[\"grade-labels\"] = pd.cut(df[\"score\"], bins=range(0, 120, 20), labels=labels)\r\n",
273+
"df"
274+
]
275+
},
276+
{
277+
"cell_type": "markdown",
278+
"metadata": {},
279+
"source": [
280+
"# Reference\r\n",
281+
"\r\n",
282+
"- https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html#grouping\r\n",
283+
"- https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html#categoricals\r\n",
284+
"- https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html\r\n",
285+
"- https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html"
286+
]
287+
}
288+
],
289+
"metadata": {
290+
"kernelspec": {
291+
"display_name": "Python 3",
292+
"language": "python",
293+
"name": "python3"
294+
},
295+
"language_info": {
296+
"codemirror_mode": {
297+
"name": "ipython",
298+
"version": 3
299+
},
300+
"file_extension": ".py",
301+
"mimetype": "text/x-python",
302+
"name": "python",
303+
"nbconvert_exporter": "python",
304+
"pygments_lexer": "ipython3",
305+
"version": "3.8.8"
306+
},
307+
"orig_nbformat": 2
308+
},
309+
"nbformat": 4,
310+
"nbformat_minor": 2
311+
}

0 commit comments

Comments
 (0)