1
+ {
2
+ "cells" : [
3
+ {
4
+ "cell_type" : " markdown" ,
5
+ "metadata" : {},
6
+ "source" : [
7
+ " # Grouping and Categorical Data Type"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type" : " code" ,
12
+ "execution_count" : 40 ,
13
+ "metadata" : {},
14
+ "outputs" : [],
15
+ "source" : [
16
+ " import numpy as np\n " ,
17
+ " import pandas as pd"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type" : " markdown" ,
22
+ "metadata" : {},
23
+ "source" : [
24
+ " ## Grouping"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type" : " code" ,
29
+ "execution_count" : 41 ,
30
+ "metadata" : {},
31
+ "outputs" : [
32
+ {
33
+ "data" : {
34
+ "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>A</th>\n <th>B</th>\n <th>C</th>\n <th>D</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>foo</td>\n <td>1</td>\n <td>0.773956</td>\n <td>0.652299</td>\n </tr>\n <tr>\n <th>1</th>\n <td>foo</td>\n <td>2</td>\n <td>0.438878</td>\n <td>0.043775</td>\n </tr>\n <tr>\n <th>2</th>\n <td>foo</td>\n <td>1</td>\n <td>0.858598</td>\n <td>0.020030</td>\n </tr>\n <tr>\n <th>3</th>\n <td>foo</td>\n <td>2</td>\n <td>0.697368</td>\n <td>0.839213</td>\n </tr>\n <tr>\n <th>4</th>\n <td>bar</td>\n <td>1</td>\n <td>0.094177</td>\n <td>0.587143</td>\n </tr>\n <tr>\n <th>5</th>\n <td>bar</td>\n <td>2</td>\n <td>0.975622</td>\n <td>0.224705</td>\n </tr>\n <tr>\n <th>6</th>\n <td>bar</td>\n <td>3</td>\n <td>0.761140</td>\n <td>0.751792</td>\n </tr>\n <tr>\n <th>7</th>\n <td>bar</td>\n <td>2</td>\n <td>0.786064</td>\n <td>0.263692</td>\n </tr>\n </tbody>\n</table>\n</div>",
35
+ "text/plain" : " A B C D\n 0 foo 1 0.773956 0.652299\n 1 foo 2 0.438878 0.043775\n 2 foo 1 0.858598 0.020030\n 3 foo 2 0.697368 0.839213\n 4 bar 1 0.094177 0.587143\n 5 bar 2 0.975622 0.224705\n 6 bar 3 0.761140 0.751792\n 7 bar 2 0.786064 0.263692"
36
+ },
37
+ "execution_count" : 41 ,
38
+ "metadata" : {},
39
+ "output_type" : " execute_result"
40
+ }
41
+ ],
42
+ "source" : [
43
+ " df = pd.DataFrame(\n " ,
44
+ " {\n " ,
45
+ " \" A\" : [\" foo\" ] * 4 + [\" bar\" ] * 4,\n " ,
46
+ " \" B\" : [1, 2, 1, 2, 1, 2, 3, 2],\n " ,
47
+ " \" C\" : np.random.default_rng(42).random(8),\n " ,
48
+ " \" D\" : np.random.default_rng(43).random(8),\n " ,
49
+ " }\n " ,
50
+ " )\n " ,
51
+ " df"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type" : " code" ,
56
+ "execution_count" : 42 ,
57
+ "metadata" : {},
58
+ "outputs" : [
59
+ {
60
+ "data" : {
61
+ "text/html" : " <div>\n <style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n </style>\n <table border=\" 1\" class=\" dataframe\" >\n <thead>\n <tr style=\" text-align: right;\" >\n <th></th>\n <th>B</th>\n <th>C</th>\n <th>D</th>\n </tr>\n <tr>\n <th>A</th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>bar</th>\n <td>8</td>\n <td>2.617004</td>\n <td>1.827333</td>\n </tr>\n <tr>\n <th>foo</th>\n <td>6</td>\n <td>2.768800</td>\n <td>1.555317</td>\n </tr>\n </tbody>\n </table>\n </div>" ,
62
+ "text/plain" : " B C D\n A \n bar 8 2.617004 1.827333\n foo 6 2.768800 1.555317"
63
+ },
64
+ "execution_count" : 42 ,
65
+ "metadata" : {},
66
+ "output_type" : " execute_result"
67
+ }
68
+ ],
69
+ "source" : [
70
+ " df.groupby(\" A\" ).sum()"
71
+ ]
72
+ },
73
+ {
74
+ "cell_type" : " code" ,
75
+ "execution_count" : 43 ,
76
+ "metadata" : {},
77
+ "outputs" : [
78
+ {
79
+ "data" : {
80
+ "text/html" : " <div>\n <style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n </style>\n <table border=\" 1\" class=\" dataframe\" >\n <thead>\n <tr style=\" text-align: right;\" >\n <th></th>\n <th>B</th>\n <th>C</th>\n <th>D</th>\n </tr>\n <tr>\n <th>A</th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>bar</th>\n <td>3</td>\n <td>0.975622</td>\n <td>0.751792</td>\n </tr>\n <tr>\n <th>foo</th>\n <td>2</td>\n <td>0.858598</td>\n <td>0.839213</td>\n </tr>\n </tbody>\n </table>\n </div>" ,
81
+ "text/plain" : " B C D\n A \n bar 3 0.975622 0.751792\n foo 2 0.858598 0.839213"
82
+ },
83
+ "execution_count" : 43 ,
84
+ "metadata" : {},
85
+ "output_type" : " execute_result"
86
+ }
87
+ ],
88
+ "source" : [
89
+ " df.groupby(\" A\" ).max()"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type" : " code" ,
94
+ "execution_count" : 44 ,
95
+ "metadata" : {},
96
+ "outputs" : [
97
+ {
98
+ "data" : {
99
+ "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th></th>\n <th>C</th>\n <th>D</th>\n </tr>\n <tr>\n <th>A</th>\n <th>B</th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th rowspan=\"3\" valign=\"top\">bar</th>\n <th>1</th>\n <td>0.094177</td>\n <td>0.587143</td>\n </tr>\n <tr>\n <th>2</th>\n <td>1.761687</td>\n <td>0.488397</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0.761140</td>\n <td>0.751792</td>\n </tr>\n <tr>\n <th rowspan=\"2\" valign=\"top\">foo</th>\n <th>1</th>\n <td>1.632554</td>\n <td>0.672329</td>\n </tr>\n <tr>\n <th>2</th>\n <td>1.136246</td>\n <td>0.882988</td>\n </tr>\n </tbody>\n</table>\n</div>",
100
+ "text/plain" : " C D\n A B \n bar 1 0.094177 0.587143\n 2 1.761687 0.488397\n 3 0.761140 0.751792\n foo 1 1.632554 0.672329\n 2 1.136246 0.882988"
101
+ },
102
+ "execution_count" : 44 ,
103
+ "metadata" : {},
104
+ "output_type" : " execute_result"
105
+ }
106
+ ],
107
+ "source" : [
108
+ " df.groupby([\" A\" , \" B\" ]).sum()"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type" : " markdown" ,
113
+ "metadata" : {},
114
+ "source" : [
115
+ " ## Categorical Data Type"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type" : " markdown" ,
120
+ "metadata" : {},
121
+ "source" : [
122
+ " ### Discrete Values"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type" : " code" ,
127
+ "execution_count" : 45 ,
128
+ "metadata" : {},
129
+ "outputs" : [
130
+ {
131
+ "data" : {
132
+ "text/html" : " <div>\n <style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n </style>\n <table border=\" 1\" class=\" dataframe\" >\n <thead>\n <tr style=\" text-align: right;\" >\n <th></th>\n <th>id</th>\n <th>grade</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>a</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>b</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>e</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4</td>\n <td>a</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5</td>\n <td>a</td>\n </tr>\n <tr>\n <th>5</th>\n <td>6</td>\n <td>c</td>\n </tr>\n </tbody>\n </table>\n </div>" ,
133
+ "text/plain" : " id grade\n 0 1 a\n 1 2 b\n 2 3 e\n 3 4 a\n 4 5 a\n 5 6 c"
134
+ },
135
+ "execution_count" : 45 ,
136
+ "metadata" : {},
137
+ "output_type" : " execute_result"
138
+ }
139
+ ],
140
+ "source" : [
141
+ "df = pd.DataFrame(\r\n",
142
+ "    {\r\n",
143
+ "        \"id\": [1, 2, 3, 4, 5, 6],\r\n",
144
+ "        \"grade\": [\"a\", \"b\", \"e\", \"a\", \"a\", \"c\"],\r\n",
145
+ "    }\r\n",
146
+ ").astype({\"grade\": \"category\"})  # cast only the grade column to the categorical dtype\r\n",
147
+ "df"
148
+ ]
149
+ },
150
+ {
151
+ "cell_type" : " code" ,
152
+ "execution_count" : 46 ,
153
+ "metadata" : {},
154
+ "outputs" : [
155
+ {
156
+ "data" : {
157
+ "text/plain" : " 0 a\n 1 b\n 2 e\n 3 a\n 4 a\n 5 c\n Name: grade, dtype: category\n Categories (4, object): ['a', 'b', 'c', 'e']"
158
+ },
159
+ "execution_count" : 46 ,
160
+ "metadata" : {},
161
+ "output_type" : " execute_result"
162
+ }
163
+ ],
164
+ "source" : [
165
+ " df[\" grade\" ]"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type" : " code" ,
170
+ "execution_count" : 47 ,
171
+ "metadata" : {},
172
+ "outputs" : [
173
+ {
174
+ "data" : {
175
+ "text/html" : " <div>\n <style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n </style>\n <table border=\" 1\" class=\" dataframe\" >\n <thead>\n <tr style=\" text-align: right;\" >\n <th></th>\n <th>id</th>\n <th>grade</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>2</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>4</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4</td>\n <td>1</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5</td>\n <td>1</td>\n </tr>\n <tr>\n <th>5</th>\n <td>6</td>\n <td>3</td>\n </tr>\n </tbody>\n </table>\n </div>" ,
176
+ "text/plain" : " id grade\n 0 1 1\n 1 2 2\n 2 3 4\n 3 4 1\n 4 5 1\n 5 6 3"
177
+ },
178
+ "execution_count" : 47 ,
179
+ "metadata" : {},
180
+ "output_type" : " execute_result"
181
+ }
182
+ ],
183
+ "source" : [
184
+ "df[\"grade\"] = df[\"grade\"].cat.rename_categories([1, 2, 3, 4])  # categories cannot be assigned directly on pandas >= 2.0; rename_categories returns a new Series\r\n",
185
+ "df"
186
+ ]
187
+ },
188
+ {
189
+ "cell_type" : " code" ,
190
+ "execution_count" : 48 ,
191
+ "metadata" : {},
192
+ "outputs" : [
193
+ {
194
+ "data" : {
195
+ "text/html" : " <div>\n <style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n </style>\n <table border=\" 1\" class=\" dataframe\" >\n <thead>\n <tr style=\" text-align: right;\" >\n <th></th>\n <th>id</th>\n <th>grade</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4</td>\n <td>1</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>2</td>\n </tr>\n <tr>\n <th>5</th>\n <td>6</td>\n <td>3</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>4</td>\n </tr>\n </tbody>\n </table>\n </div>" ,
196
+ "text/plain" : " id grade\n 0 1 1\n 3 4 1\n 4 5 1\n 1 2 2\n 5 6 3\n 2 3 4"
197
+ },
198
+ "execution_count" : 48 ,
199
+ "metadata" : {},
200
+ "output_type" : " execute_result"
201
+ }
202
+ ],
203
+ "source" : [
204
+ " df.sort_values(by=\" grade\" )"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type" : " code" ,
209
+ "execution_count" : 49 ,
210
+ "metadata" : {},
211
+ "outputs" : [
212
+ {
213
+ "data" : {
214
+ "text/plain" : " grade\n 1 3\n 2 1\n 3 1\n 4 1\n dtype: int64"
215
+ },
216
+ "execution_count" : 49 ,
217
+ "metadata" : {},
218
+ "output_type" : " execute_result"
219
+ }
220
+ ],
221
+ "source" : [
222
+ "df.groupby(\"grade\", observed=False).size()  # explicit observed=False: one group per category, no default-change FutureWarning"
223
+ ]
224
+ },
225
+ {
226
+ "cell_type" : " markdown" ,
227
+ "metadata" : {},
228
+ "source" : [
229
+ " ### Continuous Values"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type" : " code" ,
234
+ "execution_count" : 50 ,
235
+ "metadata" : {},
236
+ "outputs" : [
237
+ {
238
+ "data" : {
239
+ "text/html" : " <div>\n <style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n </style>\n <table border=\" 1\" class=\" dataframe\" >\n <thead>\n <tr style=\" text-align: right;\" >\n <th></th>\n <th>score</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>8</td>\n </tr>\n <tr>\n <th>1</th>\n <td>77</td>\n </tr>\n <tr>\n <th>2</th>\n <td>65</td>\n </tr>\n <tr>\n <th>3</th>\n <td>43</td>\n </tr>\n <tr>\n <th>4</th>\n <td>43</td>\n </tr>\n <tr>\n <th>5</th>\n <td>85</td>\n </tr>\n <tr>\n <th>6</th>\n <td>8</td>\n </tr>\n <tr>\n <th>7</th>\n <td>69</td>\n </tr>\n </tbody>\n </table>\n </div>" ,
240
+ "text/plain" : " score\n 0 8\n 1 77\n 2 65\n 3 43\n 4 43\n 5 85\n 6 8\n 7 69"
241
+ },
242
+ "execution_count" : 50 ,
243
+ "metadata" : {},
244
+ "output_type" : " execute_result"
245
+ }
246
+ ],
247
+ "source" : [
248
+ " df = pd.DataFrame({\r\n " ,
249
+ " \" score\" : np.random.default_rng(42).integers(0, 100, 8)\r\n " ,
250
+ " })\r\n " ,
251
+ " df"
252
+ ]
253
+ },
254
+ {
255
+ "cell_type" : " code" ,
256
+ "execution_count" : 51 ,
257
+ "metadata" : {},
258
+ "outputs" : [
259
+ {
260
+ "data" : {
261
+ "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>score</th>\n <th>grade-nolabels</th>\n <th>grade-labels</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>8</td>\n <td>(0, 20]</td>\n <td>E</td>\n </tr>\n <tr>\n <th>1</th>\n <td>77</td>\n <td>(60, 80]</td>\n <td>B</td>\n </tr>\n <tr>\n <th>2</th>\n <td>65</td>\n <td>(60, 80]</td>\n <td>B</td>\n </tr>\n <tr>\n <th>3</th>\n <td>43</td>\n <td>(40, 60]</td>\n <td>C</td>\n </tr>\n <tr>\n <th>4</th>\n <td>43</td>\n <td>(40, 60]</td>\n <td>C</td>\n </tr>\n <tr>\n <th>5</th>\n <td>85</td>\n <td>(80, 100]</td>\n <td>A</td>\n </tr>\n <tr>\n <th>6</th>\n <td>8</td>\n <td>(0, 20]</td>\n <td>E</td>\n </tr>\n <tr>\n <th>7</th>\n <td>69</td>\n <td>(60, 80]</td>\n <td>B</td>\n </tr>\n </tbody>\n</table>\n</div>",
262
+ "text/plain" : " score grade-nolabels grade-labels\n 0 8 (0, 20] E\n 1 77 (60, 80] B\n 2 65 (60, 80] B\n 3 43 (40, 60] C\n 4 43 (40, 60] C\n 5 85 (80, 100] A\n 6 8 (0, 20] E\n 7 69 (60, 80] B"
263
+ },
264
+ "execution_count" : 51 ,
265
+ "metadata" : {},
266
+ "output_type" : " execute_result"
267
+ }
268
+ ],
269
+ "source" : [
270
+ "labels = list(\"EDCBA\")  # one grade per right-closed score band: (0, 20], (20, 40], (40, 60], (60, 80], (80, 100]; a score of exactly 0 falls outside the first band\r\n",
271
+ "df[\"grade-nolabels\"] = pd.cut(df[\"score\"], bins=range(0, 120, 20))\r\n",
272
+ "df[\"grade-labels\"] = pd.cut(df[\"score\"], bins=range(0, 120, 20), labels=labels)\r\n",
273
+ "df"
274
+ ]
275
+ },
276
+ {
277
+ "cell_type" : " markdown" ,
278
+ "metadata" : {},
279
+ "source" : [
280
+ " # Reference\r\n " ,
281
+ " \r\n " ,
282
+ " - https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html#grouping\r\n " ,
283
+ " - https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html#categoricals\r\n " ,
284
+ " - https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html\r\n " ,
285
+ " - https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html"
286
+ ]
287
+ }
288
+ ],
289
+ "metadata" : {
290
+ "kernelspec" : {
291
+ "display_name" : " Python 3" ,
292
+ "language" : " python" ,
293
+ "name" : " python3"
294
+ },
295
+ "language_info" : {
296
+ "codemirror_mode" : {
297
+ "name" : " ipython" ,
298
+ "version" : 3
299
+ },
300
+ "file_extension" : " .py" ,
301
+ "mimetype" : " text/x-python" ,
302
+ "name" : " python" ,
303
+ "nbconvert_exporter" : " python" ,
304
+ "pygments_lexer" : " ipython3" ,
305
+ "version" : " 3.8.8"
306
+ },
307
+ "orig_nbformat" : 2
308
+ },
309
+ "nbformat" : 4 ,
310
+ "nbformat_minor" : 2
311
+ }
0 commit comments