|
2 | 2 | "cells": [
|
3 | 3 | {
|
4 | 4 | "cell_type": "markdown",
|
5 |
| - "source": [], |
| 5 | + "source": [ |
| 6 | + "# Analyzing Data\r\n", |
| 7 | + "Examples of the Pandas functions mentioned in the [lesson](README.md)." |
| 8 | + ], |
6 | 9 | "metadata": {}
|
7 | 10 | },
|
8 | 11 | {
|
9 | 12 | "cell_type": "code",
|
10 |
| - "execution_count": null, |
| 13 | + "execution_count": 1, |
11 | 14 | "source": [
|
12 | 15 | "import pandas as pd\r\n",
|
13 | 16 | "import glob\r\n",
|
|
21 | 24 | },
|
22 | 25 | {
|
23 | 26 | "cell_type": "code",
|
24 |
| - "execution_count": null, |
| 27 | + "execution_count": 2, |
25 | 28 | "source": [
|
26 | 29 | "# Using Describe on the email dataset\r\n",
|
27 | 30 | "print(email_df.describe())"
|
28 | 31 | ],
|
29 |
| - "outputs": [], |
| 32 | + "outputs": [ |
| 33 | + { |
| 34 | + "output_type": "stream", |
| 35 | + "name": "stdout", |
| 36 | + "text": [ |
| 37 | + " the to ect and for of \\\n", |
| 38 | + "count 406.000000 406.000000 406.000000 406.000000 406.000000 406.000000 \n", |
| 39 | + "mean 7.022167 6.519704 4.948276 3.059113 3.502463 2.662562 \n", |
| 40 | + "std 10.945522 9.801907 9.293820 6.267806 4.901372 5.443939 \n", |
| 41 | + "min 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 \n", |
| 42 | + "25% 1.000000 1.000000 1.000000 0.000000 1.000000 0.000000 \n", |
| 43 | + "50% 3.000000 3.000000 2.000000 1.000000 2.000000 1.000000 \n", |
| 44 | + "75% 9.000000 7.750000 4.000000 3.000000 4.750000 3.000000 \n", |
| 45 | + "max 99.000000 88.000000 79.000000 69.000000 39.000000 57.000000 \n", |
| 46 | + "\n", |
| 47 | + " a you in on is this \\\n", |
| 48 | + "count 406.000000 406.000000 406.000000 406.000000 406.000000 406.000000 \n", |
| 49 | + "mean 57.017241 2.394089 10.817734 11.591133 5.901478 1.485222 \n", |
| 50 | + "std 78.868243 4.067015 19.050972 16.407175 8.793103 2.912473 \n", |
| 51 | + "min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", |
| 52 | + "25% 15.000000 0.000000 1.250000 3.000000 1.000000 0.000000 \n", |
| 53 | + "50% 29.000000 1.000000 5.000000 6.000000 3.000000 0.000000 \n", |
| 54 | + "75% 61.000000 3.000000 12.000000 13.000000 7.000000 2.000000 \n", |
| 55 | + "max 843.000000 31.000000 223.000000 125.000000 61.000000 24.000000 \n", |
| 56 | + "\n", |
| 57 | + " i be that will \n", |
| 58 | + "count 406.000000 406.000000 406.000000 406.000000 \n", |
| 59 | + "mean 47.155172 2.950739 1.034483 0.955665 \n", |
| 60 | + "std 71.043009 4.297865 1.904846 2.042271 \n", |
| 61 | + "min 0.000000 0.000000 0.000000 0.000000 \n", |
| 62 | + "25% 11.000000 1.000000 0.000000 0.000000 \n", |
| 63 | + "50% 24.000000 1.000000 0.000000 0.000000 \n", |
| 64 | + "75% 50.750000 3.000000 1.000000 1.000000 \n", |
| 65 | + "max 754.000000 40.000000 14.000000 24.000000 \n" |
| 66 | + ] |
| 67 | + } |
| 68 | + ], |
30 | 69 | "metadata": {}
|
31 | 70 | },
|
32 | 71 | {
|
33 |
| - "cell_type": "markdown", |
34 |
| - "source": [], |
| 72 | + "cell_type": "code", |
| 73 | + "execution_count": 5, |
| 74 | + "source": [ |
| 75 | + "# Sampling 10 emails\r\n", |
| 76 | + "print(email_df.sample(10))" |
| 77 | + ], |
| 78 | + "outputs": [ |
| 79 | + { |
| 80 | + "output_type": "stream", |
| 81 | + "name": "stdout", |
| 82 | + "text": [ |
| 83 | + " Email No. the to ect and for of a you in on is this i \\\n", |
| 84 | + "150 Email 151 0 1 2 0 3 0 15 0 0 5 0 0 7 \n", |
| 85 | + "380 Email 5147 0 3 2 0 0 0 7 0 1 1 0 0 3 \n", |
| 86 | + "19 Email 20 3 4 11 0 4 2 32 1 1 3 9 5 25 \n", |
| 87 | + "300 Email 301 2 1 1 0 1 1 15 2 2 3 2 0 8 \n", |
| 88 | + "307 Email 308 0 0 1 0 0 0 1 0 1 0 0 0 2 \n", |
| 89 | + "167 Email 168 2 2 2 1 5 1 24 2 5 6 4 0 30 \n", |
| 90 | + "320 Email 321 10 12 4 6 8 6 187 5 26 28 23 2 171 \n", |
| 91 | + "61 Email 62 0 1 1 0 4 1 15 4 4 3 3 0 19 \n", |
| 92 | + "26 Email 27 5 4 1 1 4 4 51 0 8 6 6 2 44 \n", |
| 93 | + "73 Email 74 0 0 1 0 0 0 7 0 4 3 0 0 6 \n", |
| 94 | + "\n", |
| 95 | + " be that will \n", |
| 96 | + "150 1 0 0 \n", |
| 97 | + "380 0 0 0 \n", |
| 98 | + "19 3 0 1 \n", |
| 99 | + "300 0 0 0 \n", |
| 100 | + "307 0 0 0 \n", |
| 101 | + "167 2 0 0 \n", |
| 102 | + "320 5 1 1 \n", |
| 103 | + "61 2 0 0 \n", |
| 104 | + "26 6 0 0 \n", |
| 105 | + "73 0 0 0 \n" |
| 106 | + ] |
| 107 | + } |
| 108 | + ], |
| 109 | + "metadata": {} |
| 110 | + }, |
| 111 | + { |
| 112 | + "cell_type": "code", |
| 113 | + "execution_count": 14, |
| 114 | + "source": [ |
| 115 | + "# Returns rows where there are more occurrences of \"to\" than \"the\"\r\n", |
| 116 | + "print(email_df.query('the < to'))" |
| 117 | + ], |
| 118 | + "outputs": [ |
| 119 | + { |
| 120 | + "output_type": "stream", |
| 121 | + "name": "stdout", |
| 122 | + "text": [ |
| 123 | + " Email No. the to ect and for of a you in on is this i \\\n", |
| 124 | + "1 Email 2 8 13 24 6 6 2 102 1 18 21 13 0 61 \n", |
| 125 | + "3 Email 4 0 5 22 0 5 1 51 2 1 5 9 2 16 \n", |
| 126 | + "5 Email 6 4 5 1 4 2 3 45 1 16 12 8 1 52 \n", |
| 127 | + "7 Email 8 0 2 2 3 1 2 21 6 2 6 2 0 28 \n", |
| 128 | + "13 Email 14 4 5 7 1 5 1 37 1 8 8 6 1 43 \n", |
| 129 | + ".. ... ... .. ... ... ... .. ... ... .. .. .. ... .. \n", |
| 130 | + "390 Email 5157 4 13 1 0 3 1 48 2 8 26 9 1 45 \n", |
| 131 | + "393 Email 5160 2 13 1 0 2 1 38 2 7 24 6 1 34 \n", |
| 132 | + "396 Email 5163 2 3 1 2 1 2 32 0 7 3 2 0 26 \n", |
| 133 | + "404 Email 5171 2 7 1 0 2 1 28 2 8 11 7 1 39 \n", |
| 134 | + "405 Email 5172 22 24 5 1 6 5 148 8 23 13 5 4 99 \n", |
| 135 | + "\n", |
| 136 | + " be that will \n", |
| 137 | + "1 4 2 0 \n", |
| 138 | + "3 2 0 0 \n", |
| 139 | + "5 2 0 0 \n", |
| 140 | + "7 1 0 1 \n", |
| 141 | + "13 1 0 1 \n", |
| 142 | + ".. .. ... ... \n", |
| 143 | + "390 1 0 0 \n", |
| 144 | + "393 1 0 0 \n", |
| 145 | + "396 3 0 0 \n", |
| 146 | + "404 1 0 0 \n", |
| 147 | + "405 6 4 1 \n", |
| 148 | + "\n", |
| 149 | + "[169 rows x 17 columns]\n" |
| 150 | + ] |
| 151 | + } |
| 152 | + ], |
35 | 153 | "metadata": {}
|
36 | 154 | }
|
37 | 155 | ],
|
|
0 commit comments