|
52 | 52 | },
|
53 | 53 | {
|
54 | 54 | "cell_type": "code",
|
55 |
| - "execution_count": null, |
| 55 | + "execution_count": 1, |
56 | 56 | "id": "ac3e8958-5d9c-4e80-9a6f-fd343a3d4dd5",
|
57 | 57 | "metadata": {
|
58 | 58 | "tags": []
|
|
89 | 89 | "id": "2ba80b72-4efc-4369-9acc-525613671e7b",
|
90 | 90 | "metadata": {},
|
91 | 91 | "source": [
|
92 |
| - "Predict average price, avocado dataset (how original). If you ggit cloned repo, is in /data, else go Kaggle" |
| 92 | + "Predict average price, avocado dataset (how original). If you git cloned repo, is in /data, else go Kaggle" |
93 | 93 | ]
|
94 | 94 | },
|
95 | 95 | {
|
|
117 | 117 | },
|
118 | 118 | {
|
119 | 119 | "cell_type": "code",
|
120 |
| - "execution_count": null, |
| 120 | + "execution_count": 2, |
121 | 121 | "id": "888a85f7-5e40-4e90-8a35-3cb1435d1460",
|
122 | 122 | "metadata": {
|
123 | 123 | "tags": []
|
124 | 124 | },
|
125 |
| - "outputs": [], |
| 125 | + "outputs": [ |
| 126 | + { |
| 127 | + "name": "stdout", |
| 128 | + "output_type": "stream", |
| 129 | + "text": [ |
| 130 | + "root\n", |
| 131 | + " |-- _c0: integer (nullable = true)\n", |
| 132 | + " |-- Date: timestamp (nullable = true)\n", |
| 133 | + " |-- AveragePrice: double (nullable = true)\n", |
| 134 | + " |-- Total Volume: double (nullable = true)\n", |
| 135 | + " |-- 4046: double (nullable = true)\n", |
| 136 | + " |-- 4225: double (nullable = true)\n", |
| 137 | + " |-- 4770: double (nullable = true)\n", |
| 138 | + " |-- Total Bags: double (nullable = true)\n", |
| 139 | + " |-- Small Bags: double (nullable = true)\n", |
| 140 | + " |-- Large Bags: double (nullable = true)\n", |
| 141 | + " |-- XLarge Bags: double (nullable = true)\n", |
| 142 | + " |-- type: string (nullable = true)\n", |
| 143 | + " |-- year: integer (nullable = true)\n", |
| 144 | + " |-- region: string (nullable = true)\n", |
| 145 | + "\n", |
| 146 | + "+---+-------------------+------------+------------+-------+---------+-----+----------+----------+----------+-----------+------------+----+------+\n", |
| 147 | + "|_c0| Date|AveragePrice|Total Volume| 4046| 4225| 4770|Total Bags|Small Bags|Large Bags|XLarge Bags| type|year|region|\n", |
| 148 | + "+---+-------------------+------------+------------+-------+---------+-----+----------+----------+----------+-----------+------------+----+------+\n", |
| 149 | + "| 0|2015-12-27 00:00:00| 1.33| 64236.62|1036.74| 54454.85|48.16| 8696.87| 8603.62| 93.25| 0.0|conventional|2015|Albany|\n", |
| 150 | + "| 1|2015-12-20 00:00:00| 1.35| 54876.98| 674.28| 44638.81|58.33| 9505.56| 9408.07| 97.49| 0.0|conventional|2015|Albany|\n", |
| 151 | + "| 2|2015-12-13 00:00:00| 0.93| 118220.22| 794.7|109149.67|130.5| 8145.35| 8042.21| 103.14| 0.0|conventional|2015|Albany|\n", |
| 152 | + "| 3|2015-12-06 00:00:00| 1.08| 78992.15| 1132.0| 71976.41|72.58| 5811.16| 5677.4| 133.76| 0.0|conventional|2015|Albany|\n", |
| 153 | + "+---+-------------------+------------+------------+-------+---------+-----+----------+----------+----------+-----------+------------+----+------+\n", |
| 154 | + "only showing top 4 rows\n", |
| 155 | + "\n" |
| 156 | + ] |
| 157 | + } |
| 158 | + ], |
126 | 159 | "source": [
|
127 | 160 | "# Cache table/dataframe for re-usable table with .cache()\n",
|
128 | 161 | "# caching operation takes place only when a Spark action (count, show, take or write) is also performed on the same dataframe\n",
|
|
148 | 181 | },
|
149 | 182 | {
|
150 | 183 | "cell_type": "code",
|
151 |
| - "execution_count": null, |
| 184 | + "execution_count": 3, |
152 | 185 | "id": "e0068bc2-270c-4e43-beeb-082b404ce297",
|
153 | 186 | "metadata": {
|
154 | 187 | "tags": []
|
|
195 | 228 | },
|
196 | 229 | {
|
197 | 230 | "cell_type": "code",
|
198 |
| - "execution_count": null, |
| 231 | + "execution_count": 4, |
199 | 232 | "id": "ea5b4865-062b-491a-bf10-1242d46d358c",
|
200 | 233 | "metadata": {},
|
201 |
| - "outputs": [], |
| 234 | + "outputs": [ |
| 235 | + { |
| 236 | + "name": "stdout", |
| 237 | + "output_type": "stream", |
| 238 | + "text": [ |
| 239 | + "+------------+-----------+----------+-----------+----------+----------+-----------+------------+----+------+----------+-----+\n", |
| 240 | + "|AveragePrice|Medium Size|Large Size|XLarge Size|Small Bags|Large Bags|XLarge Bags| type|year|region|Year Index|Month|\n", |
| 241 | + "+------------+-----------+----------+-----------+----------+----------+-----------+------------+----+------+----------+-----+\n", |
| 242 | + "| 1.33| 1036.74| 54454.85| 48.16| 8603.62| 93.25| 0.0|conventional|2015|Albany| 15| 12|\n", |
| 243 | + "| 1.35| 674.28| 44638.81| 58.33| 9408.07| 97.49| 0.0|conventional|2015|Albany| 15| 12|\n", |
| 244 | + "| 0.93| 794.7| 109149.67| 130.5| 8042.21| 103.14| 0.0|conventional|2015|Albany| 15| 12|\n", |
| 245 | + "| 1.08| 1132.0| 71976.41| 72.58| 5677.4| 133.76| 0.0|conventional|2015|Albany| 15| 12|\n", |
| 246 | + "+------------+-----------+----------+-----------+----------+----------+-----------+------------+----+------+----------+-----+\n", |
| 247 | + "only showing top 4 rows\n", |
| 248 | + "\n" |
| 249 | + ] |
| 250 | + } |
| 251 | + ], |
202 | 252 | "source": [
|
203 | 253 | "# convert 'year' yyyy to yy (yyyy - 2000, since we have 2015-2018 values)\n",
|
204 | 254 | "df = df.withColumn('Year Index', col('Year') - 2000)\n",
|
|
228 | 278 | },
|
229 | 279 | {
|
230 | 280 | "cell_type": "code",
|
231 |
| - "execution_count": null, |
| 281 | + "execution_count": 5, |
232 | 282 | "id": "382272ea-07aa-43a4-af0f-681b332af34d",
|
233 | 283 | "metadata": {},
|
234 | 284 | "outputs": [],
|
|
288 | 338 | },
|
289 | 339 | {
|
290 | 340 | "cell_type": "code",
|
291 |
| - "execution_count": null, |
| 341 | + "execution_count": 6, |
292 | 342 | "id": "ae2ebec7-8379-45bd-b375-faac5c64824c",
|
293 | 343 | "metadata": {
|
294 | 344 | "tags": []
|
295 | 345 | },
|
296 |
| - "outputs": [], |
| 346 | + "outputs": [ |
| 347 | + { |
| 348 | + "data": { |
| 349 | + "text/plain": [ |
| 350 | + "0.18867989356762913" |
| 351 | + ] |
| 352 | + }, |
| 353 | + "execution_count": 6, |
| 354 | + "metadata": {}, |
| 355 | + "output_type": "execute_result" |
| 356 | + } |
| 357 | + ], |
297 | 358 | "source": [
|
298 | 359 | "from pyspark.ml.evaluation import RegressionEvaluator\n",
|
299 | 360 | "\n",
|
|
319 | 380 | "source": [
|
320 | 381 | "For reference, original article, using Linear regression + cv/gridSearch : rmse of .28"
|
321 | 382 | ]
|
| 383 | + }, |
| 384 | + { |
| 385 | + "cell_type": "code", |
| 386 | + "execution_count": null, |
| 387 | + "id": "311fe041-88b7-4e7f-a495-46dcc846bc50", |
| 388 | + "metadata": {}, |
| 389 | + "outputs": [], |
| 390 | + "source": [] |
322 | 391 | }
|
323 | 392 | ],
|
324 | 393 | "metadata": {
|
|
0 commit comments