Skip to content

Commit fbf242f

Browse files
committed
re run and small changes for clarity
1 parent b2ac2c4 commit fbf242f

File tree

1 file changed

+79
-10
lines changed

1 file changed

+79
-10
lines changed

spark-cluster.ipynb

Lines changed: 79 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
},
5353
{
5454
"cell_type": "code",
55-
"execution_count": null,
55+
"execution_count": 1,
5656
"id": "ac3e8958-5d9c-4e80-9a6f-fd343a3d4dd5",
5757
"metadata": {
5858
"tags": []
@@ -89,7 +89,7 @@
8989
"id": "2ba80b72-4efc-4369-9acc-525613671e7b",
9090
"metadata": {},
9191
"source": [
92-
"Predict average price, avocado dataset (how original). If you ggit cloned repo, is in /data, else go Kaggle"
92+
"Predict average price, avocado dataset (how original). If you git cloned repo, is in /data, else go Kaggle"
9393
]
9494
},
9595
{
@@ -117,12 +117,45 @@
117117
},
118118
{
119119
"cell_type": "code",
120-
"execution_count": null,
120+
"execution_count": 2,
121121
"id": "888a85f7-5e40-4e90-8a35-3cb1435d1460",
122122
"metadata": {
123123
"tags": []
124124
},
125-
"outputs": [],
125+
"outputs": [
126+
{
127+
"name": "stdout",
128+
"output_type": "stream",
129+
"text": [
130+
"root\n",
131+
" |-- _c0: integer (nullable = true)\n",
132+
" |-- Date: timestamp (nullable = true)\n",
133+
" |-- AveragePrice: double (nullable = true)\n",
134+
" |-- Total Volume: double (nullable = true)\n",
135+
" |-- 4046: double (nullable = true)\n",
136+
" |-- 4225: double (nullable = true)\n",
137+
" |-- 4770: double (nullable = true)\n",
138+
" |-- Total Bags: double (nullable = true)\n",
139+
" |-- Small Bags: double (nullable = true)\n",
140+
" |-- Large Bags: double (nullable = true)\n",
141+
" |-- XLarge Bags: double (nullable = true)\n",
142+
" |-- type: string (nullable = true)\n",
143+
" |-- year: integer (nullable = true)\n",
144+
" |-- region: string (nullable = true)\n",
145+
"\n",
146+
"+---+-------------------+------------+------------+-------+---------+-----+----------+----------+----------+-----------+------------+----+------+\n",
147+
"|_c0| Date|AveragePrice|Total Volume| 4046| 4225| 4770|Total Bags|Small Bags|Large Bags|XLarge Bags| type|year|region|\n",
148+
"+---+-------------------+------------+------------+-------+---------+-----+----------+----------+----------+-----------+------------+----+------+\n",
149+
"| 0|2015-12-27 00:00:00| 1.33| 64236.62|1036.74| 54454.85|48.16| 8696.87| 8603.62| 93.25| 0.0|conventional|2015|Albany|\n",
150+
"| 1|2015-12-20 00:00:00| 1.35| 54876.98| 674.28| 44638.81|58.33| 9505.56| 9408.07| 97.49| 0.0|conventional|2015|Albany|\n",
151+
"| 2|2015-12-13 00:00:00| 0.93| 118220.22| 794.7|109149.67|130.5| 8145.35| 8042.21| 103.14| 0.0|conventional|2015|Albany|\n",
152+
"| 3|2015-12-06 00:00:00| 1.08| 78992.15| 1132.0| 71976.41|72.58| 5811.16| 5677.4| 133.76| 0.0|conventional|2015|Albany|\n",
153+
"+---+-------------------+------------+------------+-------+---------+-----+----------+----------+----------+-----------+------------+----+------+\n",
154+
"only showing top 4 rows\n",
155+
"\n"
156+
]
157+
}
158+
],
126159
"source": [
127160
"# Cache table/dataframe for re-usable table with .cache()\n",
128161
"# caching operation takes place only when a Spark action (count, show, take or write) is also performed on the same dataframe\n",
@@ -148,7 +181,7 @@
148181
},
149182
{
150183
"cell_type": "code",
151-
"execution_count": null,
184+
"execution_count": 3,
152185
"id": "e0068bc2-270c-4e43-beeb-082b404ce297",
153186
"metadata": {
154187
"tags": []
@@ -195,10 +228,27 @@
195228
},
196229
{
197230
"cell_type": "code",
198-
"execution_count": null,
231+
"execution_count": 4,
199232
"id": "ea5b4865-062b-491a-bf10-1242d46d358c",
200233
"metadata": {},
201-
"outputs": [],
234+
"outputs": [
235+
{
236+
"name": "stdout",
237+
"output_type": "stream",
238+
"text": [
239+
"+------------+-----------+----------+-----------+----------+----------+-----------+------------+----+------+----------+-----+\n",
240+
"|AveragePrice|Medium Size|Large Size|XLarge Size|Small Bags|Large Bags|XLarge Bags| type|year|region|Year Index|Month|\n",
241+
"+------------+-----------+----------+-----------+----------+----------+-----------+------------+----+------+----------+-----+\n",
242+
"| 1.33| 1036.74| 54454.85| 48.16| 8603.62| 93.25| 0.0|conventional|2015|Albany| 15| 12|\n",
243+
"| 1.35| 674.28| 44638.81| 58.33| 9408.07| 97.49| 0.0|conventional|2015|Albany| 15| 12|\n",
244+
"| 0.93| 794.7| 109149.67| 130.5| 8042.21| 103.14| 0.0|conventional|2015|Albany| 15| 12|\n",
245+
"| 1.08| 1132.0| 71976.41| 72.58| 5677.4| 133.76| 0.0|conventional|2015|Albany| 15| 12|\n",
246+
"+------------+-----------+----------+-----------+----------+----------+-----------+------------+----+------+----------+-----+\n",
247+
"only showing top 4 rows\n",
248+
"\n"
249+
]
250+
}
251+
],
202252
"source": [
203253
"# convert 'year' yyyy to yy (yyyy - 2000, since we have 2015-2018 values)\n",
204254
"df = df.withColumn('Year Index', col('Year') - 2000)\n",
@@ -228,7 +278,7 @@
228278
},
229279
{
230280
"cell_type": "code",
231-
"execution_count": null,
281+
"execution_count": 5,
232282
"id": "382272ea-07aa-43a4-af0f-681b332af34d",
233283
"metadata": {},
234284
"outputs": [],
@@ -288,12 +338,23 @@
288338
},
289339
{
290340
"cell_type": "code",
291-
"execution_count": null,
341+
"execution_count": 6,
292342
"id": "ae2ebec7-8379-45bd-b375-faac5c64824c",
293343
"metadata": {
294344
"tags": []
295345
},
296-
"outputs": [],
346+
"outputs": [
347+
{
348+
"data": {
349+
"text/plain": [
350+
"0.18867989356762913"
351+
]
352+
},
353+
"execution_count": 6,
354+
"metadata": {},
355+
"output_type": "execute_result"
356+
}
357+
],
297358
"source": [
298359
"from pyspark.ml.evaluation import RegressionEvaluator\n",
299360
"\n",
@@ -319,6 +380,14 @@
319380
"source": [
320381
"For reference, original article, using Linear regression + cv/gridSearch : rmse of .28"
321382
]
383+
},
384+
{
385+
"cell_type": "code",
386+
"execution_count": null,
387+
"id": "311fe041-88b7-4e7f-a495-46dcc846bc50",
388+
"metadata": {},
389+
"outputs": [],
390+
"source": []
322391
}
323392
],
324393
"metadata": {

0 commit comments

Comments
 (0)