52 | 52 | },
53 | 53 | {
54 | 54 | "cell_type": "code",
55 |    | - "execution_count": 7,
   | 55 | + "execution_count": null,
56 | 56 | "id": "ac3e8958-5d9c-4e80-9a6f-fd343a3d4dd5",
57 | 57 | "metadata": {
58 | 58 | "tags": []

89 | 89 | "id": "2ba80b72-4efc-4369-9acc-525613671e7b",
90 | 90 | "metadata": {},
91 | 91 | "source": [
92 |    | - "On Avocado dataset (how original). If you cloned git repo, is in /data, else go Kaggle"
   | 92 | + "Predict the average price on the avocado dataset (how original). If you git cloned the repo, the data is in /data; otherwise get it from Kaggle"
93 | 93 | ]
94 | 94 | },
95 | 95 | {

108 | 108 | },
109 | 109 | "source": [
110 | 110 | "*Quick desc / scope of dataset:* \n",
111 |     | - "No EDA, this exercise have been made a million times\n",
    | 111 | + "No EDA, this exercise has been done a million times \n",
112 | 112 | "Years 2015 to 2018 \n",
113 | 113 | "Two avocado types: organic or conventional \n",
114 | 114 | "Region = region of consumption \n",

117 | 117 | },
118 | 118 | {
119 | 119 | "cell_type": "code",
120 |     | - "execution_count": 8,
    | 120 | + "execution_count": null,
121 | 121 | "id": "888a85f7-5e40-4e90-8a35-3cb1435d1460",
122 | 122 | "metadata": {
123 | 123 | "tags": []
124 | 124 | },
125 |     | - "outputs": [
126 |     | - {
127 |     | - "name": "stdout",
128 |     | - "output_type": "stream",
129 |     | - "text": [
130 |     | - "root\n",
131 |     | - " |-- _c0: integer (nullable = true)\n",
132 |     | - " |-- Date: timestamp (nullable = true)\n",
133 |     | - " |-- AveragePrice: double (nullable = true)\n",
134 |     | - " |-- Total Volume: double (nullable = true)\n",
135 |     | - " |-- 4046: double (nullable = true)\n",
136 |     | - " |-- 4225: double (nullable = true)\n",
137 |     | - " |-- 4770: double (nullable = true)\n",
138 |     | - " |-- Total Bags: double (nullable = true)\n",
139 |     | - " |-- Small Bags: double (nullable = true)\n",
140 |     | - " |-- Large Bags: double (nullable = true)\n",
141 |     | - " |-- XLarge Bags: double (nullable = true)\n",
142 |     | - " |-- type: string (nullable = true)\n",
143 |     | - " |-- year: integer (nullable = true)\n",
144 |     | - " |-- region: string (nullable = true)\n",
145 |     | - "\n",
146 |     | - "+---+-------------------+------------+------------+-------+---------+-----+----------+----------+----------+-----------+------------+----+------+\n",
147 |     | - "|_c0| Date|AveragePrice|Total Volume| 4046| 4225| 4770|Total Bags|Small Bags|Large Bags|XLarge Bags| type|year|region|\n",
148 |     | - "+---+-------------------+------------+------------+-------+---------+-----+----------+----------+----------+-----------+------------+----+------+\n",
149 |     | - "| 0|2015-12-27 00:00:00| 1.33| 64236.62|1036.74| 54454.85|48.16| 8696.87| 8603.62| 93.25| 0.0|conventional|2015|Albany|\n",
150 |     | - "| 1|2015-12-20 00:00:00| 1.35| 54876.98| 674.28| 44638.81|58.33| 9505.56| 9408.07| 97.49| 0.0|conventional|2015|Albany|\n",
151 |     | - "| 2|2015-12-13 00:00:00| 0.93| 118220.22| 794.7|109149.67|130.5| 8145.35| 8042.21| 103.14| 0.0|conventional|2015|Albany|\n",
152 |     | - "| 3|2015-12-06 00:00:00| 1.08| 78992.15| 1132.0| 71976.41|72.58| 5811.16| 5677.4| 133.76| 0.0|conventional|2015|Albany|\n",
153 |     | - "+---+-------------------+------------+------------+-------+---------+-----+----------+----------+----------+-----------+------------+----+------+\n",
154 |     | - "only showing top 4 rows\n",
155 |     | - "\n"
156 |     | - ]
157 |     | - }
158 |     | - ],
    | 125 | + "outputs": [],
159 | 126 | "source": [
160 | 127 | "# Cache the table/dataframe for reuse with .cache()\n",
161 | 128 | "# caching takes effect only when a Spark action (count, show, take or write) is performed on the same dataframe\n",

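The two comments above describe lazy caching; as a hedged aside (not part of the diff), a minimal sketch, assuming a SparkSession named `spark` and the repo's /data CSV:

```python
# Minimal sketch of lazy caching; the read path and action choice are assumptions
df = spark.read.csv('data/avocado.csv', header=True, inferSchema=True)
df.cache()   # only marks the dataframe for caching; nothing is computed yet
df.count()   # first action: materializes the dataframe and populates the cache
df.show(4)   # subsequent actions reuse the cached data instead of re-reading the CSV
```
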
181 | 148 | },
182 | 149 | {
183 | 150 | "cell_type": "code",
184 |     | - "execution_count": 9,
    | 151 | + "execution_count": null,
185 | 152 | "id": "e0068bc2-270c-4e43-beeb-082b404ce297",
186 | 153 | "metadata": {
187 | 154 | "tags": []

201 | 168 | "id": "b840a5b1-8bd7-4c73-a8c9-133e4983e8dd",
202 | 169 | "metadata": {},
203 | 170 | "source": [
204 |     | - "- Steps differs a bit from sklearn. Search for 'transformers' and 'estimators'\n",
    | 171 | + "- Steps differ a bit from sklearn. Search for Spark 'transformers' and 'estimators'\n",
205 | 172 | "- No EDA, has been done a million times on this dataset. \n",
206 | 173 | "- Format data \n",
207 |     | - "-Feature creation from 'Date' : yy and mm \n",
208 |     | - "-Drop columns : Total Bags, Total Volume (strong corr with respective subcategories) ; could also be done in pipeline tho ?\n",
209 |     | - "- Pipeline (encode etc...) \n",
210 |     | - "-One hot encoding categorical 'region' (before that, use StringIndexer) \n",
211 |     | - "-Drop transformed columns: Date, region. Note : unlike scikit-learn col transf, pyspark adds new col when transforming \n",
212 |     | - "- Consolidate all remaining features in a single vector using VectorAssembler\n",
213 |     | - "- Scale numerical features using StandardScaler <- would be earlier in a sklearn pipeline\n",
214 |     | - "- Predict"
    | 174 | + "- Feature creation from 'Date' & 'Year': yy and mm \n",
    | 175 | + "- Optional: drop columns Total Bags and Total Volume (strong corr with their respective subcategories) \n",
    | 176 | + "- Build the Pipeline (encode etc.) \n",
    | 177 | + "- StringIndexer to convert categorical columns into category indices \n",
    | 178 | + "- One-hot encode the categorical 'region' \n",
    | 179 | + "- VectorAssembler to consolidate the encoded features into a single vector \n",
    | 180 | + "- StandardScaler on the features vector <- would come earlier in a sklearn pipeline \n",
    | 181 | + "- Define the regressor (here, random forest) \n",
    | 182 | + "- Build the Pipeline() \n",
    | 183 | + "- Simple model, no CV / param search (see the sketch after this cell)"
215 | 184 | ]
216 | 185 | },
217 | 186 | {

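A minimal sketch of the pipeline the list above describes, assuming the dataset's column names ('region', 'type', 'Year Index', 'Month', bag columns) and a `train` split from an earlier randomSplit; stage names and the feature list are assumptions, not the notebook's exact code:

```python
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.regression import RandomForestRegressor

# index the string columns, then one-hot encode 'region' (Spark 3.x OneHotEncoder API)
region_idx = StringIndexer(inputCol='region', outputCol='region_idx')
type_idx = StringIndexer(inputCol='type', outputCol='type_idx')
region_ohe = OneHotEncoder(inputCols=['region_idx'], outputCols=['region_ohe'])

# consolidate encoded + numeric features into one vector, then scale it
assembler = VectorAssembler(
    inputCols=['region_ohe', 'type_idx', 'Year Index', 'Month',
               'Small Bags', 'Large Bags', 'XLarge Bags'],
    outputCol='features_raw')
scaler = StandardScaler(inputCol='features_raw', outputCol='features')

rf = RandomForestRegressor(featuresCol='features', labelCol='AveragePrice')

pipeline = Pipeline(stages=[region_idx, type_idx, region_ohe, assembler, scaler, rf])
model = pipeline.fit(train)  # 'train' assumed from an earlier randomSplit
```

Note the design difference flagged in the list: in PySpark each stage adds a new column rather than replacing one, so scaling naturally lands after assembly, whereas a sklearn pipeline would scale the raw columns first.
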
226 | 195 | },
227 | 196 | {
228 | 197 | "cell_type": "code",
229 |     | - "execution_count": 10,
    | 198 | + "execution_count": null,
230 | 199 | "id": "ea5b4865-062b-491a-bf10-1242d46d358c",
231 | 200 | "metadata": {},
232 |     | - "outputs": [
233 |     | - {
234 |     | - "name": "stdout",
235 |     | - "output_type": "stream",
236 |     | - "text": [
237 |     | - "+------------+-----------+----------+-----------+----------+----------+-----------+------------+----+------+----------+-----+\n",
238 |     | - "|AveragePrice|Medium Size|Large Size|XLarge Size|Small Bags|Large Bags|XLarge Bags| type|year|region|Year Index|Month|\n",
239 |     | - "+------------+-----------+----------+-----------+----------+----------+-----------+------------+----+------+----------+-----+\n",
240 |     | - "| 1.33| 1036.74| 54454.85| 48.16| 8603.62| 93.25| 0.0|conventional|2015|Albany| 15| 12|\n",
241 |     | - "| 1.35| 674.28| 44638.81| 58.33| 9408.07| 97.49| 0.0|conventional|2015|Albany| 15| 12|\n",
242 |     | - "| 0.93| 794.7| 109149.67| 130.5| 8042.21| 103.14| 0.0|conventional|2015|Albany| 15| 12|\n",
243 |     | - "| 1.08| 1132.0| 71976.41| 72.58| 5677.4| 133.76| 0.0|conventional|2015|Albany| 15| 12|\n",
244 |     | - "+------------+-----------+----------+-----------+----------+----------+-----------+------------+----+------+----------+-----+\n",
245 |     | - "only showing top 4 rows\n",
246 |     | - "\n"
247 |     | - ]
248 |     | - }
249 |     | - ],
    | 201 | + "outputs": [],
250 | 202 | "source": [
251 | 203 | "# convert 'year' yyyy to yy (yyyy - 2000, since we have 2015-2018 values)\n",
252 | 204 | "df = df.withColumn('Year Index', col('Year') - 2000)\n",

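The companion 'Month' feature (visible in the dataframe printout above) would come from pyspark.sql.functions.month; a hedged one-liner, assuming the same `df`:

```python
from pyspark.sql.functions import col, month

# derive a numeric 'Month' feature from the 'Date' timestamp column
df = df.withColumn('Month', month(col('Date')))
```
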
276 | 228 | },
277 | 229 | {
278 | 230 | "cell_type": "code",
279 |     | - "execution_count": 14,
    | 231 | + "execution_count": null,
280 | 232 | "id": "382272ea-07aa-43a4-af0f-681b332af34d",
281 | 233 | "metadata": {},
282 | 234 | "outputs": [],

330 | 282 | "id": "c3332499-66a1-4f79-be00-bcefcbda212a",
331 | 283 | "metadata": {},
332 | 284 | "source": [
333 |     | - "Crude attempt, no cv, some default rf parameters. \n",
    | 285 | + "Crude attempt: no CV, some arbitrary random forest parameters. \n",
334 | 286 | "For parameter tuning, look up pyspark.ml.tuning / CrossValidator, ParamGridBuilder. Not used here (a sketch follows below)"
335 | 287 | ]
336 | 288 | },
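A hedged sketch of the tuning route the cell above points to; the grid values and the `pipeline`, `rf`, and `train` names are assumptions carried over from the earlier sketch, not the notebook's code:

```python
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# hypothetical grid over two random forest parameters
grid = (ParamGridBuilder()
        .addGrid(rf.numTrees, [20, 50])
        .addGrid(rf.maxDepth, [5, 10])
        .build())

cv = CrossValidator(estimator=pipeline,  # full pipeline from the earlier sketch
                    estimatorParamMaps=grid,
                    evaluator=RegressionEvaluator(labelCol='AveragePrice',
                                                  predictionCol='prediction',
                                                  metricName='rmse'),
                    numFolds=3)
cv_model = cv.fit(train)  # best fitted pipeline: cv_model.bestModel
```
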
337 | 289 | {
338 | 290 | "cell_type": "code",
339 |     | - "execution_count": 18,
    | 291 | + "execution_count": null,
340 | 292 | "id": "ae2ebec7-8379-45bd-b375-faac5c64824c",
341 | 293 | "metadata": {
342 | 294 | "tags": []
343 | 295 | },
344 |     | - "outputs": [
345 |     | - {
346 |     | - "data": {
347 |     | - "text/plain": [
348 |     | - "0.1975694758480664"
349 |     | - ]
350 |     | - },
351 |     | - "execution_count": 18,
352 |     | - "metadata": {},
353 |     | - "output_type": "execute_result"
354 |     | - }
355 |     | - ],
    | 296 | + "outputs": [],
356 | 297 | "source": [
357 | 298 | "from pyspark.ml.evaluation import RegressionEvaluator\n",
358 | 299 | "\n",

365 | 306 | "\n",
366 | 307 | "# apply the model to the test set\n",
367 | 308 | "prediction = model.transform(test)\n",
368 |     | - "eval = RegressionEvaluator(predictionCol='prediction',\n",
    | 309 | + "eval_ = RegressionEvaluator(predictionCol='prediction',\n",
369 | 310 | " labelCol='AveragePrice', metricName='rmse')\n",
370 | 311 | "\n",
371 |     | - "eval.evaluate(prediction)"
    | 312 | + "eval_.evaluate(prediction)"
372 | 313 | ]
373 | 314 | },
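As an aside (not in the diff), the same evaluator instance can report other metrics by overriding metricName per call; a hedged usage sketch reusing the cell's `eval_` and `prediction`:

```python
# reuse eval_ with other metrics via the params map
mae = eval_.evaluate(prediction, {eval_.metricName: 'mae'})
r2 = eval_.evaluate(prediction, {eval_.metricName: 'r2'})
```
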
374 | 315 | {
375 | 316 | "cell_type": "markdown",
376 | 317 | "id": "5a769698-04bc-4eda-9edc-63a4bfd11d25",
377 | 318 | "metadata": {},
378 | 319 | "source": [
379 |     | - "For reference, original article, using Linear regression + cv : rmse of .28"
    | 320 | + "For reference, the original article, using linear regression + CV/grid search, reports an RMSE of .28"
380 | 321 | ]
381 | 322 | },
382 | 323 | ],