Re-run notebook (populate cell outputs) and make small wording changes for clarity

matthieuvion · matthieuvion · commit fbf242f2d624 · 2023-07-10T15:48:53.000+02:00
diff --git a/spark-cluster.ipynb b/spark-cluster.ipynb
@@ -52,7 +52,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "ac3e8958-5d9c-4e80-9a6f-fd343a3d4dd5",
    "metadata": {
     "tags": []
@@ -89,7 +89,7 @@
    "id": "2ba80b72-4efc-4369-9acc-525613671e7b",
    "metadata": {},
    "source": [
-    "Predict average price, avocado dataset (how original). If you ggit cloned repo, is in /data, else go Kaggle"
+    "Predict the average price on the avocado dataset (how original). If you git-cloned the repo, the data is in /data; otherwise, download it from Kaggle."
    ]
   },
   {
@@ -117,12 +117,45 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "888a85f7-5e40-4e90-8a35-3cb1435d1460",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "root\n",
+      " |-- _c0: integer (nullable = true)\n",
+      " |-- Date: timestamp (nullable = true)\n",
+      " |-- AveragePrice: double (nullable = true)\n",
+      " |-- Total Volume: double (nullable = true)\n",
+      " |-- 4046: double (nullable = true)\n",
+      " |-- 4225: double (nullable = true)\n",
+      " |-- 4770: double (nullable = true)\n",
+      " |-- Total Bags: double (nullable = true)\n",
+      " |-- Small Bags: double (nullable = true)\n",
+      " |-- Large Bags: double (nullable = true)\n",
+      " |-- XLarge Bags: double (nullable = true)\n",
+      " |-- type: string (nullable = true)\n",
+      " |-- year: integer (nullable = true)\n",
+      " |-- region: string (nullable = true)\n",
+      "\n",
+      "+---+-------------------+------------+------------+-------+---------+-----+----------+----------+----------+-----------+------------+----+------+\n",
+      "|_c0|               Date|AveragePrice|Total Volume|   4046|     4225| 4770|Total Bags|Small Bags|Large Bags|XLarge Bags|        type|year|region|\n",
+      "+---+-------------------+------------+------------+-------+---------+-----+----------+----------+----------+-----------+------------+----+------+\n",
+      "|  0|2015-12-27 00:00:00|        1.33|    64236.62|1036.74| 54454.85|48.16|   8696.87|   8603.62|     93.25|        0.0|conventional|2015|Albany|\n",
+      "|  1|2015-12-20 00:00:00|        1.35|    54876.98| 674.28| 44638.81|58.33|   9505.56|   9408.07|     97.49|        0.0|conventional|2015|Albany|\n",
+      "|  2|2015-12-13 00:00:00|        0.93|   118220.22|  794.7|109149.67|130.5|   8145.35|   8042.21|    103.14|        0.0|conventional|2015|Albany|\n",
+      "|  3|2015-12-06 00:00:00|        1.08|    78992.15| 1132.0| 71976.41|72.58|   5811.16|    5677.4|    133.76|        0.0|conventional|2015|Albany|\n",
+      "+---+-------------------+------------+------------+-------+---------+-----+----------+----------+----------+-----------+------------+----+------+\n",
+      "only showing top 4 rows\n",
+      "\n"
+     ]
+    }
+   ],
    "source": [
     "# Cache table/dataframe for re-usable table with .cache()\n",
     "# caching operation takes place only when a Spark action (count, show, take or write) is also performed on the same dataframe\n",
@@ -148,7 +181,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "e0068bc2-270c-4e43-beeb-082b404ce297",
    "metadata": {
     "tags": []
@@ -195,10 +228,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "id": "ea5b4865-062b-491a-bf10-1242d46d358c",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+------------+-----------+----------+-----------+----------+----------+-----------+------------+----+------+----------+-----+\n",
+      "|AveragePrice|Medium Size|Large Size|XLarge Size|Small Bags|Large Bags|XLarge Bags|        type|year|region|Year Index|Month|\n",
+      "+------------+-----------+----------+-----------+----------+----------+-----------+------------+----+------+----------+-----+\n",
+      "|        1.33|    1036.74|  54454.85|      48.16|   8603.62|     93.25|        0.0|conventional|2015|Albany|        15|   12|\n",
+      "|        1.35|     674.28|  44638.81|      58.33|   9408.07|     97.49|        0.0|conventional|2015|Albany|        15|   12|\n",
+      "|        0.93|      794.7| 109149.67|      130.5|   8042.21|    103.14|        0.0|conventional|2015|Albany|        15|   12|\n",
+      "|        1.08|     1132.0|  71976.41|      72.58|    5677.4|    133.76|        0.0|conventional|2015|Albany|        15|   12|\n",
+      "+------------+-----------+----------+-----------+----------+----------+-----------+------------+----+------+----------+-----+\n",
+      "only showing top 4 rows\n",
+      "\n"
+     ]
+    }
+   ],
    "source": [
     "# convert 'year' yyyy to yy (yyyy - 2000, since we have 2015-2018 values)\n",
     "df = df.withColumn('Year Index', col('Year') - 2000)\n",
@@ -228,7 +278,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "id": "382272ea-07aa-43a4-af0f-681b332af34d",
    "metadata": {},
    "outputs": [],
@@ -288,12 +338,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "id": "ae2ebec7-8379-45bd-b375-faac5c64824c",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.18867989356762913"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "from pyspark.ml.evaluation import RegressionEvaluator\n",
     "\n",
@@ -319,6 +380,14 @@
    "source": [
     "For reference, original article, using Linear regression + cv/gridSearch : rmse of .28"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "311fe041-88b7-4e7f-a495-46dcc846bc50",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {