|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": 9, |
| 6 | + "metadata": {}, |
| 7 | + "outputs": [ |
| 8 | + { |
| 9 | + "name": "stdout", |
| 10 | + "output_type": "stream", |
| 11 | + "text": [ |
| 12 | + "[[1.000e+00 1.386e+01 1.350e+00 2.270e+00 1.600e+01 9.800e+01 2.980e+00\n", |
| 13 | + " 3.150e+00 2.200e-01 1.850e+00 7.220e+00 1.010e+00 3.550e+00 1.045e+03]\n", |
| 14 | + " [2.000e+00 1.184e+01 2.890e+00 2.230e+00 1.800e+01 1.120e+02 1.720e+00\n", |
| 15 | + " 1.320e+00 4.300e-01 9.500e-01 2.650e+00 9.600e-01 2.520e+00 5.000e+02]]\n" |
| 16 | + ] |
| 17 | + } |
| 18 | + ], |
| 19 | + "source": [ |
| 20 | + "# Write a kmeans plus plus algorithm to initialize the centroids to be used in the kmeans algorithm\n", |
| 21 | + "# Initialize centroids based on a probability index set by their distance from each other\n", |
| 22 | + "# Dataset is a numpy array and k is the number of centroids\n", |
| 23 | + "\n", |
| 24 | + "\n", |
| 25 | + "import numpy as np\n", |
| 26 | + "np.set_printoptions(precision=3)\n", |
| 27 | + "import matplotlib.pyplot as plt\n", |
| 28 | + "%matplotlib inline\n", |
| 29 | + "\n", |
| 30 | + "\n", |
| 31 | + "def kmeans_plus_plus(dataset, k):\n", |
| 32 | + " # define the shape of the dataset\n", |
| 33 | + " N, D = dataset.shape\n", |
| 34 | + " # define the centroid size\n", |
| 35 | + " C = np.zeros([k, D])\n", |
| 36 | + " # pick a random instance in the dataset\n", |
| 37 | + " random_index = np.random.choice(len(dataset))\n", |
| 38 | + " # choose a random instance\n", |
| 39 | + " random_instance = dataset[random_index]\n", |
| 40 | + " for i in range(k):\n", |
| 41 | + " # define the distance array to store all items in the dataset from the randomly chosen instance\n", |
| 42 | + " distance_array = np.linalg.norm(random_instance - dataset , axis = 1)\n", |
| 43 | + " # each distance proportionally increase the chances for the next data to be chosen\n", |
| 44 | + " prob_array = distance_array / np.sum(distance_array)\n", |
| 45 | + " # based on the distances, chose a random instance index\n", |
| 46 | + " chosen_index = np.random.choice(len(dataset) , p = prob_array)\n", |
| 47 | + " # assing the next instance to the centroid[i]\n", |
| 48 | + " C[i] = dataset[chosen_index]\n", |
| 49 | + " # last chosen C[i] will be the new reference to chose next instance for C[i+1]\n", |
| 50 | + " random_instance = C[i] \n", |
| 51 | + " return C\n", |
| 52 | + "\n", |
| 53 | + "# test the function \n", |
| 54 | + "if __name__ == '__main__':\n", |
| 55 | + " \n", |
| 56 | + " # download your copy of wine dataset from: https://archive.ics.uci.edu/ml/machine-learning-databases/wine/\n", |
| 57 | + " # change the format to .txt or .cvs if current data is causing any parsing error.\n", |
| 58 | + " \n", |
| 59 | + " dataset = np.loadtxt('wine.data.txt' , dtype = float, delimiter = ',')\n", |
| 60 | + " dataset = np.array(dataset)\n", |
| 61 | + " print(kmeans_plus_plus(dataset, 2))\n", |
| 62 | + " \n", |
| 63 | + " # if there was a need for standardization, this code below could be applied:\n", |
| 64 | + " # standardized_data = [(i - np.mean(dataset)) / (np.std(dataset)) for i in dataset]\n", |
| 65 | + " " |
| 66 | + ] |
| 67 | + }, |
| 68 | + { |
| 69 | + "cell_type": "code", |
| 70 | + "execution_count": 10, |
| 71 | + "metadata": {}, |
| 72 | + "outputs": [ |
| 73 | + { |
| 74 | + "name": "stderr", |
| 75 | + "output_type": "stream", |
| 76 | + "text": [ |
| 77 | + "/Users/volkansonmez/miniconda/lib/python3.7/site-packages/numpy/core/fromnumeric.py:3335: RuntimeWarning: Mean of empty slice.\n", |
| 78 | + " out=out, **kwargs)\n", |
| 79 | + "/Users/volkansonmez/miniconda/lib/python3.7/site-packages/numpy/core/_methods.py:161: RuntimeWarning: invalid value encountered in double_scalars\n", |
| 80 | + " ret = ret.dtype.type(ret / rcount)\n" |
| 81 | + ] |
| 82 | + }, |
| 83 | + { |
| 84 | + "name": "stdout", |
| 85 | + "output_type": "stream", |
| 86 | + "text": [ |
| 87 | + "[ 0. 17592402.704 4545800.928 2633614.463 1341434.153\n", |
| 88 | + " 916424.194 684095.537 415042.347 674861.281 353365.048\n", |
| 89 | + " 657438.809 187113.672 492970.971 565012.338 138874.119]\n" |
| 90 | + ] |
| 91 | + }, |
| 92 | + { |
| 93 | + "data": { |
| 94 | + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEDCAYAAAA7jc+ZAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3de3Scd33n8fd3RneNrYs18k3yTQqBJNhOoubGKSS0ZA2leKG0tU8WKJe6oQm97G5bOOwJZ7enLafslqVASL0hNTlA0nJJyZaQhAJLUpK0kUPiS0ISR3Ys2U4k2ZJs3W/f/WMeyWNZl5E18qN55vM6Z45mnsvMVzr25/nN7/k9z8/cHRERia5Y2AWIiMjiUtCLiEScgl5EJOIU9CIiEaegFxGJOAW9iEjELdmgN7N7zKzdzA5ksO3nzezZ4PGSmXVfjBpFRHKBLdVx9Gb2VqAXuNfdr5jHfp8ArnT3jyxacSIiOWTJtujd/THgVPoyM2sws4fNbK+ZPW5mb5xm153AfRelSBGRHFAQdgHztBu41d1fNrNrgTuBt0+sNLP1wEbgxyHVJyKy5ORM0JtZArgB+JaZTSwunrLZDuDb7j52MWsTEVnKciboSXUzdbv71lm22QHcdpHqERHJCUu2j34qdz8NHDaz3wSwlC0T683sUqAKeDKkEkVElqQlG/Rmdh+p0L7UzNrM7KPALcBHzew54CCwPW2XncD9vlSHEYmIhGTJDq8UEZHsWLItehERyY4leTK2pqbGN2zYEHYZIiI5Y+/evZ3unpxu3ZIM+g0bNtDc3Bx2GSIiOcPMXp1pnbpuREQiTkEvIhJxCnoRkYhT0IuIRJyCXkQk4hT0IiIRN+fwSjO7B3g30D7dBCBm9iekbk0w8X5vApLufsrMjgBngDFg1N2bslW4iIhkJpMW/R5g20wr3f1z7r41uKvkp4Cfunv6hCE3BesXNeRHx8b58k8O8dhLHYv5MSIiOWfOoJ9upqdZhDa7Uzxm7H6shYcPvhbGx4uILFlZ66M3szJSLf/vpC124NFg6r9dc+y/y8yazay5o2P+rXIzoyFZzivtvfPeV0QkyrJ5MvbXgZ9N6bZ5i7tfBbwTuC2Y8Hta7r7b3ZvcvSmZnPZ2DXNqrE3wSoeCXkQkXTaDfgdTum3c/Xjwsx14ALgmi593nsbaBJ29w3T3Dy/mx4iI5JSsBL2ZVQBvA76XtqzczJZNPAduBg5k4/Nm0pBMAKhVLyKSJpPhlfcBNwI1ZtYGfAYoBHD3u4LN3gs86u59abuuBB4IJvIuAL7p7g9nr/TzNdamgv5Qey9Xr69ezI8SEckZcwa9u+/MYJs9pIZhpi9rAbZMt/1iqasqo6ggxisdfXNvLCKSJyJ1ZWw8ZmyqKeeQRt6IiEyKVNBDqp9effQiImdFL+hrE7Se6mdwZCzsUkREloTIBX1jbYJxh8Od6qcXEYEIBn1DshzQEEsRkQmRC/pNNQnM0AlZEZFA5IK+tCjO2spSDbEUEQlELugh1U+vFr2ISEokg74hmaClo5fxcQ+7FBGR0EUy6BtrEwyNjnOseyDsUkREQhfJoJ+4udkhjbwREYlm0E/c3EyTkIiIRDToq8uLqC4v0glZEREiGvSQunBKF02JiEQ46DXEUkQkJbJB35BM0NU/wqk+TSsoIvktukGfNtuUiEg+i2zQN2r+WBERIMJBv7aylJLCmFr0IpL35gx6M7vHzNrN7MAM6280sx4zezZ43JG2bpuZvWhmh8zsk9ksfC6xmLGpRidkRUQyadHvAbbNsc3j7r41ePwPADOLA18G3glcBuw0s8sWUux8NdRqWkERkTmD3t0fA05dwHtfAxxy9xZ3HwbuB7ZfwPtcsMZkgmPdAwwMa1pBEclf2eqjv97MnjOzH5jZ5cGytUBr2jZtwbKLpqG2HHdo6VSrXkTyVzaC/hlgvbtvAb4I/FOw3KbZdsb7BpvZLjNrNrPmjo6OLJR19p436qcXkXy24KB399Pu3hs8fwgoNLMaUi34+rRN64Djs7zPbndvcvemZDK50LIA2LCinJih2aZEJK8tOOjNbJWZWfD8muA9TwJPA5eY2UYzKwJ2AA8u9PPmo6QwTn11me5iKSJ5rWCuDczsPuBGoMbM2oDPAIUA7n4X8H7g42Y2CgwAO9zdgVEzux14BIgD97j7wUX5LWbRmNTIGxHJb3MGvbvvnGP9l4AvzbDuIeChCystOxpqEzx+qJOxcScem+60gYhItEX2ytgJjckEw6PjtJ7qD7sUEZFQRD7oG2rLAd3zRkTyV/SDPqkhliKS3yIf9JVlRdQkitSiF5G8Ffmgh1SrXi16EclX+RH0tQle6egjNepTRCS/5EXQNyYT9AyM0NmraQVFJP/kR9DXarYpEclfeRH0mj9WRPJZXgT96uUllBXFFfQikpfyIuhjMWNTslxdNyKSl/Ii6CG4uZla9CKSh/Im6BuSCY73DNI3NBp2KSIiF1XeBP3EyJsWTUIiInkm74Je/fQikm/yJujXrygnHjONvBGRvJM3QV9UEGN9dZmCXkTyTt4EPcAmTSsoInkor4K+sTbBkZN9jI6Nh12KiMhFk1dB35AsZ2TMOappBUUkj8wZ9GZ2j5m1m9mBGdbfYmb7gscTZrYlbd0RM9tvZs+aWXM2C78QjbrnjYjkoUxa9HuAbbOsPwy8zd03A38O7J6y/iZ33+ruTRdWYvY0TA6x1Fh6EckfBXNt4O6PmdmGWdY/kfbyKaBu4WUtjuUlhdQuK1aLXkTySrb76D8K/CDttQOPmtleM9s1245mtsvMms2suaOjI8tlndVYq5E3IpJfshb0ZnYTqaD/s7TFb3H3q4B3AreZ2Vtn2t/dd7t7k7s3JZPJbJV1nobg5maaVlBE8kVWgt7MNgN3A9vd/eTEcnc/HvxsBx4ArsnG5y1EY22CM0OjtJ8ZCrsUEZGLYsFBb2brgO8CH3D3l9KWl5vZsonnwM3AtCN3LqaGZHBCVv30IpIn5jwZa2b3ATcCNWbWBnwGKARw97uAO4AVwJ1mBjAajLBZCTwQLCsAvunuDy/C7zAvk0MsO3q5obEm5GpERBZfJqNuds6x/mPAx6ZZ3gJsOX+PcK1cXkyiuEAtehHJG3l1ZSyAmdGQLOeQRt6ISJ7Iu6CH1IVTr7TroikRyQ/5GfTJBK+dHuTM4EjYpYiILLq8DPpG3QpBRPJIXga9hliKSD7Jy6Bfv6KMgpjphKyI5IW8DPrCeIz1K8rUoheRvJCXQQ+pfnq16EUkH+R10B892c+IphUUkYjL26BvSCYYHXdePamRNyISbXkb9GenFVTQi0i05W3Qb5oYYql+ehGJuLwN+kRxAasrSjStoIhEXt4GPQSzTalFLyIRl9dB31iraQVFJPryOugbahP0DY/x2unBsEsREVk0+R30yXIA9dOLSKTlddBP3sVSQS8iEZbXQZ9MFLOspEC3QhCRSJsz6M3sHjNrN7MDM6w3M/tbMztkZvvM7Kq0ddvM7MVg3SezWXg2mFlwQlYXTYlIdGXSot8DbJtl/TuBS4LHLuArAGYWB74crL8M2Glmly2k2MXQkNTNzUQk2uYMend/DDg1yybbgXs95Smg0sxWA9cAh9y9xd2HgfuDbZeUxtoEHWeG6BnQtIIiEk3Z6KNfC7SmvW4Lls20fFpmtsvMms2suaOjIwtlZaZRt0IQkYjLRtDbNMt8luXTcvfd7t7k7k3JZDILZWWmYfLmZgp6EYmmgiy8RxtQn/a6DjgOFM2wfEmpryqlKB5Ti15EIisbLfoHgQ8Go2+uA3rc/QTwNHCJmW00syJgR7DtklIQj7GhRtMKikh0zdmiN7P7gBuBGjNrAz4DFAK4+13AQ8C7gENAP/DhYN2omd0OPALEgXvc/eAi/A4L1lib4IUTZ8IuQ0RkUcwZ9O6+c471Dtw2w7qHSB0IlrSGZIKHD7zG0OgYxQXxsMsREcmqvL4ydkJjbYJxhyOd/WGXIiKSdQp6Ui160BBLEYkmBT2wSXexFJEIU9ADZUUFrK0sVYteRCJJQR9oqE2oRS8ikaSgDzQmE7R09DE+rmkFRSRaFPSBhtpyBkbGON4zEHYpIiJZpaAPnL25me5NLyLRoqAP6OZmIhJVCvrAivIiKssKFfQiEjkK+oCZ0ZhMaIiliESOgj5NQzKhu1iKSOQo6NM01iY42TdMV99w2KWIiGSNgj5NQ23qVgjqvhGRKFHQp2lMLgMU9CISLQr6NGurSikqiGnkjYhEioI+TTxmbKopV9CLSKQo6KdorE3o6lgRiRQF/RQNyQStXf0MjoyFXYqISFZkFPRmts3MXjSzQ2b2yWnW/4mZPRs8DpjZmJlVB+uOmNn+YF1ztn+BbGusTeAOhzvVqheRaJgz6M0sDnwZeCdwGbDTzC5L38bdP+fuW919K/Ap4Kfufiptk5uC9U1ZrH1RTEwrqH56EYmKTFr01wCH3L3F3YeB+4Hts2y/E7gvG8WFYVOyHDMNsRSR6Mgk6NcCrWmv24Jl5zGzMmAb8J20xQ48amZ7zWzXhRZ6sZQUxqmrKlWLXkQioyCDbWyaZTNNw/TrwM+mdNu8xd2Pm1kt8EMz+4W7P3beh6QOArsA1q1bl0FZiyd1czP10YtINGTSom8D6tNe1wHHZ9h2B1O6bdz9ePCzHXiAVFfQedx9t7s3uXtTMpnMoKzF05BM0NLRy5imFRSRCMgk6J8GLjGzjWZWRCrMH5y6kZlVAG8Dvpe2rNzMlk08B24GDmSj8MXUWJtgaHScY12aVlBEct+cXTfuPmpmtwOPAHHgHnc/aGa3BuvvCjZ9L/Cou6f3eawEHjCzic/6prs/nM1fYDE01k5MK9jLuhVlIVcjIrIwmfTR4+4PAQ9NWXbXlNd7gD1TlrUAWxZUYQjSh1je9MbakKsREVkYXRk7jaryIlaUF2mIpYhEgoJ+Bg3JhIZYikgkKOhn0FCr+WNFJBoU9DNoSJbT1T/Cyd6hsEsREVkQBf0Mzo680YVTIpLbFPQzmAh69dOLSK5T0M9gTUUppYVxBb2I5DwF/QxiMWNTslwnZEUk5ynoZ6EhliISBQr6WTTWJjjWPcDAsKYVFJHcpaCfxcStENR9IyK5TEE/i/Sbm4mI5CoF/Sw21JQRM3hF/fQiksMU9LMoLoizrrqMQ2rRi0gOU9DPobE2wSvtujpWRHKXgn4ODckEhzv7GB0bD7sUEZELoqCfQ0NtguGxcdo0raCI5CgF/RzSZ5sSEclFCvo5NGosvYjkOAX9HCrKCqlJFKtFLyI5K6OgN7NtZvaimR0ys09Os/5GM+sxs2eDxx2Z7psLGmt1czMRyV0Fc21gZnHgy8A7gDbgaTN70N2fn7Lp4+7+7gvcd0lrrE3w4LPHcXfMLOxyRETmJZMW/TXAIXdvcfdh4H5ge4bvv5B9l4yGZILTg6N0aFpBEclBmQT9WqA17XVbsGyq683sOTP7gZldPs99MbNdZtZsZs0dHR0ZlHXxTN7zRhdOiUgOyiTop+ur8CmvnwHWu/sW4IvAP81j39RC993u3uTuTclkMoOyLp7JIZbqpxeRHJRJ0LcB9Wmv64Dj6Ru4+2l37w2ePwQUmllNJvvmgtUVJZQVxXVzMxHJSZkE/dPAJWa20cyKgB3Ag+kbmNkqC85Smtk1wfuezGTfXGBmNCQTGnkjIjlpzlE37j5qZrcDjwBx4B53P2hmtwbr7wLeD3zczEaBAWCHuzsw7b6L9LssqsbaBI+/3En/8ChlRXP+2URElgxL5fHS0tTU5M3NzWGXcY4f/+J1Pva1ZrbUV/L3v/NLVJYVhV2SiMgkM9vr7k3TrdOVsRl6+xtXcuctV3Pw2Gl++++e4vXTg2GXJCKSEQX9PGy7YhV7PvxLtHX18xtfeYIjnRpuKSJLn4J+nm5orOGbv3sdfUOjvP+uJ3n++OmwSxIRmZWC/gJsqa/kW7feQGHc+O3dT/L0kVNhlyQiMiMF/QVqrE3w7Y/fQHJZMR/46r/xk1+0h12SiMi0FPQLsLaylG/93vU01ib43Xub+d6zx8IuSUTkPAr6BVqRKOa+372Opg1V/NE/PMu9Tx4JuyQRkXMo6LNgWUkhez58Db/6ppXc8b2DfOFfXmYpXp8gIvlJQZ8lJYVxvnLLVbz/6jo+/y8v8d//7/OMjyvsRSR8upY/iwriMf76NzZTWVrI3f96mO7+YT73m1sojOt4KiLhUdBnWSxmfPrX3kRVeRGfe+RFTg+OcuctV1FSGA+7NBHJU2pqLgIz47abGvmL917BT15s54Nf/Xd6BkbCLktE8pSCfhHdcu16vrjzSn7e2sWO3U/RcUZTEYrIxaegX2Tv3ryGuz/0Sxzp7OM373qC1lP9YZckInlGQX8RvO0NSb7+sWvp6h/h/Xc9wUuvnwm7JBHJIwr6i+Tq9VX84+9djzv81t89yc+PdoVdkojkCQX9RXTpqmV85+M3UFFayC13/xuPv9wRdkkikgcU9BdZfXUZ37r1etavKOcje57m+/tOhF2SiEScgj4EtctKuH/XdWytr+T2+57hvn8/GnZJIhJhGQW9mW0zsxfN7JCZfXKa9beY2b7g8YSZbUlbd8TM9pvZs2a2tCaCDVFFaSH3fuRabnxDkk99dz93/r9Duj+OiCyKOYPezOLAl4F3ApcBO83ssimbHQbe5u6bgT8Hdk9Zf5O7b51p4tp8VVoUZ/cHm9i+dQ1//fCLfPzrz/Baj+aiFZHsyqRFfw1wyN1b3H0YuB/Ynr6Buz/h7hPDSJ4C6rJbZnQVxmN8/re28qfbLuUnL7bzq3/zU772xBHGdEM0EcmSTIJ+LdCa9rotWDaTjwI/SHvtwKNmttfMds20k5ntMrNmM2vu6Miv0SixmPH7Nzby6B+/lSvXVfKZBw/yvjt/xsHjPWGXJiIRkEnQ2zTLpm1umtlNpIL+z9IWv8XdryLV9XObmb11un3dfbe7N7l7UzKZzKCs6Fm/opx7P3INf7vzSo51D/CeL/2Mv/j+8/QNjYZdmojksEyCvg2oT3tdBxyfupGZbQbuBra7+8mJ5e5+PPjZDjxAqitIZmBmvGfLGn70n2/kt5rq+T+PH+bmzz/Gj154PezSRCRHZRL0TwOXmNlGMysCdgAPpm9gZuuA7wIfcPeX0paXm9myiefAzcCBbBUfZRVlhfzV+97Mt2+9nvLiOB/9WjMf//penawVkXmbM+jdfRS4HXgEeAH4R3c/aGa3mtmtwWZ3ACuAO6cMo1wJ/KuZPQf8O/B9d384679FhDVtqOafP/HL/Ml/uJQf/0Ina0Vk/mwpjt1uamry5mYNuZ/q1ZN9/Ld/OsDjL3eypa6Cv3zfm7l8TUXYZYnIEmBme2cawq4rY3PIxMnaL+zYes7J2v5hnawVkZkp6HOMmbF969pzTta+4290slZEZqagz1HpJ2vLilIna3//G3t5/bRO1orIuRT0Oa5pQzXf/4PUydofvdDOr/yvn3LvkzpZKyJnKegjoKggxm03nb2y9o7vHeR9X3lCV9aKCKCgj5RzTtZ29fOeL/2Mv3zoBZ2sFclzBWEXINk1cbL2xjfU8tmHX2D3Yy18f98Jbn97I1evr6IhmSAem+6uFiISVRpHH3FPHznFpx/Yz0uv9wJQVhTnijUVbK6r4M11FWypq2T9ijLMFP4iuWy2cfQK+jwwPu60dPayr60neHRz8PhphkbHAVheUsDmuko211UEj0pWV5Qo/EVyiIJezjMyNs5Lr59hf1sPz7X1sP9YN784cYbRYLROTaI4LfhT4V+TKA65ahGZyWxBrz76PFUYj3H5mgouX1PBjuB+ooMjY7xw4jT7j/XwXGuq5f+TF9uZaAusqShJtfzrK9i8tpI311VQUVoY3i8hIhlR0MukksI4V66r4sp1VXB9alnf0CgHjvWkwj/o9nn44GuT+2xYUcYVayuoqypjTWUJqytKWV1RwuqKEqrLi9T9I7IEKOhlVuXFBVy7aQXXbloxuay7f5j9x8729z/X1s0jB19jZOzcbsDiglgQ+kH4BweC9ANCRWmhDgYii0xBL/NWWVbEL1+S5JcvOTsT2Pi409k3xInuQU70DHA8+HmiZ5ATPYM81XKS188MnXfFbmlhnNWVJaypKGVVRQlrKkpYXZk6CKypTC1bXqLuIZGFUNBLVsRiRu2yEmqXlbClvnLabcbGnY4zQxzvGZg8IKQOBKkDw+Mvd9B+Zoip4wOWlxRQX11GfVUZ61aUUV9VSl11Geuqy1hbWUpJYfwi/IbTGx4d50TPAG1dA7R19dPWNUDrqdTPEz2D1CSKaEgmaKhN0JAspyGZYP2KcooKls61imPjzvHuAVo6+zjS2cfhzj5ePz3IG1Yu4+r1VWxdV6mDbY7TqBtZUkbGxmk/M8SJ7gGO9wxyonuAY90DHD3VPxmgE8NCJ6xcXsy64EBQV506EKyrLqO+uoyVy0sWdIHYyNg4J7oHJ0P87M8BWrv6ee304DkHpnjMWF1RQl1VKauWl9DZO0xLRy/H02YGi8eMddVlbKopP+cA0JBMUFVedMG1zsY9dZBND/OW4OfRk/0Mj539myaKC0guK+bVk32MO5jBpUHoTzzWVefOtRfuzvDYOIMj4wyNjDE4Ms7g6BiDE89HguejqedDI2MMjY6fu3409Xx4dJy6qlI211Wypb6CVcuXzjBkDa+UyBgfdzp6h2g91U9rVz9HT6YCtzU4EJyYEryFcaOuqoy6qlLqg28B9VVl1FenDgaJ4gJO9AxOBnd6mB/rGuBEzwDpvU0xg9UVpaytKk0dWKpKg0fq+eqKEgri57fW+4ZGOdzZxysdvbzS3ssrHannLZ19DKcduKrKCidDv6E2dQDYlExQX1U67ftO1dM/QktnL4eDQJ8I8yOdffQNj01uV1QQY311GRtrytmYLGdTTTkbVqSeJxPFmBm9Q6M8e7Sbva92sfdoFz9/tYszwUT1NYlirl5fORn8V6ytoLjg4n6zGh0bp61rgMPB7zjxONY9wMBwKpyHglC/0JiLWWqQQklhnJKCGAXxGMe7ByaHISeXFbN5bUXaaLQKVoQ0DFlBL3ljeHSc4xPfALr6aT01MHlQaD3VT1f/yKz7m8Hq5SWTwT0Z4tWpYF9VUUJhBoGbqYluk0PBAaCls2/yQNDZOzS5XWHc2LCi/JwDQHFBnMOdvRzu7A9+9p3z+8UM6qvLUgFeU86mZPnk8zWVpfP+pjM27rzcfiYV/K928cyrXRw52Q9AUTzGm+squHp9FVetS4V/ctnCA8/daT8zREvHRJD3Tn4baT3Vf84AgOUlBWxMJqirKqW8KH5OQBcXxikuiJ1dVhijpCDtedr64uB1SUGcwrid12IfHBnj+ROn2dfazb5gUMIrHb2TB5O1laVsqQ/Cf20FV9RVXJSuLwW9SODM4Egq/IPgPz04ytrKkrQWeemS6T/v6R/hlc6pB4BeXj3ZP9miBFi1vISNNeVsqEm1zCeer6suW/TfpePMEM8cTYX+3le72NfWM9kNtH5FGVevq+Kq9VU0bajiktplMx5c0r+JTHYrdfRx5GQf/WnfRIoLYqlvIWmPTclyNtYkqCoLbwTXmcERDhw7zf5j3ZPDkFtPDUyu35QsZ0tdJW9eW8GW+tT1K9k+t7TgoDezbcAXgDhwt7t/dsp6C9a/C+gHfsfdn8lk3+ko6EVmNjI2ztFT/QyNjLOhpoyyoqUzpmJodIwDx05PBn/zq12T30yWFRewdV2qu6cwHjunu+VU3/Dke8RjRn1VaRDkCTbWlKV+JstZvbyEWI7clO9UXzAMufVs+LefSf0t4jHjDSuXsSXtnlOXrlq2oG+LCwp6M4sDLwHvANqAp4Gd7v582jbvAj5BKuivBb7g7tdmsu90FPQi0eDutJ4aYO/RU6ngP9LFi6+fwT11En0izCe+iWxMllNftfjfRMLy+ulBnmvtZl9bD8+1dbP/WA/dQXdbUUGMLXUV/MOu6y/oYLbQWyBcAxxy95bgze4HtgPpYb0duNdTR42nzKzSzFYDGzLYV0QiysxYtyI1LPa9V9YB0Ds0ipG6GC/frFxews2Xr+Lmy1cBZw+Ez7V1s6+tm96h0UX5xpLJX3ot0Jr2uo1Uq32ubdZmuC8AZrYL2AWwbt26DMoSkVyUyMOAn0n6gfDXt6xZtM/J5PvRdIeXqf09M22Tyb6phe673b3J3ZuSyeR0m4iIyAXI5NDaBtSnva4Djme4TVEG+4qIyCLKpEX/NHCJmW00syJgB/DglG0eBD5oKdcBPe5+IsN9RURkEc3Zonf3UTO7HXiE1BDJe9z9oJndGqy/C3iI1IibQ6SGV354tn0X5TcREZFp6YIpEZEImG14ZTQHq4qIyCQFvYhIxCnoRUQibkn20ZtZB/Bq2HVMowboDLuIC6Taw6HaL75crRsWVvt6d5/2IqQlGfRLlZk1z3SyY6lT7eFQ7RdfrtYNi1e7um5ERCJOQS8iEnEK+vnZHXYBC6Daw6HaL75crRsWqXb10YuIRJxa9CIiEaegFxGJOAV9Bsys3sx+YmYvmNlBM/vDsGuaDzOLm9nPzeyfw65lPoKZyr5tZr8I/vbXh11Tpszsj4N/KwfM7D4zKwm7ppmY2T1m1m5mB9KWVZvZD83s5eBnVZg1zmSG2j8X/JvZZ2YPmFllmDXOZLra09b9VzNzM6vJxmcp6DMzCvwXd38TcB1wm5ldFnJN8/GHwAthF3EBvgA87O5vBLaQI7+Dma0F/gBocvcrSN25dUe4Vc1qD7BtyrJPAj9y90uAHwWvl6I9nF/7D4Er3H0zqTmrP3Wxi8rQHs6vHTOrJzXP9tFsfZCCPgPufsLdnwmenyEVOGvDrSozZlYH/Bpwd9i1zIeZLQfeCnwVwN2H3b073KrmpQAoNbMCoIwlPOGOuz8GnJqyeDvwteD514D/eFGLytB0tbv7o+4+Grx8itSER0vODH93gM8Df8oMs/FdCAX9PJnZBuBK4N/CrSRj/5vUP5rxsAuZp01AB/D3QbfT3WZWHnZRmXD3Y8D/JNUiO0FqIp5Hw61q3lYGkwcR/KwNuZ4L9RHgB2EXkSkzew9wzN2fy+b7KujnwZBG3zkAAAG1SURBVMwSwHeAP3L302HXMxczezfQ7u57w67lAhQAVwFfcfcrgT6WbvfBOYL+7O3ARmANUG5m/yncqvKPmX2aVLfrN8KuJRNmVgZ8Grgj2++toM+QmRWSCvlvuPt3w64nQ28B3mNmR4D7gbeb2dfDLSljbUCbu098c/o2qeDPBb8KHHb3DncfAb4L3BByTfP1upmtBgh+todcz7yY2YeAdwO3eO5cLNRAqnHwXPB/tg54xsxWLfSNFfQZMDMj1Vf8grv/Tdj1ZMrdP+Xude6+gdTJwB+7e060LN39NaDVzC4NFv0K8HyIJc3HUeA6MysL/u38CjlyIjnNg8CHgucfAr4XYi3zYmbbgD8D3uPu/WHXkyl33+/ute6+Ifg/2wZcFfxfWBAFfWbeAnyAVIv42eDxrrCLygOfAL5hZvuArcBfhlxPRoJvId8GngH2k/p/tmQvyzez+4AngUvNrM3MPgp8FniHmb1MagTIZ8OscSYz1P4lYBnww+D/6l2hFjmDGWpfnM/KnW81IiJyIdSiFxGJOAW9iEjEKehFRCJOQS8iEnEKehGRiFPQi4hEnIJeRCTi/j8a85GdhDD61wAAAABJRU5ErkJggg==\n", |
| 95 | + "text/plain": [ |
| 96 | + "<Figure size 432x288 with 1 Axes>" |
| 97 | + ] |
| 98 | + }, |
| 99 | + "metadata": { |
| 100 | + "needs_background": "light" |
| 101 | + }, |
| 102 | + "output_type": "display_data" |
| 103 | + } |
| 104 | + ], |
| 105 | + "source": [ |
| 106 | + "# the Within-Cluster-Sum-of-Squares amount normally goes down as the number of clusters increase\n", |
| 107 | + "# the ideal number of clusters can be chosen if the WCSS graph achieves a clear elbow shape\n", |
| 108 | + "\n", |
| 109 | + "def k_means(dataset, k): # return the WCSS value for chosen k clusters\n", |
| 110 | + " # Initialize the WCSS value to 0\n", |
| 111 | + " WCSS = 0\n", |
| 112 | + " # create an array to collect the indices of the groups\n", |
| 113 | + " final_array = np.zeros(len(dataset))\n", |
| 114 | + " # initialize the centroids based on the kmeansplusplus function. If data is too big, just choose k random instances\n", |
| 115 | + " C = kmeans_plus_plus(dataset, k)\n", |
| 116 | + " # create an old C initialized with zeros\n", |
| 117 | + " C_old = np.zeros(C.shape)\n", |
| 118 | + " # define the error function\n", |
| 119 | + " parameter_gradient = np.linalg.norm(C-C_old)\n", |
| 120 | + " # count the number of epochs performed until the error is zero, put an iterator\n", |
| 121 | + " epoch = 0\n", |
| 122 | + " # continue to assign new C until the distance btw C_new and C_old is very small\n", |
| 123 | + " while parameter_gradient > 1e-4: \n", |
| 124 | + " # loop over the dataset to measure the instance distances with C\n", |
| 125 | + " # assign the group number for each instance based on the shortest distance from the instance to the C\n", |
| 126 | + " for i in range(len(dataset)):\n", |
| 127 | + " distance = np.linalg.norm((dataset[i] - C), axis = 1) # axis 1 will output distance array with size k\n", |
| 128 | + " centroid_index = np.argmin(distance) # the closest centroid_index from the instance \n", |
| 129 | + " final_array[i] = centroid_index # assign the centroid index as group number to an array \n", |
| 130 | + " # assign the current C as C_old to update the new C\n", |
| 131 | + " C_old = np.copy(C)\n", |
| 132 | + " # loop over each index of the centroid\n", |
| 133 | + " for i in range(k):\n", |
| 134 | + " # list of items with the same group numbers are assigned to sub_groups\n", |
| 135 | + " sub_group = [dataset[j] for j in range(len(dataset)) if final_array[j] == i]\n", |
| 136 | + " # take the mean of the sub-group and \n", |
| 137 | + " mean_of_sub_group = np.mean(sub_group, axis = 0)\n", |
| 138 | + " # assign the new values to C\n", |
| 139 | + " C[i] = mean_of_sub_group\n", |
| 140 | + " # update the parameter gradient after assigning the new C \n", |
| 141 | + " parameter_gradient = np.linalg.norm(C - C_old) # when C does not change, we reach to zero error \n", |
| 142 | + " # update epoch value\n", |
| 143 | + " epoch += 1\n", |
| 144 | + "\n", |
| 145 | + " # To measure WCSS, sum up the square of the \"distances from all group data to their mean\"\n", |
| 146 | + " for j in range(k):\n", |
| 147 | + " grouped_data = [dataset[i] for i in range(len(dataset)) if final_array[i] == j]\n", |
| 148 | + " mean_of_grouped_data = np.mean(grouped_data, axis = 0) # outputs number of attributes\n", |
| 149 | + " for i in range(len(grouped_data)):\n", |
| 150 | + " WCSS += np.sum((grouped_data[i] - mean_of_grouped_data)**2) \n", |
| 151 | + " \n", |
| 152 | + " \n", |
| 153 | + " # return the final cluster coordinates, the array carrying indices of all data points, and the WCSS with given k\n", |
| 154 | + " return epoch, C, final_array, WCSS\n", |
| 155 | + "\n", |
| 156 | + "\n", |
| 157 | + "if __name__ == '__main__':\n", |
| 158 | + " dataset = np.loadtxt('wine.data.txt' , dtype = float, delimiter = ',')\n", |
| 159 | + " dataset = np.array(dataset)\n", |
| 160 | + " np.random.seed(1)\n", |
| 161 | + " k = 15\n", |
| 162 | + " def run_k_means():\n", |
| 163 | + " WCSS_list = np.zeros(k)\n", |
| 164 | + " for i in range(1, k):\n", |
| 165 | + " epoch, C, final_array, WCSS = k_means(dataset, i)\n", |
| 166 | + " WCSS_list[i] = WCSS\n", |
| 167 | + "\n", |
| 168 | + " return WCSS_list\n", |
| 169 | + "\n", |
| 170 | + " result = run_k_means()\n", |
| 171 | + " print(result)\n", |
| 172 | + " plt.plot(np.arange(1, k), result[1:])\n", |
| 173 | + " " |
| 174 | + ] |
| 175 | + }, |
| 176 | + { |
| 177 | + "cell_type": "code", |
| 178 | + "execution_count": null, |
| 179 | + "metadata": {}, |
| 180 | + "outputs": [], |
| 181 | + "source": [] |
| 182 | + } |
| 183 | + ], |
| 184 | + "metadata": { |
| 185 | + "kernelspec": { |
| 186 | + "display_name": "Python 3", |
| 187 | + "language": "python", |
| 188 | + "name": "python3" |
| 189 | + }, |
| 190 | + "language_info": { |
| 191 | + "codemirror_mode": { |
| 192 | + "name": "ipython", |
| 193 | + "version": 3 |
| 194 | + }, |
| 195 | + "file_extension": ".py", |
| 196 | + "mimetype": "text/x-python", |
| 197 | + "name": "python", |
| 198 | + "nbconvert_exporter": "python", |
| 199 | + "pygments_lexer": "ipython3", |
| 200 | + "version": "3.7.7" |
| 201 | + } |
| 202 | + }, |
| 203 | + "nbformat": 4, |
| 204 | + "nbformat_minor": 2 |
| 205 | +} |
0 commit comments